diff --git a/config.yaml b/config.yaml
index 4e8422c..7cc8f4d 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,6 +8,10 @@ services:
     url: http://hodor-gateway:8200/health
   - name: citadel-mcp
     url: http://citadel-mcp:8300/sse
+  - name: sam-research
+    url: http://sam-research:8500/health
+  - name: searxng
+    url: http://searxng:8080
 
   # Core services
   - name: open-webui
@@ -34,3 +38,6 @@ services:
 # One-shot agents to watchdog (reads /opt/agent-os/logs//last-run.json).
 agents:
   - name: bran-changelog
+  - name: sam-research
+  - name: citadel-mcp
+  - name: varys-monitor
diff --git a/main.py b/main.py
index 1f4b732..e685c2d 100644
--- a/main.py
+++ b/main.py
@@ -13,9 +13,51 @@ logger = logging.getLogger("varys")
 
 SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
 AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
+RAVEN_URL = os.getenv("RAVEN_URL", "")
 CONFIG_FILE = Path(__file__).parent / "config.yaml"
 
 
+def _load_prev_states() -> dict:
+    """Return the {service name: status} map saved by the previous run.
+
+    Returns an empty dict when the state file is missing, unreadable, or
+    does not hold a JSON object, so a bad file never aborts the run.
+    """
+    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, ValueError):
+        # Missing file, permission problem, or corrupt JSON: start fresh.
+        return {}
+    # The caller uses .get(); reject a JSON list/string/number.
+    return data if isinstance(data, dict) else {}
+
+
+def _save_states(services: list) -> None:
+    """Persist each service's current status for comparison on the next run."""
+    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps({s["name"]: s["status"] for s in services}))
+
+
+def _notify_raven(message: str, severity: str) -> None:
+    """Best-effort POST of an alert to Raven; a no-op when RAVEN_URL is unset."""
+    if not RAVEN_URL:
+        return
+    try:
+        resp = httpx.post(
+            f"{RAVEN_URL}/notify",
+            json={"message": message, "severity": severity, "source": "varys"},
+            timeout=5,
+        )
+        # httpx does not raise on 4xx/5xx by itself; without this a rejected
+        # alert would be logged as delivered.
+        resp.raise_for_status()
+        logger.info(f"raven notified: {message}")
+    except Exception as e:
+        logger.warning(f"raven notify failed (raven not live yet?): {e}")
+
+
 def check_service(name: str, url: str) -> dict:
     start = time.monotonic()
     try:
@@ -151,6 +193,24 @@ def main():
         logger.info(f"agent {agent['name']}: {result['status']}")
         agents.append(result)
 
+    # Alert only on transitions relative to the previous run's saved states.
+    prev_states = _load_prev_states()
+    tracked = []
+    for s in services:
+        prev = prev_states.get(s["name"])
+        status = s["status"]
+        if prev and prev != "down" and status == "down":
+            # A service with no previous record never alerts (first sighting).
+            _notify_raven(f"{s['name']} is DOWN", "critical")
+        elif prev == "down" and status == "degraded":
+            # Not recovered yet: keep the outage recorded as "down" so the
+            # recovery notice still fires once the service is fully up.
+            status = "down"
+        elif prev == "down" and status != "down":
+            _notify_raven(f"{s['name']} recovered (UP)", "info")
+        tracked.append({"name": s["name"], "status": status})
+    _save_states(tracked)
+
     out_dir = SITES_DIR / "varys"
     out_dir.mkdir(parents=True, exist_ok=True)
     (out_dir / "index.html").write_text(render_html(services, agents))
diff --git a/run.sh b/run.sh
index a006950..781af50 100755
--- a/run.sh
+++ b/run.sh
@@ -12,4 +12,5 @@ docker run --rm \
   --volume /opt/agent-os:/opt/agent-os \
   --env SITES_DIR=/opt/sites \
   --env AGENT_OS_DIR=/opt/agent-os \
+  --env RAVEN_URL= \
   varys-monitor