From e88b2fda06d0b7bb477caaf7be1afb91a8b6ee26 Mon Sep 17 00:00:00 2001
From: nxm
Date: Wed, 6 May 2026 12:37:50 +0200
Subject: [PATCH] feat: alert on agent failure/recovery via Raven
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tracks agent statuses in service-states.json alongside services.
Sends critical alert when agent status changes to failure (includes
result/error message from last-run.json). Sends recovery alert on
failure → success transition.

Co-Authored-By: Claude Sonnet 4.6
---
 main.py | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index e685c2d..7f2ad63 100644
--- a/main.py
+++ b/main.py
@@ -20,17 +20,24 @@ CONFIG_FILE = Path(__file__).parent / "config.yaml"
 def _load_prev_states() -> dict:
     path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
     if not path.exists():
-        return {}
+        return {"services": {}, "agents": {}}
     try:
-        return json.loads(path.read_text())
+        data = json.loads(path.read_text())
+        # migrate old flat format (services only)
+        if "services" not in data:
+            return {"services": data, "agents": {}}
+        return data
     except Exception:
-        return {}
+        return {"services": {}, "agents": {}}
 
 
-def _save_states(services: list):
+def _save_states(services: list, agents: list):
     path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
     path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(json.dumps({s["name"]: s["status"] for s in services}))
+    path.write_text(json.dumps({
+        "services": {s["name"]: s["status"] for s in services},
+        "agents": {a["name"]: a["status"] for a in agents},
+    }))
 
 
 def _notify_raven(message: str, severity: str):
@@ -179,13 +186,25 @@ def main():
         agents.append(result)
 
     prev_states = _load_prev_states()
+    prev_services = prev_states.get("services", {})
+    prev_agents = prev_states.get("agents", {})
+
     for s in services:
-        prev = prev_states.get(s["name"])
+        prev = prev_services.get(s["name"])
         if prev and prev != "down" and s["status"] == "down":
             _notify_raven(f"{s['name']} is DOWN", "critical")
         elif prev == "down" and s["status"] not in ("down", "degraded"):
             _notify_raven(f"{s['name']} recovered (UP)", "info")
-    _save_states(services)
+
+    for a in agents:
+        prev = prev_agents.get(a["name"])
+        if a["status"] == "failure" and prev != "failure":
+            detail = a.get("result", "no details")
+            _notify_raven(f"{a['name']} failed: {detail}", "critical")
+        elif prev == "failure" and a["status"] == "success":
+            _notify_raven(f"{a['name']} recovered (success)", "info")
+
+    _save_states(services, agents)
 
     out_dir = SITES_DIR / "varys"
     out_dir.mkdir(parents=True, exist_ok=True)