feat: alert on agent failure/recovery via Raven

Tracks agent statuses in service-states.json alongside the service
statuses. Sends a critical alert when an agent's status changes to
failure (the alert includes the result/error message from last-run.json)
and sends a recovery alert on a failure → success transition.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
nxm
2026-05-06 12:37:50 +02:00
parent 32b4c72407
commit e88b2fda06
+26 -7
View File
@@ -20,17 +20,24 @@ CONFIG_FILE = Path(__file__).parent / "config.yaml"
def _load_prev_states() -> dict:
    """Load the statuses recorded by the previous monitor run.

    Reads ``service-states.json`` under ``AGENT_OS_DIR/logs/varys-monitor``.

    Returns:
        A dict of the form ``{"services": {...}, "agents": {...}}``. Both
        values are always dicts, so callers can use ``.get`` on them
        directly. Both maps are empty when the file is missing, unreadable,
        or does not contain a JSON object. A file in the old flat format
        (service name -> status at the top level) is returned as the
        ``"services"`` map with an empty ``"agents"`` map.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    try:
        # Reading without a prior exists() check avoids a race with the file
        # disappearing; a missing file surfaces as OSError below.
        data = json.loads(path.read_text())
    except (OSError, ValueError):
        # Best effort: no readable/parsable state means "no history".
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return {"services": {}, "agents": {}}

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list or null): unusable.
        return {"services": {}, "agents": {}}

    services = data.get("services")
    if not isinstance(services, dict):
        # Migrate old flat format (services only). Checking the value's type
        # rather than key presence also covers a flat file that happens to
        # contain a service literally named "services".
        return {"services": data, "agents": {}}

    agents = data.get("agents")
    if not isinstance(agents, dict):
        agents = {}
    return {"services": services, "agents": agents}
def _save_states(services: list, agents: list) -> None:
    """Persist the current service and agent statuses for the next run.

    Writes ``service-states.json`` under ``AGENT_OS_DIR/logs/varys-monitor``
    as ``{"services": {name: status}, "agents": {name: status}}``, creating
    the directory if needed.

    Args:
        services: Service records; each must provide ``"name"`` and
            ``"status"`` keys.
        agents: Agent records; each must provide ``"name"`` and ``"status"``
            keys.

    Raises:
        KeyError: If a record lacks ``"name"`` or ``"status"``.
        OSError: If the directory or file cannot be written.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({
        "services": {s["name"]: s["status"] for s in services},
        "agents": {a["name"]: a["status"] for a in agents},
    })
    # Write to a sibling temp file and rename it over the target so a crash
    # mid-write cannot leave a truncated state file behind (the rename is
    # atomic on POSIX filesystems).
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload)
    tmp_path.replace(path)
def _notify_raven(message: str, severity: str): def _notify_raven(message: str, severity: str):
@@ -179,13 +186,25 @@ def main():
agents.append(result) agents.append(result)
prev_states = _load_prev_states() prev_states = _load_prev_states()
prev_services = prev_states.get("services", {})
prev_agents = prev_states.get("agents", {})
for s in services: for s in services:
prev = prev_states.get(s["name"]) prev = prev_services.get(s["name"])
if prev and prev != "down" and s["status"] == "down": if prev and prev != "down" and s["status"] == "down":
_notify_raven(f"{s['name']} is DOWN", "critical") _notify_raven(f"{s['name']} is DOWN", "critical")
elif prev == "down" and s["status"] not in ("down", "degraded"): elif prev == "down" and s["status"] not in ("down", "degraded"):
_notify_raven(f"{s['name']} recovered (UP)", "info") _notify_raven(f"{s['name']} recovered (UP)", "info")
_save_states(services)
for a in agents:
prev = prev_agents.get(a["name"])
if a["status"] == "failure" and prev != "failure":
detail = a.get("result", "no details")
_notify_raven(f"{a['name']} failed: {detail}", "critical")
elif prev == "failure" and a["status"] == "success":
_notify_raven(f"{a['name']} recovered (success)", "info")
_save_states(services, agents)
out_dir = SITES_DIR / "varys" out_dir = SITES_DIR / "varys"
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)