feat: alert on agent failure/recovery via Raven

Tracks agent statuses in service-states.json alongside services.
Sends critical alert when agent status changes to failure (includes
result/error message from last-run.json). Sends recovery alert on
failure → success transition.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
nxm
2026-05-06 12:37:50 +02:00
parent 32b4c72407
commit e88b2fda06
+26 -7
View File
@@ -20,17 +20,24 @@ CONFIG_FILE = Path(__file__).parent / "config.yaml"
def _load_prev_states() -> dict:
    """Load the statuses persisted by the previous monitor run.

    Reads ``service-states.json`` under ``AGENT_OS_DIR/logs/varys-monitor``.

    Returns:
        A dict with exactly two keys, ``"services"`` and ``"agents"``, each
        holding the mapping stored under that key in the file. Both mappings
        are empty when the file is missing, unreadable, not valid JSON, or
        not shaped as expected, so callers can always call ``.get(name)`` on
        them.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    try:
        # A missing file raises FileNotFoundError (an OSError); malformed
        # JSON raises json.JSONDecodeError (a ValueError).
        data = json.loads(path.read_text())
    except (OSError, ValueError):
        return {"services": {}, "agents": {}}

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list or a number).
        return {"services": {}, "agents": {}}

    if "services" not in data:
        # Migrate the old flat format, where the whole file was the
        # services mapping and agents were not tracked.
        return {"services": data, "agents": {}}

    services = data["services"]
    agents = data.get("agents")
    return {
        "services": services if isinstance(services, dict) else {},
        "agents": agents if isinstance(agents, dict) else {},
    }
def _save_states(services: list, agents: list):
    """Persist the current service and agent statuses for the next run.

    Writes ``service-states.json`` under ``AGENT_OS_DIR/logs/varys-monitor``,
    creating the directory if needed. The file holds one JSON object with a
    ``"services"`` and an ``"agents"`` key, each mapping ``name`` to
    ``status``.

    Args:
        services: Items that each provide ``"name"`` and ``"status"``.
        agents: Items that each provide ``"name"`` and ``"status"``.
    """
    state_file = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    state_file.parent.mkdir(parents=True, exist_ok=True)

    service_statuses = {svc["name"]: svc["status"] for svc in services}
    agent_statuses = {agent["name"]: agent["status"] for agent in agents}
    payload = json.dumps({
        "services": service_statuses,
        "agents": agent_statuses,
    })
    state_file.write_text(payload)
def _notify_raven(message: str, severity: str):
@@ -179,13 +186,25 @@ def main():
agents.append(result)
prev_states = _load_prev_states()
prev_services = prev_states.get("services", {})
prev_agents = prev_states.get("agents", {})
for s in services:
prev = prev_states.get(s["name"])
prev = prev_services.get(s["name"])
if prev and prev != "down" and s["status"] == "down":
_notify_raven(f"{s['name']} is DOWN", "critical")
elif prev == "down" and s["status"] not in ("down", "degraded"):
_notify_raven(f"{s['name']} recovered (UP)", "info")
_save_states(services)
for a in agents:
prev = prev_agents.get(a["name"])
if a["status"] == "failure" and prev != "failure":
detail = a.get("result", "no details")
_notify_raven(f"{a['name']} failed: {detail}", "critical")
elif prev == "failure" and a["status"] == "success":
_notify_raven(f"{a['name']} recovered (success)", "info")
_save_states(services, agents)
out_dir = SITES_DIR / "varys"
out_dir.mkdir(parents=True, exist_ok=True)