feat: alert on agent failure/recovery via Raven
Tracks agent statuses in service-states.json alongside the service statuses. Sends a critical alert when an agent's status changes to failure (the alert includes the result/error message from last-run.json). Sends a recovery alert on a failure → success transition. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,17 +20,24 @@ CONFIG_FILE = Path(__file__).parent / "config.yaml"
|
||||
def _load_prev_states() -> dict:
|
||||
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
||||
if not path.exists():
|
||||
return {}
|
||||
return {"services": {}, "agents": {}}
|
||||
try:
|
||||
return json.loads(path.read_text())
|
||||
data = json.loads(path.read_text())
|
||||
# migrate old flat format (services only)
|
||||
if "services" not in data:
|
||||
return {"services": data, "agents": {}}
|
||||
return data
|
||||
except Exception:
|
||||
return {}
|
||||
return {"services": {}, "agents": {}}
|
||||
|
||||
|
||||
def _save_states(services: list):
|
||||
def _save_states(services: list, agents: list):
|
||||
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps({s["name"]: s["status"] for s in services}))
|
||||
path.write_text(json.dumps({
|
||||
"services": {s["name"]: s["status"] for s in services},
|
||||
"agents": {a["name"]: a["status"] for a in agents},
|
||||
}))
|
||||
|
||||
|
||||
def _notify_raven(message: str, severity: str):
|
||||
@@ -179,13 +186,25 @@ def main():
|
||||
agents.append(result)
|
||||
|
||||
prev_states = _load_prev_states()
|
||||
prev_services = prev_states.get("services", {})
|
||||
prev_agents = prev_states.get("agents", {})
|
||||
|
||||
for s in services:
|
||||
prev = prev_states.get(s["name"])
|
||||
prev = prev_services.get(s["name"])
|
||||
if prev and prev != "down" and s["status"] == "down":
|
||||
_notify_raven(f"{s['name']} is DOWN", "critical")
|
||||
elif prev == "down" and s["status"] not in ("down", "degraded"):
|
||||
_notify_raven(f"{s['name']} recovered (UP)", "info")
|
||||
_save_states(services)
|
||||
|
||||
for a in agents:
|
||||
prev = prev_agents.get(a["name"])
|
||||
if a["status"] == "failure" and prev != "failure":
|
||||
detail = a.get("result", "no details")
|
||||
_notify_raven(f"{a['name']} failed: {detail}", "critical")
|
||||
elif prev == "failure" and a["status"] == "success":
|
||||
_notify_raven(f"{a['name']} recovered (success)", "info")
|
||||
|
||||
_save_states(services, agents)
|
||||
|
||||
out_dir = SITES_DIR / "varys"
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user