feat: alert on agent failure/recovery via Raven
Tracks agent statuses in service-states.json alongside the service statuses. Sends a critical alert when an agent's status changes to failure (the alert includes the result/error message from last-run.json). Sends a recovery alert on a failure → success transition. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,17 +20,24 @@ CONFIG_FILE = Path(__file__).parent / "config.yaml"
|
|||||||
def _load_prev_states() -> dict:
|
def _load_prev_states() -> dict:
|
||||||
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
return {}
|
return {"services": {}, "agents": {}}
|
||||||
try:
|
try:
|
||||||
return json.loads(path.read_text())
|
data = json.loads(path.read_text())
|
||||||
|
# migrate old flat format (services only)
|
||||||
|
if "services" not in data:
|
||||||
|
return {"services": data, "agents": {}}
|
||||||
|
return data
|
||||||
except Exception:
|
except Exception:
|
||||||
return {}
|
return {"services": {}, "agents": {}}
|
||||||
|
|
||||||
|
|
||||||
def _save_states(services: list):
|
def _save_states(services: list, agents: list):
|
||||||
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
path.write_text(json.dumps({s["name"]: s["status"] for s in services}))
|
path.write_text(json.dumps({
|
||||||
|
"services": {s["name"]: s["status"] for s in services},
|
||||||
|
"agents": {a["name"]: a["status"] for a in agents},
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
def _notify_raven(message: str, severity: str):
|
def _notify_raven(message: str, severity: str):
|
||||||
@@ -179,13 +186,25 @@ def main():
|
|||||||
agents.append(result)
|
agents.append(result)
|
||||||
|
|
||||||
prev_states = _load_prev_states()
|
prev_states = _load_prev_states()
|
||||||
|
prev_services = prev_states.get("services", {})
|
||||||
|
prev_agents = prev_states.get("agents", {})
|
||||||
|
|
||||||
for s in services:
|
for s in services:
|
||||||
prev = prev_states.get(s["name"])
|
prev = prev_services.get(s["name"])
|
||||||
if prev and prev != "down" and s["status"] == "down":
|
if prev and prev != "down" and s["status"] == "down":
|
||||||
_notify_raven(f"{s['name']} is DOWN", "critical")
|
_notify_raven(f"{s['name']} is DOWN", "critical")
|
||||||
elif prev == "down" and s["status"] not in ("down", "degraded"):
|
elif prev == "down" and s["status"] not in ("down", "degraded"):
|
||||||
_notify_raven(f"{s['name']} recovered (UP)", "info")
|
_notify_raven(f"{s['name']} recovered (UP)", "info")
|
||||||
_save_states(services)
|
|
||||||
|
for a in agents:
|
||||||
|
prev = prev_agents.get(a["name"])
|
||||||
|
if a["status"] == "failure" and prev != "failure":
|
||||||
|
detail = a.get("result", "no details")
|
||||||
|
_notify_raven(f"{a['name']} failed: {detail}", "critical")
|
||||||
|
elif prev == "failure" and a["status"] == "success":
|
||||||
|
_notify_raven(f"{a['name']} recovered (success)", "info")
|
||||||
|
|
||||||
|
_save_states(services, agents)
|
||||||
|
|
||||||
out_dir = SITES_DIR / "varys"
|
out_dir = SITES_DIR / "varys"
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user