Files
varys-monitor/main.py
T
admin e88b2fda06 feat: alert on agent failure/recovery via Raven
Tracks agent statuses in service-states.json alongside services.
Sends critical alert when agent status changes to failure (includes
result/error message from last-run.json). Sends recovery alert on
failure → success transition.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 12:37:50 +02:00

229 lines
8.8 KiB
Python

import html
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx
import yaml
# Process-wide logging: INFO level, every line tagged with "[varys]".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [varys] %(message)s")
logger = logging.getLogger("varys")
# Root directory under which the rendered status page is published
# (overridable through the SITES_DIR environment variable).
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
# Root directory holding the per-agent "logs/<name>/last-run.json" files and
# this monitor's own state (overridable through AGENT_OS_DIR).
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
# Base URL of the Raven notification service; an empty value disables alerts.
RAVEN_URL = os.getenv("RAVEN_URL", "")
# YAML configuration file located next to this module.
CONFIG_FILE = Path(__file__).parent / "config.yaml"
def _load_prev_states() -> dict:
    """Load the statuses recorded by the previous run.

    Reads ``service-states.json`` from the varys-monitor log directory under
    ``AGENT_OS_DIR``.

    Returns:
        A dict with exactly two keys, ``"services"`` and ``"agents"``, each
        mapping a name to the status saved last time. Both maps are empty
        when the file is missing, unreadable, not valid JSON, or does not
        hold a JSON object. A section that is present but is not a JSON
        object is replaced by an empty map, so callers can always call
        ``.get`` on both sections.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    if not path.exists():
        return {"services": {}, "agents": {}}
    try:
        data = json.loads(path.read_text())
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logger.warning(f"could not read previous states from {path}: {exc}")
        return {"services": {}, "agents": {}}
    if not isinstance(data, dict):
        logger.warning(f"ignoring previous states in {path}: expected a JSON object")
        return {"services": {}, "agents": {}}
    # Migrate old flat format (the whole file was the services map).
    if "services" not in data:
        return {"services": data, "agents": {}}
    services = data.get("services")
    agents = data.get("agents")
    return {
        "services": services if isinstance(services, dict) else {},
        "agents": agents if isinstance(agents, dict) else {},
    }
def _save_states(services: list, agents: list):
    """Persist the current statuses so the next run can detect transitions.

    Args:
        services: Service result dicts; ``"name"`` and ``"status"`` are read.
        agents: Agent record dicts; ``"name"`` and ``"status"`` are read.

    The JSON is written to a sibling temporary file and then renamed over
    ``service-states.json``. An interrupted run therefore cannot leave a
    truncated file behind, which the loader would treat as "no history" and
    which would lead to missed or duplicated alerts.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({
        "services": {s["name"]: s["status"] for s in services},
        "agents": {a["name"]: a["status"] for a in agents},
    })
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload)
    # Same-directory rename replaces the target in a single step.
    tmp_path.replace(path)
def _notify_raven(message: str, severity: str):
    """Send an alert to Raven on a best-effort basis.

    Args:
        message: Human-readable alert text.
        severity: Severity label forwarded to Raven as-is (the callers in
            this file pass ``"critical"`` or ``"info"``).

    Does nothing when ``RAVEN_URL`` is empty. Delivery problems, including
    a non-success HTTP status from Raven, are logged as warnings and never
    raised, so a failing notifier cannot abort the monitoring run.
    """
    if not RAVEN_URL:
        return
    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{RAVEN_URL.rstrip('/')}/notify"
    payload = {"message": message, "severity": severity, "source": "varys"}
    try:
        response = httpx.post(endpoint, json=payload, timeout=5)
        # Without this check an error response would be logged as a success.
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: notification must never break the monitor.
        logger.warning(f"raven notify failed (raven not live yet?): {e}")
    else:
        logger.info(f"raven notified: {message}")
def check_service(name: str, url: str) -> dict:
    """Probe one service over HTTP and classify its reachability.

    Args:
        name: Label copied into the result.
        url: Address to request with GET.

    Returns:
        A dict with ``"name"``, ``"status"``, ``"code"`` and ``"ms"``.
        ``"status"`` is ``"up"`` for any response code below 500,
        ``"degraded"`` for 500 and above, and ``"down"`` (with ``"code"``
        set to ``None``) when the request raises. ``"ms"`` is the elapsed
        time in whole milliseconds.
    """
    began = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - began) * 1000)

    try:
        # Streaming yields the status code as soon as the headers arrive, so
        # never-ending bodies (SSE endpoints) do not block the probe.
        with httpx.stream("GET", url, timeout=5, verify=False, follow_redirects=True) as response:
            took = elapsed_ms()
            code = response.status_code
            if code < 500:
                health = "up"
            else:
                health = "degraded"
            return {"name": name, "status": health, "code": code, "ms": took}
    except Exception:
        return {"name": name, "status": "down", "code": None, "ms": elapsed_ms()}
def read_agent_status(name: str) -> dict:
    """Read the most recent run record of an agent.

    Looks for ``logs/<name>/last-run.json`` under ``AGENT_OS_DIR``.

    Args:
        name: Agent name, which is also its log directory name.

    Returns:
        A dict with ``"name"``, ``"status"``, ``"timestamp"`` and
        ``"result"``. When the file is absent the status is ``"unknown"``
        with result ``"no data"``; when it cannot be read or parsed the
        status is ``"error"`` and the result carries the exception text.
    """
    last_run = AGENT_OS_DIR / "logs" / name / "last-run.json"
    if not last_run.exists():
        return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"}

    try:
        record = json.loads(last_run.read_text())
        status = record.get("status", "unknown")
        timestamp = record.get("timestamp", "")
        result = record.get("result", "")
    except Exception as e:
        return {"name": name, "status": "error", "timestamp": None, "result": str(e)}

    return {
        "name": name,
        "status": status,
        "timestamp": timestamp,
        "result": result,
    }
def render_html(services: list, agents: list) -> str:
    """Render the status dashboard as a standalone HTML page.

    Args:
        services: Service check results; ``"name"``, ``"status"`` and
            ``"ms"`` are read from each dict.
        agents: Agent run records; ``"name"``, ``"status"``,
            ``"timestamp"`` and ``"result"`` are read from each dict.

    Returns:
        The complete HTML document as a string.

    Every value taken from ``services`` and ``agents`` is HTML-escaped
    before interpolation. Agent results are free text (often an error
    message) and would otherwise be able to break or inject markup.
    """
    now_iso = datetime.now(timezone.utc).isoformat()
    service_colours = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
    agent_colours = {"success": "#3fb950", "failure": "#f85149"}
    fallback_colour = "#8b949e"

    def esc(value) -> str:
        # quote=True keeps values safe inside attribute values as well.
        return html.escape(str(value), quote=True)

    card_parts = []
    for s in services:
        status = str(s["status"])
        colour = service_colours.get(status, fallback_colour)
        # A latency figure is meaningless when the request never completed.
        ms_text = f"{s['ms']} ms" if status != "down" else ""
        card_parts.append(f"""
<div class="svc">
<span class="dot" style="background:{colour}"></span>
<span class="svc-name">{esc(s['name'])}</span>
<span class="svc-label" style="color:{colour}">{esc(status.upper())}</span>
<span class="svc-ms">{esc(ms_text)}</span>
</div>""")
    service_cards = "".join(card_parts)

    row_parts = []
    for a in agents:
        status = str(a["status"])
        colour = agent_colours.get(status, fallback_colour)
        raw_ts = a["timestamp"]
        # The span is filled in client-side by the script at the page bottom.
        ts = f'<span data-utc="{esc(raw_ts)}"></span>' if raw_ts else ""
        row_parts.append(f"""
<tr>
<td><span class="dot" style="background:{colour}"></span>{esc(a['name'])}</td>
<td style="color:{colour}">{esc(status.upper())}</td>
<td>{ts}</td>
<td class="dim">{esc(a['result'])}</td>
</tr>""")
    agent_rows = "".join(row_parts)

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta http-equiv="refresh" content="900">
<title>Varys — Status</title>
<style>
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 2rem; }}
h1 {{ color: #58a6ff; margin-bottom: 0.25rem; }}
h2 {{ color: #8b949e; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; margin: 2rem 0 0.75rem; }}
.desc {{ color: #8b949e; font-size: 0.9rem; margin: 0.4rem 0 0.25rem; }}
.meta {{ color: #8b949e; font-size: 0.85rem; margin-bottom: 2rem; }}
.meta a {{ color: #8b949e; }}
.dot {{ display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 6px; vertical-align: middle; }}
.svc {{ display: flex; align-items: center; gap: 0.75rem; border: 1px solid #30363d; border-radius: 6px; padding: 0.6rem 1rem; margin-bottom: 0.5rem; }}
.svc-name {{ flex: 1; }}
.svc-label {{ font-size: 0.8rem; min-width: 5rem; }}
.svc-ms {{ color: #8b949e; font-size: 0.8rem; min-width: 4rem; text-align: right; }}
table {{ width: 100%; border-collapse: collapse; }}
td {{ padding: 0.6rem 1rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
.dim {{ color: #8b949e; }}
</style>
</head>
<body>
<h1>Varys — Status</h1>
<p class="desc">Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.</p>
<p class="meta">Updated <span data-utc="{now_iso}"></span> &nbsp;·&nbsp; <a href="/">← home</a></p>
<h2>Services</h2>
{service_cards}
<h2>Agents</h2>
<table><tbody>{agent_rows}</tbody></table>
<script>
document.querySelectorAll('[data-utc]').forEach(el => {{
el.textContent = new Date(el.dataset.utc).toLocaleString(undefined, {{year:'numeric',month:'2-digit',day:'2-digit',hour:'2-digit',minute:'2-digit'}});
}});
</script>
</body>
</html>"""
def _format_agent_timestamp(raw) -> str:
    """Format an agent timestamp as ``YYYY-MM-DD HH:MM UTC`` for the summary."""
    if not raw:
        return "no data"
    text = str(raw)
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        parsed = None
    if parsed is not None and parsed.tzinfo is not None:
        # An explicit offset is known: convert, so the "UTC" label is true.
        return parsed.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M") + " UTC"
    # Naive or unparseable value: the zone is unknown, keep the legacy
    # rendering (first 16 characters, "T" replaced by a space).
    return text[:16].replace("T", " ") + " UTC"


def render_summary_md(services: list, agents: list) -> str:
    """Render a short Markdown summary of service and agent status.

    Args:
        services: Service check results; ``"name"`` and ``"status"`` are read.
        agents: Agent run records; ``"name"``, ``"status"``,
            ``"timestamp"`` and ``"result"`` are read.

    Returns:
        Markdown text with a services section (count of services that are
        up, plus the names of those that are down or degraded) and one
        bullet per agent. Agent timestamps that carry a UTC offset are
        converted to UTC before being labelled as such.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    up = sum(1 for s in services if s["status"] == "up")
    down = [s["name"] for s in services if s["status"] in ("down", "degraded")]
    lines = [
        f"# Varys — Status Summary\n\nUpdated: {now}\n",
        f"## Services: {up}/{len(services)} up",
    ]
    if down:
        lines.append(f"Issues: {', '.join(down)}")
    lines.append("")
    lines.append("## Agents")
    for a in agents:
        ts = _format_agent_timestamp(a["timestamp"])
        lines.append(f"- {a['name']}: {a['status']} ({ts}) — {a['result']}")
    return "\n".join(lines)
def _alert_service_transitions(services: list, prev_services: dict):
    """Notify Raven about services that just went down or just recovered."""
    for s in services:
        prev = prev_services.get(s["name"])
        # A service with no recorded previous status never triggers DOWN.
        if prev and prev != "down" and s["status"] == "down":
            _notify_raven(f"{s['name']} is DOWN", "critical")
        elif prev == "down" and s["status"] not in ("down", "degraded"):
            _notify_raven(f"{s['name']} recovered (UP)", "info")


def _alert_agent_transitions(agents: list, prev_agents: dict):
    """Notify Raven about agents that just started failing or just recovered."""
    for a in agents:
        prev = prev_agents.get(a["name"])
        if a["status"] == "failure" and prev != "failure":
            # "result" is always present but may be empty, so a .get()
            # default would never apply; fall back on falsy values instead.
            detail = a.get("result") or "no details"
            _notify_raven(f"{a['name']} failed: {detail}", "critical")
        elif prev == "failure" and a["status"] == "success":
            _notify_raven(f"{a['name']} recovered (success)", "info")


def main():
    """Run one monitoring pass.

    Steps, in order:
      1. Load ``config.yaml`` and probe every configured service.
      2. Read the last-run record of every configured agent.
      3. Compare with the statuses saved by the previous run and send Raven
         alerts for failure and recovery transitions.
      4. Save the current statuses for the next run.
      5. Write ``index.html`` and ``last-output.md`` under
         ``SITES_DIR/varys``.
      6. Write this monitor's own ``last-run.json`` under its log directory.
    """
    # An empty config file parses to None; treat it as "nothing configured".
    config = yaml.safe_load(CONFIG_FILE.read_text(encoding="utf-8")) or {}

    services = []
    for svc in config.get("services") or []:
        result = check_service(svc["name"], svc["url"])
        logger.info(f"{svc['name']}: {result['status']} ({result['ms']} ms)")
        services.append(result)

    agents = []
    for agent in config.get("agents") or []:
        result = read_agent_status(agent["name"])
        logger.info(f"agent {agent['name']}: {result['status']}")
        agents.append(result)

    prev_states = _load_prev_states()
    _alert_service_transitions(services, prev_states.get("services", {}))
    _alert_agent_transitions(agents, prev_states.get("agents", {}))
    _save_states(services, agents)

    out_dir = SITES_DIR / "varys"
    out_dir.mkdir(parents=True, exist_ok=True)
    # Both documents contain non-ASCII characters and the page declares
    # charset UTF-8, so the encoding must not depend on the process locale.
    (out_dir / "index.html").write_text(render_html(services, agents), encoding="utf-8")
    (out_dir / "last-output.md").write_text(render_summary_md(services, agents), encoding="utf-8")

    down_count = sum(1 for s in services if s["status"] != "up")
    status = {
        "agent": "varys-monitor",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": "success",
        "result": f"{len(services)} services checked, {down_count} not up",
    }
    log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
    log_dir.mkdir(parents=True, exist_ok=True)
    (log_dir / "last-run.json").write_text(json.dumps(status, indent=2), encoding="utf-8")
    logger.info(f"done — {len(services)} services, {down_count} not up")
# Run a single monitoring pass when executed as a script; importing the
# module does not trigger it.
if __name__ == "__main__":
    main()