"""Varys status monitor.

Probes every configured service over HTTP, reads the last-run record of
every configured agent, and writes an HTML status page, a Markdown summary
and this monitor's own last-run record.
"""

import json
import logging
import os
import time
from datetime import datetime, timezone
from html import escape
from pathlib import Path

import httpx
import yaml

logging.basicConfig(level=logging.INFO, format="%(asctime)s [varys] %(message)s")
logger = logging.getLogger("varys")

SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
CONFIG_FILE = Path(__file__).parent / "config.yaml"


def check_service(name: str, url: str) -> dict:
    """Probe *url* with a single GET request and classify the outcome.

    Args:
        name: Label copied into the returned record.
        url: Address to request.

    Returns:
        A dict with keys ``name``, ``status``, ``code`` and ``ms``.
        ``status`` is ``"up"`` for a status code below 500, ``"degraded"``
        for 500 and above, and ``"down"`` when the request raised. ``code``
        is the HTTP status code (``None`` when down) and ``ms`` is the
        elapsed time in whole milliseconds.
    """
    start = time.monotonic()
    try:
        # Streaming yields the status code without reading the body, so
        # endpoints that never finish their response (SSE) do not block.
        # NOTE(review): verify=False disables TLS certificate verification;
        # presumably needed for self-signed internal hosts -- confirm.
        with httpx.stream(
            "GET", url, timeout=5, verify=False, follow_redirects=True
        ) as r:
            ms = int((time.monotonic() - start) * 1000)
            status = "up" if r.status_code < 500 else "degraded"
            return {"name": name, "status": status, "code": r.status_code, "ms": ms}
    except Exception as exc:
        # Deliberately broad: any failure to get a response counts as "down".
        ms = int((time.monotonic() - start) * 1000)
        logger.debug("%s: request failed: %s", name, exc)
        return {"name": name, "status": "down", "code": None, "ms": ms}


def read_agent_status(name: str) -> dict:
    """Load the last-run record of agent *name*.

    Reads ``AGENT_OS_DIR/logs/<name>/last-run.json``.

    Returns:
        A dict with keys ``name``, ``status``, ``timestamp`` and ``result``.
        A missing file gives status ``"unknown"``; a file that cannot be read
        or parsed gives status ``"error"`` with the exception text as the
        result. ``status`` is always a string, and ``timestamp`` is a string
        or ``None``, so the renderers can call string methods on them.
    """
    path = AGENT_OS_DIR / "logs" / name / "last-run.json"
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        status = data.get("status")
        timestamp = data.get("timestamp", "")
        if timestamp is not None and not isinstance(timestamp, str):
            timestamp = str(timestamp)
        return {
            "name": name,
            "status": "unknown" if status is None else str(status),
            "timestamp": timestamp,
            "result": data.get("result", ""),
        }
    except FileNotFoundError:
        return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"}
    except Exception as e:
        # Covers unreadable files, invalid JSON and non-object JSON documents.
        return {"name": name, "status": "error", "timestamp": None, "result": str(e)}


def render_html(services: list, agents: list) -> str:
    """Render the status page from service and agent records.

    Args:
        services: Records shaped like the return value of ``check_service``.
        agents: Records shaped like the return value of ``read_agent_status``.

    Returns:
        The page text. Every interpolated value is HTML-escaped so that names
        or result messages containing ``<``, ``>`` or ``&`` cannot break or
        inject markup.
    """
    # NOTE(review): now_iso and the colour lookups below are computed but not
    # referenced by the template text as it stands -- confirm whether markup
    # that used them has been lost.
    now_iso = datetime.now(timezone.utc).isoformat()

    SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
    AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"}

    cards = []
    for s in services:
        colour = SERVICE_COLOURS.get(s["status"], "#8b949e")
        label = escape(str(s["status"]).upper())
        ms_text = f"{s['ms']} ms" if s["status"] != "down" else "—"
        cards.append(f"""
{escape(str(s['name']))} {label} {ms_text}
""")
    service_cards = "".join(cards)

    rows = []
    for a in agents:
        colour = AGENT_COLOURS.get(a["status"], "#8b949e")
        raw_ts = a["timestamp"]
        # NOTE(review): a present timestamp renders as empty text here and
        # only a missing one shows a dash -- confirm this is intended.
        ts = "" if raw_ts else "—"
        rows.append(
            f""" {escape(str(a['name']))} {escape(str(a['status']).upper())} {ts} {escape(str(a['result']))} """
        )
    agent_rows = "".join(rows)

    return f""" Varys — Status

Varys — Status

Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.

Updated  ·  ← home

Services

{service_cards}

Agents

{agent_rows}
"""


def render_summary_md(services: list, agents: list) -> str:
    """Render a Markdown summary of service and agent state.

    Args:
        services: Records shaped like the return value of ``check_service``.
        agents: Records shaped like the return value of ``read_agent_status``.

    Returns:
        Markdown text with an up-count for services, a list of services that
        are down or degraded, and one bullet per agent.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    up = sum(1 for s in services if s["status"] == "up")
    troubled = [str(s["name"]) for s in services if s["status"] in ("down", "degraded")]

    lines = [f"# Varys — Status Summary\n\nUpdated: {now}\n"]
    lines.append(f"## Services: {up}/{len(services)} up")
    if troubled:
        lines.append(f"Issues: {', '.join(troubled)}")
    lines.append("")
    lines.append("## Agents")
    for a in agents:
        if a["timestamp"]:
            # Keep "YYYY-MM-DDTHH:MM" of the timestamp; str() guards against
            # a non-string value (e.g. a number) in the record.
            ts = str(a["timestamp"])[:16].replace("T", " ") + " UTC"
        else:
            ts = "no data"
        lines.append(f"- {a['name']}: {a['status']} ({ts}) — {a['result']}")
    return "\n".join(lines)


def main():
    """Run one monitoring pass and write all output files.

    Reads ``config.yaml`` next to this file, checks each entry under
    ``services`` and ``agents``, writes ``index.html`` and ``last-output.md``
    under ``SITES_DIR/varys``, and records this run in
    ``AGENT_OS_DIR/logs/varys-monitor/last-run.json``.
    """
    # safe_load returns None for an empty file; treat that as "nothing configured".
    config = yaml.safe_load(CONFIG_FILE.read_text(encoding="utf-8")) or {}

    services = []
    for svc in config.get("services", []):
        result = check_service(svc["name"], svc["url"])
        logger.info("%s: %s (%s ms)", svc["name"], result["status"], result["ms"])
        services.append(result)

    agents = []
    for agent in config.get("agents", []):
        result = read_agent_status(agent["name"])
        logger.info("agent %s: %s", agent["name"], result["status"])
        agents.append(result)

    out_dir = SITES_DIR / "varys"
    out_dir.mkdir(parents=True, exist_ok=True)
    # The rendered text contains non-ASCII characters, so the encoding is
    # pinned instead of being left to the process locale.
    (out_dir / "index.html").write_text(render_html(services, agents), encoding="utf-8")
    (out_dir / "last-output.md").write_text(
        render_summary_md(services, agents), encoding="utf-8"
    )

    down_count = sum(1 for s in services if s["status"] != "up")
    status = {
        "agent": "varys-monitor",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": "success",
        "result": f"{len(services)} services checked, {down_count} not up",
    }
    log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
    log_dir.mkdir(parents=True, exist_ok=True)
    (log_dir / "last-run.json").write_text(json.dumps(status, indent=2), encoding="utf-8")
    logger.info("done — %s services, %s not up", len(services), down_count)


if __name__ == "__main__":
    main()