Varys — Status
Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.
import json import logging import os import time from datetime import datetime, timezone from pathlib import Path import httpx import yaml logging.basicConfig(level=logging.INFO, format="%(asctime)s [varys] %(message)s") logger = logging.getLogger("varys") SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites")) AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os")) RAVEN_URL = os.getenv("RAVEN_URL", "") CONFIG_FILE = Path(__file__).parent / "config.yaml" # Load Proxmox credentials from mounted keys file def _load_proxmox_config() -> dict: keyfile = Path("/etc/nxm-keys") cfg = {} if keyfile.exists(): for line in keyfile.read_text().splitlines(): line = line.strip() if line and "=" in line and not line.startswith("#"): k, v = line.split("=", 1) cfg[k.strip()] = v.strip() return cfg _pve = _load_proxmox_config() PROXMOX_HOST = _pve.get("PROXMOX_HOST", "172.27.40.2") PROXMOX_USER = _pve.get("PROXMOX_USER", "claude@pve") PROXMOX_TOKEN_ID = _pve.get("PROXMOX_TOKEN_ID", "claude-code") PROXMOX_TOKEN_SECRET = _pve.get("PROXMOX_TOKEN_SECRET", "") PROXMOX_BACKUP_STORAGE = "truenas-backups" PROXMOX_BACKUP_NODE = "proxmox" PROXMOX_BACKUP_WARN_DAYS = 8 # alert if last backup older than this def _load_prev_states() -> dict: path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json" if not path.exists(): return {"services": {}, "agents": {}} try: data = json.loads(path.read_text()) # migrate old flat format (services only) if "services" not in data: return {"services": data, "agents": {}} return data except Exception: return {"services": {}, "agents": {}} def _save_states(services: list, agents: list): path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json" path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps({ "services": {s["name"]: s["status"] for s in services}, "agents": {a["name"]: a["status"] for a in agents}, })) def _notify_raven(message: str, severity: str): if not RAVEN_URL: return try: httpx.post(f"{RAVEN_URL}/notify", json={"message": message, "severity": severity, "source": "varys"}, timeout=5) logger.info(f"raven notified: {message}") except Exception as e: logger.warning(f"raven notify failed (raven not live yet?): {e}") def check_service(name: str, url: str) -> dict: start = time.monotonic() try: # stream=True so we get the status code without reading the body (handles SSE endpoints) with httpx.stream("GET", url, timeout=5, verify=False, follow_redirects=True) as r: ms = int((time.monotonic() - start) * 1000) status = "up" if r.status_code < 500 else "degraded" return {"name": name, "status": status, "code": r.status_code, "ms": ms} except Exception: ms = int((time.monotonic() - start) * 1000) return {"name": name, "status": "down", "code": None, "ms": ms} def read_agent_status(name: str) -> dict: path = AGENT_OS_DIR / "logs" / name / "last-run.json" if not path.exists(): return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"} try: data = json.loads(path.read_text()) return { "name": name, "status": data.get("status", "unknown"), "timestamp": data.get("timestamp", ""), "result": data.get("result", ""), } except Exception as e: return {"name": name, "status": "error", "timestamp": None, "result": str(e)} def check_proxmox_backups(vmids: list[str]) -> list[dict]: """Check last backup age for each vmid on truenas-backups. Returns status per VM.""" if not PROXMOX_TOKEN_SECRET: return [{"vmid": v, "status": "unknown", "reason": "no credentials"} for v in vmids] auth = f"PVEAPIToken={PROXMOX_USER}!{PROXMOX_TOKEN_ID}={PROXMOX_TOKEN_SECRET}" url = f"https://{PROXMOX_HOST}:8006/api2/json/nodes/{PROXMOX_BACKUP_NODE}/storage/{PROXMOX_BACKUP_STORAGE}/content" try: r = httpx.get(url, headers={"Authorization": auth}, verify=False, timeout=10) r.raise_for_status() items = r.json().get("data", []) except Exception as e: logger.warning(f"proxmox backup check failed: {e}") return [{"vmid": v, "status": "error", "reason": str(e)} for v in vmids] now = datetime.now(timezone.utc) latest: dict[str, dict] = {} for item in items: vid = str(item.get("vmid", "")) ctime = item.get("ctime", 0) if vid and (vid not in latest or ctime > latest[vid]["ctime"]): latest[vid] = {"ctime": ctime, "size_gb": round(item.get("size", 0) / 1024**3, 2)} results = [] for vid in vmids: if vid not in latest: results.append({"vmid": vid, "status": "error", "reason": "no backups found", "age_days": None}) continue age_days = (now - datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc)).total_seconds() / 86400 last_dt = datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC") status = "ok" if age_days <= PROXMOX_BACKUP_WARN_DAYS else "warning" results.append({ "vmid": vid, "status": status, "last_backup": last_dt, "age_days": round(age_days, 1), "size_gb": latest[vid]["size_gb"], "ctime": latest[vid]["ctime"], # used to detect new backups between runs }) logger.info(f"backup VM {vid}: {status} — {round(age_days,1)} days old ({latest[vid]['size_gb']} GB)") return results def render_html(services: list, agents: list, backups: list | None = None) -> str: now_iso = datetime.now(timezone.utc).isoformat() SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"} AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"} BACKUP_COLOURS = {"ok": "#3fb950", "warning": "#d29922", "error": "#f85149", "unknown": "#8b949e"} service_cards = "" for s in services: colour = SERVICE_COLOURS.get(s["status"], "#8b949e") label = s["status"].upper() ms_text = f"{s['ms']} ms" if s["status"] != "down" else "—" service_cards += f"""
Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.