"""Varys status monitor.

Checks HTTP reachability of configured services, reads the last-run status of
configured agents, checks the age of Proxmox backups, sends notifications to
Raven on state transitions, and writes an HTML status page plus a Markdown
summary under ``SITES_DIR / "varys"``.
"""
import html
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx
import yaml

logging.basicConfig(level=logging.INFO, format="%(asctime)s [varys] %(message)s")
logger = logging.getLogger("varys")

SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
RAVEN_URL = os.getenv("RAVEN_URL", "")
CONFIG_FILE = Path(__file__).parent / "config.yaml"

# Single location of the persisted state shared by load/save below.
_STATE_PATH = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"


# Load Proxmox credentials from mounted keys file
def _load_proxmox_config() -> dict:
    """Parse ``KEY=VALUE`` lines from ``/etc/nxm-keys`` into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. A missing file yields an empty dict; an unreadable file is
    logged and also yields an empty dict, so callers fall back to defaults
    instead of the module failing at import time.
    """
    keyfile = Path("/etc/nxm-keys")
    cfg: dict[str, str] = {}
    try:
        text = keyfile.read_text()
    except FileNotFoundError:
        return cfg
    except (OSError, UnicodeDecodeError) as e:
        logger.warning("could not read %s: %s", keyfile, e)
        return cfg
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        cfg[k.strip()] = v.strip()
    return cfg


_pve = _load_proxmox_config()
PROXMOX_HOST = _pve.get("PROXMOX_HOST", "172.27.40.2")
PROXMOX_USER = _pve.get("PROXMOX_USER", "claude@pve")
PROXMOX_TOKEN_ID = _pve.get("PROXMOX_TOKEN_ID", "claude-code")
PROXMOX_TOKEN_SECRET = _pve.get("PROXMOX_TOKEN_SECRET", "")
PROXMOX_BACKUP_STORAGE = "truenas-backups"
PROXMOX_BACKUP_NODE = "proxmox"
PROXMOX_BACKUP_WARN_DAYS = 8  # alert if last backup older than this


def _dict_or_empty(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _load_prev_states() -> dict:
    """Load the state saved by the previous run.

    Returns a dict with at least ``"services"`` and ``"agents"`` keys. A
    missing, unreadable or malformed state file yields empty mappings. The
    old flat format (a bare service-name -> status mapping) is migrated on
    the fly.
    """
    empty = {"services": {}, "agents": {}}
    try:
        data = json.loads(_STATE_PATH.read_text())
    except FileNotFoundError:
        return empty
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("could not load previous states: %s", e)
        return empty
    if not isinstance(data, dict):
        # Valid JSON but not an object: nothing usable in it.
        return empty
    # migrate old flat format (services only)
    if "services" not in data:
        return {"services": data, "agents": {}}
    return data


def _save_states(
    services: list,
    agents: list,
    backups: list | None = None,
    backup_ctimes: dict | None = None,
):
    """Persist current statuses so the next run can detect transitions.

    Args:
        services: Result dicts with ``name`` and ``status`` keys.
        agents: Result dicts with ``name`` and ``status`` keys.
        backups: Optional backup result dicts (``vmid``, ``status``). When
            given, ``"backups"`` and ``"backup_ctimes"`` are written too.
        backup_ctimes: Optional vmid -> ctime mapping to store; defaults to
            each backup's own ``ctime`` (0 when absent).
    """
    state = {
        "services": {s["name"]: s["status"] for s in services},
        "agents": {a["name"]: a["status"] for a in agents},
    }
    if backups is not None:
        state["backups"] = {b["vmid"]: b["status"] for b in backups}
        if backup_ctimes is None:
            backup_ctimes = {b["vmid"]: b.get("ctime", 0) for b in backups}
        state["backup_ctimes"] = backup_ctimes
    _STATE_PATH.parent.mkdir(parents=True, exist_ok=True)
    _STATE_PATH.write_text(json.dumps(state))


def _notify_raven(message: str, severity: str):
    """POST a notification to Raven; failures are logged, never raised.

    Does nothing when ``RAVEN_URL`` is empty.
    """
    if not RAVEN_URL:
        return
    try:
        resp = httpx.post(
            f"{RAVEN_URL}/notify",
            json={"message": message, "severity": severity, "source": "varys"},
            timeout=5,
        )
        # Without this a 4xx/5xx answer would be logged as a delivered alert.
        resp.raise_for_status()
    except Exception as e:  # best effort: monitoring must not die on notify
        logger.warning("raven notify failed (raven not live yet?): %s", e)
    else:
        logger.info("raven notified: %s", message)


def check_service(name: str, url: str) -> dict:
    """Probe *url* with a GET and classify the service.

    Returns a dict with ``name``, ``status`` (``"up"`` for status codes below
    500, ``"degraded"`` for 5xx, ``"down"`` when the request raised), ``code``
    (HTTP status or ``None``) and ``ms`` (elapsed milliseconds).
    """
    start = time.monotonic()
    try:
        # stream=True so we get the status code without reading the body (handles SSE endpoints)
        with httpx.stream("GET", url, timeout=5, verify=False, follow_redirects=True) as r:
            ms = int((time.monotonic() - start) * 1000)
            status = "up" if r.status_code < 500 else "degraded"
            return {"name": name, "status": status, "code": r.status_code, "ms": ms}
    except Exception:  # any failure to get a response counts as "down"
        ms = int((time.monotonic() - start) * 1000)
        return {"name": name, "status": "down", "code": None, "ms": ms}


def read_agent_status(name: str) -> dict:
    """Read ``logs/<name>/last-run.json`` under ``AGENT_OS_DIR``.

    Returns a dict with ``name``, ``status``, ``timestamp`` and ``result``.
    A missing file gives status ``"unknown"``; an unreadable or malformed
    file gives status ``"error"`` with the exception text as ``result``.
    """
    path = AGENT_OS_DIR / "logs" / name / "last-run.json"
    if not path.exists():
        return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"}
    try:
        data = json.loads(path.read_text())
        return {
            "name": name,
            "status": data.get("status", "unknown"),
            "timestamp": data.get("timestamp", ""),
            "result": data.get("result", ""),
        }
    except Exception as e:  # bad JSON, non-object JSON, I/O error
        return {"name": name, "status": "error", "timestamp": None, "result": str(e)}


def check_proxmox_backups(vmids: list[str]) -> list[dict]:
    """Check last backup age for each vmid on truenas-backups.

    Returns status per VM: ``"ok"`` when the newest item is at most
    ``PROXMOX_BACKUP_WARN_DAYS`` days old, ``"warning"`` when older,
    ``"error"`` when the API call failed or no item exists for the vmid, and
    ``"unknown"`` when no token secret is configured.
    """
    if not PROXMOX_TOKEN_SECRET:
        return [{"vmid": v, "status": "unknown", "reason": "no credentials"} for v in vmids]

    auth = f"PVEAPIToken={PROXMOX_USER}!{PROXMOX_TOKEN_ID}={PROXMOX_TOKEN_SECRET}"
    url = (
        f"https://{PROXMOX_HOST}:8006/api2/json/nodes/{PROXMOX_BACKUP_NODE}"
        f"/storage/{PROXMOX_BACKUP_STORAGE}/content"
    )
    try:
        # NOTE(review): verify=False disables TLS verification - presumably
        # because the host uses a self-signed certificate; confirm.
        r = httpx.get(url, headers={"Authorization": auth}, verify=False, timeout=10)
        r.raise_for_status()
        # "data" may be null; treat that like an empty listing.
        items = r.json().get("data") or []
    except Exception as e:
        logger.warning("proxmox backup check failed: %s", e)
        return [{"vmid": v, "status": "error", "reason": str(e)} for v in vmids]

    now = datetime.now(timezone.utc)

    # Keep only the newest item per vmid.
    latest: dict[str, dict] = {}
    for item in items:
        vid = str(item.get("vmid", ""))
        ctime = item.get("ctime", 0)
        if vid and (vid not in latest or ctime > latest[vid]["ctime"]):
            latest[vid] = {"ctime": ctime, "size_gb": round(item.get("size", 0) / 1024**3, 2)}

    results = []
    for vid in vmids:
        newest = latest.get(vid)
        if newest is None:
            results.append(
                {"vmid": vid, "status": "error", "reason": "no backups found", "age_days": None}
            )
            continue
        backup_dt = datetime.fromtimestamp(newest["ctime"], tz=timezone.utc)
        age_days = (now - backup_dt).total_seconds() / 86400
        status = "ok" if age_days <= PROXMOX_BACKUP_WARN_DAYS else "warning"
        results.append({
            "vmid": vid,
            "status": status,
            "last_backup": backup_dt.strftime("%Y-%m-%d %H:%M UTC"),
            "age_days": round(age_days, 1),
            "size_gb": newest["size_gb"],
            "ctime": newest["ctime"],  # used to detect new backups between runs
        })
        logger.info(
            "backup VM %s: %s — %s days old (%s GB)",
            vid, status, round(age_days, 1), newest["size_gb"],
        )
    return results


def _esc(value) -> str:
    """HTML-escape *value* after converting it to text."""
    return html.escape(str(value))


def render_html(services: list, agents: list, backups: list | None = None) -> str:
    """Render the status page from service, agent and backup results.

    All interpolated values are HTML-escaped, since agent results and error
    reasons are free text (e.g. exception messages) that may contain ``<``.

    NOTE(review): in the copy of this file under review the templates below
    contain only text and placeholders - the tags and attributes are not
    visible. ``now_iso`` and the ``colour`` lookups presumably feed that
    markup. Restore the original markup before merging this block.
    """
    now_iso = datetime.now(timezone.utc).isoformat()  # noqa: F841 - see NOTE above

    SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
    AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"}
    BACKUP_COLOURS = {"ok": "#3fb950", "warning": "#d29922", "error": "#f85149", "unknown": "#8b949e"}

    card_parts = []
    for s in services:
        colour = SERVICE_COLOURS.get(s["status"], "#8b949e")  # noqa: F841
        label = _esc(s["status"].upper())
        ms_text = f"{s['ms']} ms" if s["status"] != "down" else "—"
        card_parts.append(f"""
{_esc(s['name'])} {label} {ms_text}
""")
    service_cards = "".join(card_parts)

    row_parts = []
    for a in agents:
        colour = AGENT_COLOURS.get(a["status"], "#8b949e")  # noqa: F841
        raw_ts = a["timestamp"]
        # The visible template renders nothing for a present timestamp.
        ts = "" if raw_ts else "—"
        row_parts.append(
            f""" {_esc(a['name'])} {_esc(a['status'].upper())} {ts} {_esc(a['result'])} """
        )
    agent_rows = "".join(row_parts)

    backup_parts = []
    for b in backups or []:
        colour = BACKUP_COLOURS.get(b["status"], "#8b949e")  # noqa: F841
        age = f"{b['age_days']}d" if b.get("age_days") is not None else "—"
        size = f"{b['size_gb']} GB" if b.get("size_gb") else "—"
        last = _esc(b.get("last_backup", b.get("reason", "—")))
        backup_parts.append(
            f""" VM {_esc(b['vmid'])} {_esc(b['status'].upper())} {last} {age} ago  ·  {size} """
        )
    backup_rows = "".join(backup_parts)

    backup_section = f"""

Proxmox Backups

{backup_rows}
""" if backups else ""

    return f""" Varys — Status

Varys — Status

Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.

Updated

Services

{service_cards}

Agents

{agent_rows}
{backup_section}
"""


def render_summary_md(services: list, agents: list) -> str:
    """Render a short Markdown summary of service and agent status."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    up = sum(1 for s in services if s["status"] == "up")
    down = [s["name"] for s in services if s["status"] in ("down", "degraded")]

    lines = [f"# Varys — Status Summary\n\nUpdated: {now}\n"]
    lines.append(f"## Services: {up}/{len(services)} up")
    if down:
        lines.append(f"Issues: {', '.join(down)}")
    lines.append("")
    lines.append("## Agents")
    for a in agents:
        raw_ts = a["timestamp"]
        # str() so a non-string timestamp in last-run.json cannot break slicing.
        ts = str(raw_ts)[:16].replace("T", " ") + " UTC" if raw_ts else "no data"
        lines.append(f"- {a['name']}: {a['status']} ({ts}) — {a['result']}")
    return "\n".join(lines)


def _alert_service_changes(services: list, prev_services: dict) -> None:
    """Notify Raven when a service goes down or comes back up."""
    for s in services:
        prev = prev_services.get(s["name"])
        if prev and prev != "down" and s["status"] == "down":
            _notify_raven(f"{s['name']} is DOWN", "critical")
        elif prev == "down" and s["status"] not in ("down", "degraded"):
            _notify_raven(f"{s['name']} recovered (UP)", "info")


def _alert_agent_changes(agents: list, prev_agents: dict) -> None:
    """Notify Raven when an agent starts failing or recovers."""
    for a in agents:
        prev = prev_agents.get(a["name"])
        if a["status"] == "failure" and prev != "failure":
            detail = a.get("result", "no details")
            _notify_raven(f"{a['name']} failed: {detail}", "critical")
        elif prev == "failure" and a["status"] == "success":
            _notify_raven(f"{a['name']} recovered (success)", "info")


def _alert_backup_changes(backups: list, prev_backups: dict, prev_ctimes: dict) -> dict:
    """Notify Raven about new, overdue and recovered backups.

    Returns the vmid -> ctime mapping to persist. When the current check has
    no ctime for a VM (API failure, no credentials, no backups found) the
    previously recorded ctime is kept, so the next successful check does not
    report an old backup as newly completed.
    """
    ctimes = {}
    for b in backups:
        vid = b["vmid"]
        prev_status = prev_backups.get(vid)
        prev_ctime = prev_ctimes.get(vid) or 0
        curr_ctime = b.get("ctime", 0)

        # New backup detected — ctime is newer than last recorded
        if curr_ctime and curr_ctime > prev_ctime:
            _notify_raven(
                f"✅ Proxmox backup completed — VM {vid} ({b.get('size_gb', '?')} GB) at {b.get('last_backup', '?')}",
                "info",
            )

        # Backup gone overdue or missing
        if b["status"] in ("warning", "error") and prev_status not in ("warning", "error"):
            age = f"{b['age_days']}d" if b.get("age_days") is not None else "unknown age"
            reason = b.get("reason") or f"last backup {age} ago"
            _notify_raven(f"⚠️ Proxmox backup VM {vid}: {b['status'].upper()} — {reason}", "critical")
        elif prev_status in ("warning", "error") and b["status"] == "ok":
            _notify_raven(
                f"✅ Proxmox backup VM {vid}: recovered (last backup {b.get('age_days', 0)}d ago)",
                "info",
            )

        ctimes[vid] = curr_ctime or prev_ctime
    return ctimes


def main():
    """Run one monitoring pass: check, alert, persist state, write outputs."""
    # safe_load returns None for an empty file; treat that as "nothing configured".
    config = yaml.safe_load(CONFIG_FILE.read_text()) or {}

    services = []
    for svc in config.get("services", []):
        result = check_service(svc["name"], svc["url"])
        logger.info("%s: %s (%s ms)", svc["name"], result["status"], result["ms"])
        services.append(result)

    agents = []
    for agent in config.get("agents", []):
        result = read_agent_status(agent["name"])
        logger.info("agent %s: %s", agent["name"], result["status"])
        agents.append(result)

    # Check Proxmox backups
    backup_vmids = [str(v) for v in config.get("proxmox_backup_vmids", [])]
    backups = check_proxmox_backups(backup_vmids) if backup_vmids else []

    prev_states = _load_prev_states()
    _alert_service_changes(services, _dict_or_empty(prev_states.get("services")))
    _alert_agent_changes(agents, _dict_or_empty(prev_states.get("agents")))
    backup_ctimes = _alert_backup_changes(
        backups,
        _dict_or_empty(prev_states.get("backups")),
        _dict_or_empty(prev_states.get("backup_ctimes")),
    )

    # One write covers services, agents, backup states and ctimes.
    _save_states(services, agents, backups, backup_ctimes)

    out_dir = SITES_DIR / "varys"
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "index.html").write_text(render_html(services, agents, backups))
    (out_dir / "last-output.md").write_text(render_summary_md(services, agents))

    down_count = sum(1 for s in services if s["status"] != "up")
    backup_issues = sum(1 for b in backups if b["status"] != "ok")
    status = {
        "agent": "varys-monitor",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": "success",
        "result": f"{len(services)} services checked, {down_count} not up; {len(backups)} backups checked, {backup_issues} issues",
    }
    log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
    log_dir.mkdir(parents=True, exist_ok=True)
    (log_dir / "last-run.json").write_text(json.dumps(status, indent=2))
    logger.info(
        "done — %s services, %s not up; %s backups, %s issues",
        len(services), down_count, len(backups), backup_issues,
    )


if __name__ == "__main__":
    main()