Varys — Status
Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.
- +
diff --git a/config.yaml b/config.yaml
index cdda011..c70bbdf 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,12 +8,16 @@ services:
     url: http://hodor-gateway:8200/health
   - name: citadel-mcp
     url: http://citadel-mcp:8300/sse
+  - name: jon-snow
+    url: http://jon-snow:8900/health
   - name: sam-research
     url: http://sam-research:8500/health
   - name: searxng
     url: http://searxng:8080
   - name: raven-notify
     url: http://raven-notify:8400/health
+  - name: tarly-backup
+    url: http://tarly-backup:8750/health
 
   # Core services
   - name: open-webui
@@ -36,10 +40,6 @@ services:
     url: http://172.27.40.3:7575
   - name: netbox
     url: http://172.27.40.3:8100
-  - name: netbird-dashboard
-    url: http://netbird-dashboard:80
-  - name: netbird-server
-    url: http://netbird-server:80
   - name: influxdb
     url: http://influxdb:8086/health
   - name: grafana
@@ -52,3 +52,9 @@ agents:
   - name: citadel-mcp
   - name: raven-notify
   - name: varys-monitor
+  - name: jon-snow
+  - name: tarly-backup
+
+# Proxmox VM IDs to check for backup freshness (warns if last backup > 8 days old).
+proxmox_backup_vmids:
+  - 106
diff --git a/main.py b/main.py
index 7f2ad63..f6b403a 100644
--- a/main.py
+++ b/main.py
@@ -16,6 +16,27 @@ AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
 RAVEN_URL = os.getenv("RAVEN_URL", "")
 CONFIG_FILE = Path(__file__).parent / "config.yaml"
 
+# Load Proxmox credentials from the mounted keys file (KEY=VALUE lines; blank and "#" lines are skipped)
+def _load_proxmox_config() -> dict:
+    keyfile = Path("/etc/nxm-keys")
+    cfg = {}
+    if keyfile.exists():
+        for line in keyfile.read_text().splitlines():
+            line = line.strip()
+            if line and "=" in line and not line.startswith("#"):
+                k, v = line.split("=", 1)  # split on the first "=" only, so values may contain "="
+                cfg[k.strip()] = v.strip()
+    return cfg
+
+_pve = _load_proxmox_config()
+PROXMOX_HOST = _pve.get("PROXMOX_HOST", "172.27.40.2")
+PROXMOX_USER = _pve.get("PROXMOX_USER", "claude@pve")
+PROXMOX_TOKEN_ID = _pve.get("PROXMOX_TOKEN_ID", "claude-code")
+PROXMOX_TOKEN_SECRET = _pve.get("PROXMOX_TOKEN_SECRET", "")
+PROXMOX_BACKUP_STORAGE = "truenas-backups"
+PROXMOX_BACKUP_NODE = "proxmox"
+PROXMOX_BACKUP_WARN_DAYS = 8  # alert if last backup older than this
+
 
 def _load_prev_states() -> dict:
     path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
@@ -79,11 +100,54 @@ def read_agent_status(name: str) -> dict:
         return {"name": name, "status": "error", "timestamp": None, "result": str(e)}
 
 
-def render_html(services: list, agents: list) -> str:
+def check_proxmox_backups(vmids: list[str]) -> list[dict]:
+    """Check last backup age for each vmid on truenas-backups. Returns status per VM."""
+    if not PROXMOX_TOKEN_SECRET:
+        return [{"vmid": v, "status": "unknown", "reason": "no credentials"} for v in vmids]
+    auth = f"PVEAPIToken={PROXMOX_USER}!{PROXMOX_TOKEN_ID}={PROXMOX_TOKEN_SECRET}"
+    url = f"https://{PROXMOX_HOST}:8006/api2/json/nodes/{PROXMOX_BACKUP_NODE}/storage/{PROXMOX_BACKUP_STORAGE}/content"
+    try:
+        r = httpx.get(url, headers={"Authorization": auth}, verify=False, timeout=10)  # NOTE(review): TLS verification is disabled while sending the API token — confirm this is intended
+        r.raise_for_status()
+        items = r.json().get("data", [])
+    except Exception as e:
+        logger.warning(f"proxmox backup check failed: {e}")
+        return [{"vmid": v, "status": "error", "reason": str(e)} for v in vmids]
+
+    now = datetime.now(timezone.utc)
+    latest: dict[str, dict] = {}  # newest backup seen per vmid, keyed by the vmid as a string
+    for item in items:
+        vid = str(item.get("vmid", ""))
+        ctime = item.get("ctime", 0)
+        if vid and (vid not in latest or ctime > latest[vid]["ctime"]):
+            latest[vid] = {"ctime": ctime, "size_gb": round(item.get("size", 0) / 1024**3, 2)}
+
+    results = []
+    for vid in map(str, vmids):  # config.yaml vmids parse as ints; normalise so they match the string keys of `latest`
+        if vid not in latest:
+            results.append({"vmid": vid, "status": "error", "reason": "no backups found", "age_days": None})
+            continue
+        age_days = (now - datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc)).total_seconds() / 86400
+        last_dt = datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+        status = "ok" if age_days <= PROXMOX_BACKUP_WARN_DAYS else "warning"
+        results.append({
+            "vmid": vid,
+            "status": status,
+            "last_backup": last_dt,
+            "age_days": round(age_days, 1),
+            "size_gb": latest[vid]["size_gb"],
+            "ctime": latest[vid]["ctime"],  # used to detect new backups between runs
+        })
+        logger.info(f"backup VM {vid}: {status} — {round(age_days,1)} days old ({latest[vid]['size_gb']} GB)")
+    return results
+
+
+def render_html(services: list, agents: list, backups: list | None = None) -> str:
     now_iso = datetime.now(timezone.utc).isoformat()
 
     SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
     AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"}
+    BACKUP_COLOURS = {"ok": "#3fb950", "warning": "#d29922", "error": "#f85149", "unknown": "#8b949e"}
 
     service_cards = ""
     for s in services:
@@ -111,6 +175,25 @@ def render_html(services: list, agents: list) -> str:
Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.
- +