diff --git a/config.yaml b/config.yaml
index cdda011..c70bbdf 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,12 +8,16 @@ services:
     url: http://hodor-gateway:8200/health
   - name: citadel-mcp
     url: http://citadel-mcp:8300/sse
+  - name: jon-snow
+    url: http://jon-snow:8900/health
   - name: sam-research
     url: http://sam-research:8500/health
   - name: searxng
     url: http://searxng:8080
   - name: raven-notify
     url: http://raven-notify:8400/health
+  - name: tarly-backup
+    url: http://tarly-backup:8750/health
 
   # Core services
   - name: open-webui
@@ -36,10 +40,6 @@ services:
     url: http://172.27.40.3:7575
   - name: netbox
     url: http://172.27.40.3:8100
-  - name: netbird-dashboard
-    url: http://netbird-dashboard:80
-  - name: netbird-server
-    url: http://netbird-server:80
   - name: influxdb
     url: http://influxdb:8086/health
   - name: grafana
@@ -52,3 +52,9 @@ agents:
   - name: citadel-mcp
   - name: raven-notify
   - name: varys-monitor
+  - name: jon-snow
+  - name: tarly-backup
+
+# Proxmox VM IDs to check for backup freshness (warns if last backup > 8 days old).
+proxmox_backup_vmids:
+  - 106
diff --git a/main.py b/main.py
index 7f2ad63..f6b403a 100644
--- a/main.py
+++ b/main.py
@@ -16,6 +16,27 @@ AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
 RAVEN_URL = os.getenv("RAVEN_URL", "")
 CONFIG_FILE = Path(__file__).parent / "config.yaml"
 
+# Load Proxmox credentials (KEY=VALUE lines) from the keys file mounted into the container by run.sh
+def _load_proxmox_config() -> dict:
+    keyfile = Path("/etc/proxmox-keys")  # must match the --volume target in run.sh
+    cfg = {}
+    if keyfile.exists():
+        for line in keyfile.read_text().splitlines():
+            line = line.strip()
+            if line and "=" in line and not line.startswith("#"):  # skip blanks, comments and lines without "="
+                k, v = line.split("=", 1)  # split on the first "=" only, so values may contain "="
+                cfg[k.strip()] = v.strip()
+    return cfg
+
+_pve = _load_proxmox_config()
+PROXMOX_HOST = _pve.get("PROXMOX_HOST", "172.27.40.2")
+PROXMOX_USER = _pve.get("PROXMOX_USER", "claude@pve")
+PROXMOX_TOKEN_ID = _pve.get("PROXMOX_TOKEN_ID", "claude-code")
+PROXMOX_TOKEN_SECRET = _pve.get("PROXMOX_TOKEN_SECRET", "")  # empty -> backup checks report "unknown"
+PROXMOX_BACKUP_STORAGE = "truenas-backups"
+PROXMOX_BACKUP_NODE = "proxmox"
+PROXMOX_BACKUP_WARN_DAYS = 8  # alert if last backup older than this
+
 
 def _load_prev_states() -> dict:
     path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
@@ -79,11 +100,54 @@ def read_agent_status(name: str) -> dict:
         return {"name": name, "status": "error", "timestamp": None, "result": str(e)}
 
 
-def render_html(services: list, agents: list) -> str:
+def check_proxmox_backups(vmids: list[str]) -> list[dict]:
+    """Return one status dict per vmid (ok / warning / error / unknown) based on the newest backup on the backup storage."""
+    if not PROXMOX_TOKEN_SECRET:
+        return [{"vmid": v, "status": "unknown", "reason": "no credentials"} for v in vmids]
+    auth = f"PVEAPIToken={PROXMOX_USER}!{PROXMOX_TOKEN_ID}={PROXMOX_TOKEN_SECRET}"
+    url = f"https://{PROXMOX_HOST}:8006/api2/json/nodes/{PROXMOX_BACKUP_NODE}/storage/{PROXMOX_BACKUP_STORAGE}/content"
+    try:  # NOTE(review): verify=False skips TLS certificate validation — confirm this is intended
+        r = httpx.get(url, params={"content": "backup"}, headers={"Authorization": auth}, verify=False, timeout=10)
+        r.raise_for_status()
+        items = r.json().get("data") or []  # tolerate an explicit null "data" field
+    except Exception as e:
+        logger.warning(f"proxmox backup check failed: {e}")
+        return [{"vmid": v, "status": "error", "reason": str(e)} for v in vmids]
+
+    now = datetime.now(timezone.utc)
+    latest: dict[str, dict] = {}  # vmid -> newest backup seen so far
+    for item in items:
+        vid = str(item.get("vmid", ""))
+        ctime = item.get("ctime", 0)
+        if vid and (vid not in latest or ctime > latest[vid]["ctime"]):
+            latest[vid] = {"ctime": ctime, "size_gb": round(item.get("size", 0) / 1024**3, 2)}
+
+    results = []
+    for vid in vmids:
+        if vid not in latest:
+            results.append({"vmid": vid, "status": "error", "reason": "no backups found", "age_days": None})
+            continue
+        age_days = (now - datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc)).total_seconds() / 86400
+        last_dt = datetime.fromtimestamp(latest[vid]["ctime"], tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+        status = "ok" if age_days <= PROXMOX_BACKUP_WARN_DAYS else "warning"
+        results.append({
+            "vmid": vid,
+            "status": status,
+            "last_backup": last_dt,
+            "age_days": round(age_days, 1),
+            "size_gb": latest[vid]["size_gb"],
+            "ctime": latest[vid]["ctime"],  # used to detect new backups between runs
+        })
+        logger.info(f"backup VM {vid}: {status} — {round(age_days,1)} days old ({latest[vid]['size_gb']} GB)")
+    return results
+
+
+def render_html(services: list, agents: list, backups: list | None = None) -> str:
     now_iso = datetime.now(timezone.utc).isoformat()
 
     SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
     AGENT_COLOURS = {"success": 
"#3fb950", "failure": "#f85149"} + BACKUP_COLOURS = {"ok": "#3fb950", "warning": "#d29922", "error": "#f85149", "unknown": "#8b949e"} service_cards = "" for s in services: @@ -111,6 +175,25 @@ def render_html(services: list, agents: list) -> str: {a['result']} """ + backup_rows = "" + if backups: + for b in backups: + colour = BACKUP_COLOURS.get(b["status"], "#8b949e") + age = f"{b['age_days']}d" if b.get("age_days") is not None else "—" + size = f"{b['size_gb']} GB" if b.get("size_gb") else "—" + last = b.get("last_backup", b.get("reason", "—")) + backup_rows += f""" + + VM {b['vmid']} + {b['status'].upper()} + {last} + {age} ago  ·  {size} + """ + + backup_section = f""" +

Proxmox Backups

+ {backup_rows}
""" if backups else "" + return f""" @@ -118,37 +201,60 @@ def render_html(services: list, agents: list) -> str: Varys — Status + + + +

Varys — Status

Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.

-

Updated  ·  ← home

+

Updated

Services

{service_cards}

Agents

{agent_rows}
+ {backup_section}
+
"""
@@ -185,9 +291,14 @@ def main():
         logger.info(f"agent {agent['name']}: {result['status']}")
         agents.append(result)
 
+    # Check Proxmox backups (vmids are normalised to str so they match the JSON state keys)
+    backup_vmids = [str(v) for v in config.get("proxmox_backup_vmids") or []]
+    backups = check_proxmox_backups(backup_vmids) if backup_vmids else []
+
     prev_states = _load_prev_states()
     prev_services = prev_states.get("services", {})
     prev_agents = prev_states.get("agents", {})
+    prev_backups = prev_states.get("backups", {})
 
     for s in services:
         prev = prev_services.get(s["name"])
@@ -204,24 +315,58 @@ def main():
         elif prev == "failure" and a["status"] == "success":
             _notify_raven(f"{a['name']} recovered (success)", "info")
 
+    prev_backup_ctimes = prev_states.get("backup_ctimes", {})
+
+    for b in backups:
+        vid = b["vmid"]
+        prev_status = prev_backups.get(vid)
+        prev_ctime = prev_backup_ctimes.get(vid, 0)
+        curr_ctime = b.get("ctime", 0)
+
+        # New backup detected — ctime is newer than last recorded
+        if curr_ctime and curr_ctime > prev_ctime:
+            _notify_raven(
+                f"✅ Proxmox backup completed — VM {vid} ({b.get('size_gb', '?')} GB) at {b.get('last_backup', '?')}",
+                "info",
+            )
+
+        # Backup gone overdue or missing (notify only on the transition, not on every run)
+        if b["status"] in ("warning", "error") and prev_status not in ("warning", "error"):
+            age = f"{b['age_days']}d" if b.get("age_days") is not None else "unknown age"
+            reason = b.get("reason") or f"last backup {age} ago"
+            _notify_raven(f"⚠️ Proxmox backup VM {vid}: {b['status'].upper()} — {reason}", "critical")
+        elif prev_status in ("warning", "error") and b["status"] == "ok":
+            _notify_raven(f"✅ Proxmox backup VM {vid}: recovered (last backup {b.get('age_days', 0)}d ago)", "info")
+
     _save_states(services, agents)
+    # Save backup states and ctimes (extend existing format); keep the last known ctime when this run produced none
+    state_path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
+    if state_path.exists():
+        try:
+            existing = json.loads(state_path.read_text())
+        except Exception:
+            existing = {}
+        existing["backups"] = {b["vmid"]: b["status"] for b in backups}
+        existing["backup_ctimes"] = {b["vmid"]: b.get("ctime") or prev_backup_ctimes.get(b["vmid"], 0) for b in backups}
+        state_path.write_text(json.dumps(existing))
 
     out_dir = SITES_DIR / "varys"
     out_dir.mkdir(parents=True, exist_ok=True)
-    (out_dir / "index.html").write_text(render_html(services, agents))
+    (out_dir / "index.html").write_text(render_html(services, agents, backups))
     (out_dir / "last-output.md").write_text(render_summary_md(services, agents))
 
     down_count = sum(1 for s in services if s["status"] != "up")
+    backup_issues = sum(1 for b in backups if b["status"] != "ok")
     status = {
         "agent": "varys-monitor",
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "status": "success",
-        "result": f"{len(services)} services checked, {down_count} not up",
+        "result": f"{len(services)} services checked, {down_count} not up; {len(backups)} backups checked, {backup_issues} issues",
     }
     log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
     log_dir.mkdir(parents=True, exist_ok=True)
     (log_dir / "last-run.json").write_text(json.dumps(status, indent=2))
-    logger.info(f"done — {len(services)} services, {down_count} not up")
+    logger.info(f"done — {len(services)} services, {down_count} not up; {len(backups)} backups, {backup_issues} issues")
 
 
 if __name__ == "__main__":
diff --git a/run.sh b/run.sh
index 1a333bb..435009a 100755
--- a/run.sh
+++ b/run.sh
@@ -10,6 +10,7 @@ docker run --rm \
   --network proxy \
   --volume /opt/sites:/opt/sites \
   --volume /opt/agent-os:/opt/agent-os \
+  --volume /home/nxm/.proxmox-keys:/etc/proxmox-keys:ro \
   --env SITES_DIR=/opt/sites \
   --env AGENT_OS_DIR=/opt/agent-os \
   --env RAVEN_URL=http://raven-notify:8400 \