feat: monitor Proxmox backup freshness

This commit is contained in:
2026-06-23 17:18:29 +00:00
parent b29edde06d
commit 97a51a0903
3 changed files with 175 additions and 23 deletions
+10 -4
View File
@@ -8,12 +8,16 @@ services:
url: http://hodor-gateway:8200/health
- name: citadel-mcp
url: http://citadel-mcp:8300/sse
- name: jon-snow
url: http://jon-snow:8900/health
- name: sam-research
url: http://sam-research:8500/health
- name: searxng
url: http://searxng:8080
- name: raven-notify
url: http://raven-notify:8400/health
- name: tarly-backup
url: http://tarly-backup:8750/health
# Core services
- name: open-webui
@@ -36,10 +40,6 @@ services:
url: http://172.27.40.3:7575
- name: netbox
url: http://172.27.40.3:8100
- name: netbird-dashboard
url: http://netbird-dashboard:80
- name: netbird-server
url: http://netbird-server:80
- name: influxdb
url: http://influxdb:8086/health
- name: grafana
@@ -52,3 +52,9 @@ agents:
- name: citadel-mcp
- name: raven-notify
- name: varys-monitor
- name: jon-snow
- name: tarly-backup
# Proxmox VM IDs to check for backup freshness: a VM is flagged "warning" when its
# newest backup is more than 8 days old and "error" when no backup is found at all.
proxmox_backup_vmids:
- 106
+164 -19
View File
@@ -16,6 +16,27 @@ AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
RAVEN_URL = os.getenv("RAVEN_URL", "")
CONFIG_FILE = Path(__file__).parent / "config.yaml"
# Load Proxmox credentials from mounted keys file
def _load_proxmox_config() -> dict:
keyfile = Path("/etc/nxm-keys")
cfg = {}
if keyfile.exists():
for line in keyfile.read_text().splitlines():
line = line.strip()
if line and "=" in line and not line.startswith("#"):
k, v = line.split("=", 1)
cfg[k.strip()] = v.strip()
return cfg
# Parsed once at import time. When no keys file is present the loader returns
# an empty dict, so every lookup below falls back to its default.
_pve = _load_proxmox_config()
# Host/IP of the Proxmox API endpoint (queried over HTTPS on port 8006).
PROXMOX_HOST = _pve.get("PROXMOX_HOST", "172.27.40.2")
# API token identity; sent as "PVEAPIToken=<user>!<token id>=<secret>".
PROXMOX_USER = _pve.get("PROXMOX_USER", "claude@pve")
PROXMOX_TOKEN_ID = _pve.get("PROXMOX_TOKEN_ID", "claude-code")
# Empty by default: without a secret the backup check is skipped and every VM
# is reported as "unknown".
PROXMOX_TOKEN_SECRET = _pve.get("PROXMOX_TOKEN_SECRET", "")
# Storage and node whose content listing is scanned for backups.
PROXMOX_BACKUP_STORAGE = "truenas-backups"
PROXMOX_BACKUP_NODE = "proxmox"
PROXMOX_BACKUP_WARN_DAYS = 8  # days; a newest backup older than this is reported as "warning"
def _load_prev_states() -> dict:
path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
@@ -79,11 +100,54 @@ def read_agent_status(name: str) -> dict:
return {"name": name, "status": "error", "timestamp": None, "result": str(e)}
def render_html(services: list, agents: list) -> str:
def check_proxmox_backups(vmids: list[str]) -> list[dict]:
    """Report the age of the newest backup of each VM on the backup storage.

    Lists the content of ``PROXMOX_BACKUP_STORAGE`` on ``PROXMOX_BACKUP_NODE``
    with a single Proxmox API request and keeps, per VM, the backup entry with
    the highest ``ctime``.

    Args:
        vmids: VM IDs to report on. Values are compared as strings, so ints
            are accepted as well.

    Returns:
        One dict per requested VM, in input order. Every dict carries
        ``vmid`` (as passed in) and ``status``:

        * ``"unknown"`` - no API token secret is configured (``reason`` set).
        * ``"error"`` - the API request failed (``reason`` set), or the VM
          has no backup on the storage (``reason`` set, ``age_days`` None).
        * ``"ok"`` / ``"warning"`` - newest backup is at most / more than
          ``PROXMOX_BACKUP_WARN_DAYS`` days old; also carries
          ``last_backup``, ``age_days``, ``size_gb`` and ``ctime``.
    """
    if not PROXMOX_TOKEN_SECRET:
        return [{"vmid": v, "status": "unknown", "reason": "no credentials"} for v in vmids]

    auth = f"PVEAPIToken={PROXMOX_USER}!{PROXMOX_TOKEN_ID}={PROXMOX_TOKEN_SECRET}"
    url = f"https://{PROXMOX_HOST}:8006/api2/json/nodes/{PROXMOX_BACKUP_NODE}/storage/{PROXMOX_BACKUP_STORAGE}/content"
    try:
        # NOTE(review): verify=False disables TLS certificate validation, so
        # the API token is sent to an unauthenticated peer. Presumably needed
        # for a self-signed Proxmox certificate; prefer verify=<CA bundle>.
        r = httpx.get(url, headers={"Authorization": auth}, verify=False, timeout=10)
        r.raise_for_status()
        # "data" may be null as well as absent; treat both as an empty listing.
        items = r.json().get("data") or []
    except Exception as e:
        # Best-effort check: any failure is reported per VM, not raised.
        logger.warning(f"proxmox backup check failed: {e}")
        return [{"vmid": v, "status": "error", "reason": str(e)} for v in vmids]

    # Newest backup per VM ID (string key).
    latest: dict[str, dict] = {}
    for item in items:
        # A storage listing can also hold disk images, ISOs, etc.; only
        # backups count. Entries without a "content" field are kept.
        if item.get("content", "backup") != "backup":
            continue
        raw_vmid = item.get("vmid")
        if raw_vmid is None or raw_vmid == "":
            continue
        vid = str(raw_vmid)
        ctime = item.get("ctime") or 0
        if vid not in latest or ctime > latest[vid]["ctime"]:
            size_bytes = item.get("size") or 0
            latest[vid] = {"ctime": ctime, "size_gb": round(size_bytes / 1024**3, 2)}

    now = datetime.now(timezone.utc)
    results = []
    for vid in vmids:
        newest = latest.get(str(vid))
        if newest is None:
            results.append({"vmid": vid, "status": "error", "reason": "no backups found", "age_days": None})
            continue
        taken_at = datetime.fromtimestamp(newest["ctime"], tz=timezone.utc)
        age_days = (now - taken_at).total_seconds() / 86400
        status = "ok" if age_days <= PROXMOX_BACKUP_WARN_DAYS else "warning"
        results.append({
            "vmid": vid,
            "status": status,
            "last_backup": taken_at.strftime("%Y-%m-%d %H:%M UTC"),
            "age_days": round(age_days, 1),
            "size_gb": newest["size_gb"],
            "ctime": newest["ctime"],  # used to detect new backups between runs
        })
        # Separator added: status and age previously ran together ("ok3.2 days old").
        logger.info(f"backup VM {vid}: {status} — {round(age_days, 1)} days old ({newest['size_gb']} GB)")
    return results
def render_html(services: list, agents: list, backups: list | None = None) -> str:
now_iso = datetime.now(timezone.utc).isoformat()
SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"}
BACKUP_COLOURS = {"ok": "#3fb950", "warning": "#d29922", "error": "#f85149", "unknown": "#8b949e"}
service_cards = ""
for s in services:
@@ -111,6 +175,25 @@ def render_html(services: list, agents: list) -> str:
<td class="dim">{a['result']}</td>
</tr>"""
backup_rows = ""
if backups:
for b in backups:
colour = BACKUP_COLOURS.get(b["status"], "#8b949e")
age = f"{b['age_days']}d" if b.get("age_days") is not None else ""
size = f"{b['size_gb']} GB" if b.get("size_gb") else ""
last = b.get("last_backup", b.get("reason", ""))
backup_rows += f"""
<tr>
<td><span class="dot" style="background:{colour}"></span>VM {b['vmid']}</td>
<td style="color:{colour}">{b['status'].upper()}</td>
<td>{last}</td>
<td class="dim">{age} ago &nbsp;·&nbsp; {size}</td>
</tr>"""
backup_section = f"""
<h2>Proxmox Backups</h2>
<table><tbody>{backup_rows}</tbody></table>""" if backups else ""
return f"""<!DOCTYPE html>
<html lang="en">
<head>
@@ -118,37 +201,60 @@ def render_html(services: list, agents: list) -> str:
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta http-equiv="refresh" content="900">
<title>Varys — Status</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap" rel="stylesheet">
<style>
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 2rem; }}
h1 {{ color: #58a6ff; margin-bottom: 0.25rem; }}
h2 {{ color: #8b949e; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; margin: 2rem 0 0.75rem; }}
.desc {{ color: #8b949e; font-size: 0.9rem; margin: 0.4rem 0 0.25rem; }}
.meta {{ color: #8b949e; font-size: 0.85rem; margin-bottom: 2rem; }}
.meta a {{ color: #8b949e; }}
.dot {{ display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 6px; vertical-align: middle; }}
.svc {{ display: flex; align-items: center; gap: 0.75rem; border: 1px solid #30363d; border-radius: 6px; padding: 0.6rem 1rem; margin-bottom: 0.5rem; }}
.svc-name {{ flex: 1; }}
.svc-label {{ font-size: 0.8rem; min-width: 5rem; }}
.svc-ms {{ color: #8b949e; font-size: 0.8rem; min-width: 4rem; text-align: right; }}
:root {{ --bg: #0d1117; --surface: #161b22; --border: #30363d; --dim: #21262d; --text: #e6edf3; --muted: #8b949e; }}
*, *::before, *::after {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; background: var(--bg); color: var(--text); min-height: 100vh; font-size: 15px; line-height: 1.6; }}
.site-nav {{ background: var(--surface); border-bottom: 1px solid var(--border); padding: 0.7rem 2rem; display: flex; align-items: center; gap: 0.6rem; position: sticky; top: 0; z-index: 10; }}
.nav-brand {{ color: #58a6ff; font-weight: 600; font-size: 0.9rem; text-decoration: none; }}
.nav-brand:hover {{ color: var(--text); }}
.nav-sep {{ color: var(--border); }}
.nav-title {{ color: var(--text); font-weight: 500; font-size: 0.9rem; }}
.nav-right {{ margin-left: auto; }}
.nav-back {{ color: var(--muted); font-size: 0.85rem; text-decoration: none; }}
.nav-back:hover {{ color: var(--text); }}
.main {{ max-width: 1100px; margin: 0 auto; padding: 2.5rem 2rem 4rem; }}
h1 {{ font-size: 1.5rem; font-weight: 600; color: var(--text); margin-bottom: 0.2rem; }}
h2 {{ color: var(--muted); font-size: 0.7rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.1em; margin: 2rem 0 0.875rem; }}
.desc {{ color: var(--muted); font-size: 0.9rem; margin: 0.3rem 0 0.2rem; }}
.meta {{ color: var(--muted); font-size: 0.82rem; margin-top: 0.2rem; margin-bottom: 0; }}
.meta a {{ color: var(--muted); text-decoration: none; }}
.meta a:hover {{ color: var(--text); }}
.dot {{ display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 8px; vertical-align: middle; flex-shrink: 0; }}
.svc {{ display: flex; align-items: center; gap: 0.75rem; background: var(--surface); border: 1px solid var(--border); border-radius: 6px; padding: 0.65rem 1rem; margin-bottom: 0.5rem; transition: border-color 0.15s; }}
.svc:hover {{ border-color: #444c56; }}
.svc-name {{ flex: 1; font-size: 0.9rem; }}
.svc-label {{ font-size: 0.75rem; font-weight: 600; min-width: 5rem; }}
.svc-ms {{ color: var(--muted); font-size: 0.8rem; min-width: 4rem; text-align: right; font-family: ui-monospace, monospace; }}
table {{ width: 100%; border-collapse: collapse; }}
td {{ padding: 0.6rem 1rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
.dim {{ color: #8b949e; }}
td {{ padding: 0.6rem 1rem; border-bottom: 1px solid var(--dim); font-size: 0.88rem; }}
.dim {{ color: var(--muted); }}
</style>
</head>
<body>
<nav class="site-nav">
<a class="nav-brand" href="/">◈ NxM</a>
<span class="nav-sep">·</span>
<span class="nav-title">Varys — Status</span>
<span class="nav-right"><a class="nav-back" href="/">← home</a></span>
</nav>
<main class="main">
<h1>Varys — Status</h1>
<p class="desc">Checks HTTP reachability for all services in the NxM stack and monitors agent run status. Refreshes every 15 minutes.</p>
<p class="meta">Updated <span data-utc="{now_iso}"></span> &nbsp;·&nbsp; <a href="/">← home</a></p>
<p class="meta">Updated <span data-utc="{now_iso}"></span></p>
<h2>Services</h2>
{service_cards}
<h2>Agents</h2>
<table><tbody>{agent_rows}</tbody></table>
{backup_section}
<script>
document.querySelectorAll('[data-utc]').forEach(el => {{
el.textContent = new Date(el.dataset.utc).toLocaleString(undefined, {{year:'numeric',month:'2-digit',day:'2-digit',hour:'2-digit',minute:'2-digit'}});
}});
</script>
</main>
</body>
</html>"""
@@ -185,9 +291,14 @@ def main():
logger.info(f"agent {agent['name']}: {result['status']}")
agents.append(result)
# Check Proxmox backups
backup_vmids = [str(v) for v in config.get("proxmox_backup_vmids", [])]
backups = check_proxmox_backups(backup_vmids) if backup_vmids else []
prev_states = _load_prev_states()
prev_services = prev_states.get("services", {})
prev_agents = prev_states.get("agents", {})
prev_backups = prev_states.get("backups", {})
for s in services:
prev = prev_services.get(s["name"])
@@ -204,24 +315,58 @@ def main():
elif prev == "failure" and a["status"] == "success":
_notify_raven(f"{a['name']} recovered (success)", "info")
prev_backup_ctimes = prev_states.get("backup_ctimes", {})
for b in backups:
vid = b["vmid"]
prev_status = prev_backups.get(vid)
prev_ctime = prev_backup_ctimes.get(vid, 0)
curr_ctime = b.get("ctime", 0)
# New backup detected — ctime is newer than last recorded
if curr_ctime and curr_ctime > prev_ctime:
_notify_raven(
f"✅ Proxmox backup completed — VM {vid} ({b.get('size_gb', '?')} GB) at {b.get('last_backup', '?')}",
"info",
)
# Backup gone overdue or missing
if b["status"] in ("warning", "error") and prev_status not in ("warning", "error"):
age = f"{b['age_days']}d" if b.get("age_days") is not None else "unknown age"
reason = b.get("reason") or f"last backup {age} ago"
_notify_raven(f"⚠️ Proxmox backup VM {vid}: {b['status'].upper()}{reason}", "critical")
elif prev_status in ("warning", "error") and b["status"] == "ok":
_notify_raven(f"✅ Proxmox backup VM {vid}: recovered (last backup {b.get('age_days', 0)}d ago)", "info")
_save_states(services, agents)
# Save backup states and ctimes (extend existing format)
state_path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
if state_path.exists():
try:
existing = json.loads(state_path.read_text())
except Exception:
existing = {}
existing["backups"] = {b["vmid"]: b["status"] for b in backups}
existing["backup_ctimes"] = {b["vmid"]: b.get("ctime", 0) for b in backups}
state_path.write_text(json.dumps(existing))
out_dir = SITES_DIR / "varys"
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "index.html").write_text(render_html(services, agents))
(out_dir / "index.html").write_text(render_html(services, agents, backups))
(out_dir / "last-output.md").write_text(render_summary_md(services, agents))
down_count = sum(1 for s in services if s["status"] != "up")
backup_issues = sum(1 for b in backups if b["status"] != "ok")
status = {
"agent": "varys-monitor",
"timestamp": datetime.now(timezone.utc).isoformat(),
"status": "success",
"result": f"{len(services)} services checked, {down_count} not up",
"result": f"{len(services)} services checked, {down_count} not up; {len(backups)} backups checked, {backup_issues} issues",
}
log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
log_dir.mkdir(parents=True, exist_ok=True)
(log_dir / "last-run.json").write_text(json.dumps(status, indent=2))
logger.info(f"done — {len(services)} services, {down_count} not up")
logger.info(f"done — {len(services)} services, {down_count} not up; {len(backups)} backups, {backup_issues} issues")
if __name__ == "__main__":
+1
View File
@@ -10,6 +10,7 @@ docker run --rm \
--network proxy \
--volume /opt/sites:/opt/sites \
--volume /opt/agent-os:/opt/agent-os \
--volume /home/nxm/.proxmox-keys:/etc/proxmox-keys:ro \
--env SITES_DIR=/opt/sites \
--env AGENT_OS_DIR=/opt/agent-os \
--env RAVEN_URL=http://raven-notify:8400 \