From 2997119c242a2f6fdf4af389e4c03214a212a798 Mon Sep 17 00:00:00 2001 From: nxm Date: Sun, 3 May 2026 19:56:42 +0200 Subject: [PATCH] Initial varys-monitor agent HTTP service reachability checks for 12 services + agent watchdog. Writes status dashboard to /opt/sites/varys/ and last-run.json for Citadel. Co-Authored-By: Claude Sonnet 4.6 --- Dockerfile | 6 ++ config.yaml | 36 +++++++++++ main.py | 164 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + run.sh | 15 +++++ 5 files changed, 223 insertions(+) create mode 100644 Dockerfile create mode 100644 config.yaml create mode 100644 main.py create mode 100644 requirements.txt create mode 100755 run.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4ee4b25 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.12-slim +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY main.py config.yaml ./ +CMD ["python", "main.py"] diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..4e8422c --- /dev/null +++ b/config.yaml @@ -0,0 +1,36 @@ +# Services varys checks for HTTP reachability. +# Any HTTP response below 500 (including 4xx) = UP; 5xx = DEGRADED. Connection error/timeout = DOWN. +# Add/remove entries freely — no code changes needed. 
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx
import yaml

logging.basicConfig(level=logging.INFO, format="%(asctime)s [varys] %(message)s")
logger = logging.getLogger("varys")

# Both roots can be overridden through the environment (run.sh sets them
# explicitly for the container).
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
CONFIG_FILE = Path(__file__).parent / "config.yaml"


def check_service(name: str, url: str, timeout: float = 5.0) -> dict:
    """Probe *url* once and classify the service's reachability.

    Args:
        name: Label copied into the result.
        url: Address to send a single GET request to.
        timeout: Value handed to httpx as the request timeout, in seconds.

    Returns:
        A dict with the keys ``name``, ``status``, ``code`` and ``ms``:

        * ``status`` is ``"up"`` for any response with a status code below
          500 (4xx included), ``"degraded"`` for 5xx, and ``"down"`` when no
          response was obtained at all.
        * ``code`` is the HTTP status code, or ``None`` when down.
        * ``ms`` is the elapsed wall time in whole milliseconds, measured up
          to the arrival of the response headers (or up to the failure).

    The function never raises: every failure is reported as ``"down"`` and
    its cause is logged, so one bad entry cannot abort the whole run.
    """
    start = time.monotonic()
    try:
        # Streaming returns as soon as the status line and headers arrive, so
        # endpoints that never finish their body (SSE) do not block the probe.
        # NOTE(review): verify=False disables TLS certificate checks —
        # presumably because some targets use self-signed certificates; confirm.
        with httpx.stream(
            "GET", url, timeout=timeout, verify=False, follow_redirects=True
        ) as response:
            code = response.status_code
    except Exception as exc:  # deliberate: any failure at all means "down"
        elapsed_ms = int((time.monotonic() - start) * 1000)
        # Keep the reason in the log; the result dict alone cannot tell a
        # refused connection from a timeout or a malformed URL.
        logger.warning("%s: request failed: %s: %s", name, type(exc).__name__, exc)
        return {"name": name, "status": "down", "code": None, "ms": elapsed_ms}

    elapsed_ms = int((time.monotonic() - start) * 1000)
    status = "up" if code < 500 else "degraded"
    return {"name": name, "status": status, "code": code, "ms": elapsed_ms}
+def read_agent_status(name: str) -> dict: + path = AGENT_OS_DIR / "logs" / name / "last-run.json" + if not path.exists(): + return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"} + try: + data = json.loads(path.read_text()) + return { + "name": name, + "status": data.get("status", "unknown"), + "timestamp": data.get("timestamp", ""), + "result": data.get("result", ""), + } + except Exception as e: + return {"name": name, "status": "error", "timestamp": None, "result": str(e)} + + +def render_html(services: list, agents: list) -> str: + now = datetime.now().strftime("%Y-%m-%d %H:%M") + + SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"} + AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"} + + service_cards = "" + for s in services: + colour = SERVICE_COLOURS.get(s["status"], "#8b949e") + label = s["status"].upper() + ms_text = f"{s['ms']} ms" if s["status"] != "down" else "—" + service_cards += f""" +
+ + {s['name']} + {label} + {ms_text} +
""" + + agent_rows = "" + for a in agents: + colour = AGENT_COLOURS.get(a["status"], "#8b949e") + ts = a["timestamp"][:16].replace("T", " ") + " UTC" if a["timestamp"] else "—" + agent_rows += f""" + + {a['name']} + {a['status'].upper()} + {ts} + {a['result']} + """ + + return f""" + + + + + Varys — Status + + + +

Varys — Status

+

Updated {now}  ·  ← home

+

Services

+ {service_cards} +

Agents

+ {agent_rows}
+ +""" + + +def render_summary_md(services: list, agents: list) -> str: + now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + up = sum(1 for s in services if s["status"] == "up") + down = [s["name"] for s in services if s["status"] in ("down", "degraded")] + + lines = [f"# Varys — Status Summary\n\nUpdated: {now}\n"] + lines.append(f"## Services: {up}/{len(services)} up") + if down: + lines.append(f"Issues: {', '.join(down)}") + lines.append("") + lines.append("## Agents") + for a in agents: + ts = a["timestamp"][:16].replace("T", " ") + " UTC" if a["timestamp"] else "no data" + lines.append(f"- {a['name']}: {a['status']} ({ts}) — {a['result']}") + return "\n".join(lines) + + +def main(): + config = yaml.safe_load(CONFIG_FILE.read_text()) + + services = [] + for svc in config.get("services", []): + result = check_service(svc["name"], svc["url"]) + logger.info(f"{svc['name']}: {result['status']} ({result['ms']} ms)") + services.append(result) + + agents = [] + for agent in config.get("agents", []): + result = read_agent_status(agent["name"]) + logger.info(f"agent {agent['name']}: {result['status']}") + agents.append(result) + + out_dir = SITES_DIR / "varys" + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "index.html").write_text(render_html(services, agents)) + (out_dir / "last-output.md").write_text(render_summary_md(services, agents)) + + down_count = sum(1 for s in services if s["status"] != "up") + status = { + "agent": "varys-monitor", + "timestamp": datetime.now(timezone.utc).isoformat(), + "status": "success", + "result": f"{len(services)} services checked, {down_count} not up", + } + log_dir = AGENT_OS_DIR / "logs" / "varys-monitor" + log_dir.mkdir(parents=True, exist_ok=True) + (log_dir / "last-run.json").write_text(json.dumps(status, indent=2)) + logger.info(f"done — {len(services)} services, {down_count} not up") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 
0000000..f4f490c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +httpx +pyyaml diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..a006950 --- /dev/null +++ b/run.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Run varys-monitor: builds image, checks all services and agents, exits. +# Schedule via cron: */15 * * * * /opt/stacks/varys-monitor/run.sh >> /opt/agent-os/logs/varys.log 2>&1 + +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +docker build -q -t varys-monitor "$SCRIPT_DIR" +docker run --rm \ + --network proxy \ + --volume /opt/sites:/opt/sites \ + --volume /opt/agent-os:/opt/agent-os \ + --env SITES_DIR=/opt/sites \ + --env AGENT_OS_DIR=/opt/agent-os \ + varys-monitor