Initial varys-monitor agent
HTTP service reachability checks for 12 services + agent watchdog. Writes status dashboard to /opt/sites/varys/ and last-run.json for Citadel. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,6 @@
|
|||||||
|
# Slim CPython 3.12 base image for the one-shot varys-monitor container.
FROM python:3.12-slim

WORKDIR /app

# Dependencies are copied and installed before the application files so this
# layer can be reused from cache when only main.py or config.yaml change.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt

# Application code and its service/agent list.
COPY main.py config.yaml ./

# Exec form: python is the container's main process.
CMD ["python", "main.py"]
|
||||||
+36
@@ -0,0 +1,36 @@
|
|||||||
|
# Services varys checks for HTTP reachability.
|
||||||
|
# Any HTTP response below 500 (including 4xx) = UP. 5xx = DEGRADED. Connection error/timeout = DOWN.
|
||||||
|
# Add/remove entries freely — no code changes needed.
|
||||||
|
|
||||||
|
services:
|
||||||
|
# Agent infrastructure (on proxy Docker network — reachable by service name)
|
||||||
|
- name: hodor-gateway
|
||||||
|
url: http://hodor-gateway:8200/health
|
||||||
|
- name: citadel-mcp
|
||||||
|
url: http://citadel-mcp:8300/sse
|
||||||
|
|
||||||
|
# Core services
|
||||||
|
- name: open-webui
|
||||||
|
url: http://172.27.40.3:3010
|
||||||
|
- name: nginx-proxy-manager
|
||||||
|
url: http://172.27.40.3:81
|
||||||
|
- name: portainer
|
||||||
|
url: https://172.27.40.3:9443
|
||||||
|
- name: gitea
|
||||||
|
url: http://172.27.40.3:3000
|
||||||
|
- name: uptime-kuma
|
||||||
|
url: http://172.27.40.3:3002
|
||||||
|
- name: headscale
|
||||||
|
url: http://172.27.40.3:8080
|
||||||
|
- name: vaultwarden
|
||||||
|
url: http://172.27.40.3:8222
|
||||||
|
- name: plane
|
||||||
|
url: http://172.27.40.3:8095
|
||||||
|
- name: homarr
|
||||||
|
url: http://172.27.40.3:7575
|
||||||
|
- name: netbox
|
||||||
|
url: http://172.27.40.3:8100
|
||||||
|
|
||||||
|
# One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json).
|
||||||
|
agents:
|
||||||
|
- name: bran-changelog
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Log lines carry a timestamp and a fixed "[varys]" tag.
logging.basicConfig(format="%(asctime)s [varys] %(message)s", level=logging.INFO)
logger = logging.getLogger("varys")

# Base directories; each can be overridden through the environment variable of
# the same name.
SITES_DIR = Path(os.environ.get("SITES_DIR", "/opt/sites"))
AGENT_OS_DIR = Path(os.environ.get("AGENT_OS_DIR", "/opt/agent-os"))

# config.yaml is looked up next to this source file.
CONFIG_FILE = Path(__file__).with_name("config.yaml")
|
||||||
|
|
||||||
|
|
||||||
|
def check_service(name: str, url: str) -> dict:
    """Probe ``url`` with one GET request and classify the outcome.

    Args:
        name: Label copied into the result.
        url: Address to request.

    Returns:
        A dict with keys ``name``, ``status``, ``code`` and ``ms``.
        ``status`` is ``"up"`` for any response code below 500,
        ``"degraded"`` for 500 and above, and ``"down"`` when the request
        raised (in which case ``code`` is ``None``). ``ms`` is the elapsed
        time in whole milliseconds.
    """
    began = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - began) * 1000)

    try:
        # Streaming returns as soon as the status line and headers arrive, so
        # endpoints that never finish their body (e.g. SSE) do not block.
        # verify=False: TLS certificates are not validated for these probes.
        with httpx.stream(
            "GET", url, timeout=5, verify=False, follow_redirects=True
        ) as response:
            latency = elapsed_ms()
            code = response.status_code
            if code < 500:
                verdict = "up"
            else:
                verdict = "degraded"
            return {"name": name, "status": verdict, "code": code, "ms": latency}
    except Exception:
        # Best effort by design: any failure to obtain a response means "down".
        return {"name": name, "status": "down", "code": None, "ms": elapsed_ms()}
|
||||||
|
|
||||||
|
|
||||||
|
def read_agent_status(name: str) -> dict:
    """Load the last-run record of a one-shot agent.

    Reads ``AGENT_OS_DIR/logs/<name>/last-run.json`` and normalises it so the
    renderers can rely on string fields.

    Args:
        name: Agent name; also the directory name under ``logs``.

    Returns:
        A dict with keys ``name``, ``status``, ``timestamp`` and ``result``:

        * file missing: ``status="unknown"``, ``timestamp=None``,
          ``result="no data"``;
        * file unreadable, not valid JSON, or not a JSON object:
          ``status="error"``, ``timestamp=None``, ``result`` holds the reason;
        * otherwise the file's values, each coerced to ``str``. A missing or
          null ``status`` becomes ``"unknown"``; a missing or empty
          ``timestamp`` becomes ``""``; a missing or null ``result``
          becomes ``""``.
    """
    path = AGENT_OS_DIR / "logs" / name / "last-run.json"
    if not path.exists():
        return {"name": name, "status": "unknown", "timestamp": None, "result": "no data"}

    try:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        return {"name": name, "status": "error", "timestamp": None, "result": str(e)}

    if not isinstance(data, dict):
        return {
            "name": name,
            "status": "error",
            "timestamp": None,
            "result": f"last-run.json is not a JSON object (got {type(data).__name__})",
        }

    # The file is written by other programs; coerce every field to text so
    # later .upper() / slicing in the renderers cannot fail on odd types.
    status = data.get("status")
    timestamp = data.get("timestamp")
    result = data.get("result")
    return {
        "name": name,
        "status": str(status) if status else "unknown",
        "timestamp": str(timestamp) if timestamp else "",
        "result": "" if result is None else str(result),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def render_html(services: list, agents: list) -> str:
    """Render the status dashboard as a single self-contained HTML page.

    Args:
        services: Dicts with at least ``name``, ``status`` and ``ms`` keys.
        agents: Dicts with ``name``, ``status``, ``timestamp`` and ``result``
            keys. A falsy ``timestamp`` is shown as a dash.

    Returns:
        The complete HTML document as a string.

    All dynamic text is HTML-escaped before interpolation, because agent
    results may contain exception messages or arbitrary file content. The
    "Updated" stamp is UTC and labelled as such.
    """
    # Function-scope import: the file's top-level imports do not include html.
    from html import escape

    def text(value) -> str:
        # None renders as empty; anything else is stringified, then escaped.
        return escape("" if value is None else str(value))

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    SERVICE_COLOURS = {"up": "#3fb950", "degraded": "#d29922", "down": "#f85149"}
    AGENT_COLOURS = {"success": "#3fb950", "failure": "#f85149"}
    NEUTRAL = "#8b949e"  # fallback for any status without a dedicated colour

    cards = []
    for s in services:
        status = str(s["status"])
        colour = SERVICE_COLOURS.get(status, NEUTRAL)
        # A latency figure is meaningless when no response was received.
        ms_text = f"{text(s['ms'])} ms" if status != "down" else "—"
        cards.append(f"""
  <div class="svc">
    <span class="dot" style="background:{colour}"></span>
    <span class="svc-name">{text(s['name'])}</span>
    <span class="svc-label" style="color:{colour}">{text(status.upper())}</span>
    <span class="svc-ms">{ms_text}</span>
  </div>""")
    service_cards = "".join(cards)

    rows = []
    for a in agents:
        status = str(a["status"])
        colour = AGENT_COLOURS.get(status, NEUTRAL)
        if a["timestamp"]:
            # Keep "YYYY-MM-DDTHH:MM" of the timestamp and show it with a space.
            ts = text(str(a["timestamp"])[:16].replace("T", " ") + " UTC")
        else:
            ts = "—"
        rows.append(f"""
    <tr>
      <td><span class="dot" style="background:{colour}"></span>{text(a['name'])}</td>
      <td style="color:{colour}">{text(status.upper())}</td>
      <td>{ts}</td>
      <td class="dim">{text(a['result'])}</td>
    </tr>""")
    agent_rows = "".join(rows)

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Varys — Status</title>
  <style>
    * {{ box-sizing: border-box; margin: 0; padding: 0; }}
    body {{ font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 2rem; }}
    h1 {{ color: #58a6ff; margin-bottom: 0.25rem; }}
    h2 {{ color: #8b949e; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; margin: 2rem 0 0.75rem; }}
    .meta {{ color: #8b949e; font-size: 0.85rem; margin-bottom: 2rem; }}
    .meta a {{ color: #8b949e; }}
    .dot {{ display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: 6px; vertical-align: middle; }}
    .svc {{ display: flex; align-items: center; gap: 0.75rem; border: 1px solid #30363d; border-radius: 6px; padding: 0.6rem 1rem; margin-bottom: 0.5rem; }}
    .svc-name {{ flex: 1; }}
    .svc-label {{ font-size: 0.8rem; min-width: 5rem; }}
    .svc-ms {{ color: #8b949e; font-size: 0.8rem; min-width: 4rem; text-align: right; }}
    table {{ width: 100%; border-collapse: collapse; }}
    td {{ padding: 0.6rem 1rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
    .dim {{ color: #8b949e; }}
  </style>
</head>
<body>
  <h1>Varys — Status</h1>
  <p class="meta">Updated {now} · <a href="/">← home</a></p>
  <h2>Services</h2>
  {service_cards}
  <h2>Agents</h2>
  <table><tbody>{agent_rows}</tbody></table>
</body>
</html>"""
|
||||||
|
|
||||||
|
|
||||||
|
def render_summary_md(services: list, agents: list) -> str:
    """Build a short Markdown summary of service and agent health.

    Args:
        services: Dicts with at least ``name`` and ``status`` keys.
        agents: Dicts with ``name``, ``status``, ``timestamp`` and ``result``
            keys. A falsy ``timestamp`` is reported as "no data".

    Returns:
        Markdown text: a title with the current UTC time, an "up" count, an
        optional "Issues" line naming every down or degraded service, and
        one bullet per agent.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    healthy = len([svc for svc in services if svc["status"] == "up"])
    troubled = [svc["name"] for svc in services if svc["status"] in ("down", "degraded")]

    out = [
        f"# Varys — Status Summary\n\nUpdated: {stamp}\n",
        f"## Services: {healthy}/{len(services)} up",
    ]
    if troubled:
        out.append(f"Issues: {', '.join(troubled)}")
    out.extend(["", "## Agents"])

    for a in agents:
        if a["timestamp"]:
            # Keep "YYYY-MM-DDTHH:MM" and show it with a space separator.
            ts = a["timestamp"][:16].replace("T", " ") + " UTC"
        else:
            ts = "no data"
        out.append(f"- {a['name']}: {a['status']} ({ts}) — {a['result']}")

    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run one monitoring pass and write the outputs.

    Steps:
        1. Load ``CONFIG_FILE``.
        2. Probe every entry under ``services`` with ``check_service``.
        3. Read the last-run record of every entry under ``agents``.
        4. Write ``index.html`` and ``last-output.md`` to ``SITES_DIR/varys``.
        5. Write this agent's own ``last-run.json`` to
           ``AGENT_OS_DIR/logs/varys-monitor``.

    An empty config file, or a ``services:`` / ``agents:`` key left with no
    entries (which YAML parses as null), is treated as an empty list.
    """
    # safe_load returns None for an empty document.
    config = yaml.safe_load(CONFIG_FILE.read_text(encoding="utf-8")) or {}

    services = []
    # "or []": a key that is present but has no entries yields None, not [].
    for svc in config.get("services") or []:
        result = check_service(svc["name"], svc["url"])
        logger.info("%s: %s (%s ms)", svc["name"], result["status"], result["ms"])
        services.append(result)

    agents = []
    for agent in config.get("agents") or []:
        result = read_agent_status(agent["name"])
        logger.info("agent %s: %s", agent["name"], result["status"])
        agents.append(result)

    out_dir = SITES_DIR / "varys"
    out_dir.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the page declares charset UTF-8 and both outputs contain
    # non-ASCII characters, so the locale default must not be relied on.
    (out_dir / "index.html").write_text(render_html(services, agents), encoding="utf-8")
    (out_dir / "last-output.md").write_text(
        render_summary_md(services, agents), encoding="utf-8"
    )

    # "not up" counts degraded as well as down services.
    down_count = sum(1 for s in services if s["status"] != "up")
    # "success" records that this pass completed, regardless of how many
    # services were found unhealthy.
    status = {
        "agent": "varys-monitor",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": "success",
        "result": f"{len(services)} services checked, {down_count} not up",
    }
    log_dir = AGENT_OS_DIR / "logs" / "varys-monitor"
    log_dir.mkdir(parents=True, exist_ok=True)
    (log_dir / "last-run.json").write_text(json.dumps(status, indent=2), encoding="utf-8")
    logger.info("done — %s services, %s not up", len(services), down_count)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
httpx
|
||||||
|
pyyaml
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env bash
# One-shot launcher for varys-monitor: rebuilds the image, runs every service
# and agent check once, then exits.
# Cron example (every 15 minutes):
#   */15 * * * * /opt/stacks/varys-monitor/run.sh >> /opt/agent-os/logs/varys.log 2>&1

# Abort on the first failing command so a broken build never runs a stale check.
set -e

# Absolute directory containing this script; used as the Docker build context.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

docker build --quiet --tag varys-monitor "$SCRIPT_DIR"

# The container joins the "proxy" network and gets both host directories
# mounted at the same paths the environment variables point to.
docker run --rm \
  --network proxy \
  -v /opt/sites:/opt/sites \
  -v /opt/agent-os:/opt/agent-os \
  -e SITES_DIR=/opt/sites \
  -e AGENT_OS_DIR=/opt/agent-os \
  varys-monitor
|
||||||
Reference in New Issue
Block a user