feat: add raven stub, state tracking, sam-research + searxng monitoring

- RAVEN_URL env var: sends down/recovery alerts when Raven is live (silent no-op until then)
- service-states.json persists prev state for change detection
- config.yaml: adds sam-research + searxng services, adds sam/citadel/varys agents

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
nxm
2026-05-06 10:54:43 +02:00
parent 01ced45815
commit ffd907cd80
3 changed files with 44 additions and 0 deletions
+7
View File
@@ -8,6 +8,10 @@ services:
url: http://hodor-gateway:8200/health url: http://hodor-gateway:8200/health
- name: citadel-mcp - name: citadel-mcp
url: http://citadel-mcp:8300/sse url: http://citadel-mcp:8300/sse
- name: sam-research
url: http://sam-research:8500/health
- name: searxng
url: http://searxng:8080
# Core services # Core services
- name: open-webui - name: open-webui
@@ -34,3 +38,6 @@ services:
# One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json). # One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json).
agents: agents:
- name: bran-changelog - name: bran-changelog
- name: sam-research
- name: citadel-mcp
- name: varys-monitor
+36
View File
@@ -13,9 +13,36 @@ logger = logging.getLogger("varys")
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites")) SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os")) AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
RAVEN_URL = os.getenv("RAVEN_URL", "")
CONFIG_FILE = Path(__file__).parent / "config.yaml" CONFIG_FILE = Path(__file__).parent / "config.yaml"
def _load_prev_states() -> dict:
    """Return the service states persisted by the previous run.

    Reads ``service-states.json`` under ``AGENT_OS_DIR/logs/varys-monitor``.
    This is a best-effort read: a missing, unreadable, or malformed file
    yields an empty dict instead of raising.

    Returns:
        The decoded JSON object, or ``{}`` when no usable state exists.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    try:
        states = json.loads(path.read_text())
    except FileNotFoundError:
        # No state file yet: nothing to compare against.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        logger.warning(f"could not read previous service states from {path}: {e}")
        return {}
    if not isinstance(states, dict):
        # Valid JSON that is not an object (list, string, number) cannot be
        # queried with .get() by the caller, so treat it as absent state.
        logger.warning(f"ignoring previous service states in {path}: expected a JSON object")
        return {}
    return states
def _save_states(services: list) -> None:
    """Persist each service's current status to ``service-states.json``.

    The file is written under ``AGENT_OS_DIR/logs/varys-monitor`` as a JSON
    object keyed by service name; the directory is created when missing.

    Args:
        services: Service results; each item must provide ``"name"`` and
            ``"status"`` keys.

    Raises:
        KeyError: If an item lacks ``"name"`` or ``"status"``.
        OSError: If the directory or file cannot be written.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    snapshot = {s["name"]: s["status"] for s in services}
    # Write to a sibling temp file and rename it over the target, so an
    # interrupted run cannot leave a truncated state file behind.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(snapshot))
    tmp_path.replace(path)
def _notify_raven(message: str, severity: str) -> None:
    """Post an alert to Raven; do nothing when ``RAVEN_URL`` is empty.

    Delivery is best-effort: any failure, including a 4xx/5xx reply, is
    logged as a warning and never raised to the caller.

    Args:
        message: Alert text, sent as the ``message`` field.
        severity: Severity label, sent unchanged as the ``severity`` field.
    """
    if not RAVEN_URL:
        return
    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{RAVEN_URL.rstrip('/')}/notify"
    payload = {"message": message, "severity": severity, "source": "varys"}
    try:
        response = httpx.post(endpoint, json=payload, timeout=5)
        # Without this check an error reply would be logged as a delivered alert.
        response.raise_for_status()
    except Exception as e:
        logger.warning(f"raven notify failed (raven not live yet?): {e}")
    else:
        logger.info(f"raven notified: {message}")
def check_service(name: str, url: str) -> dict: def check_service(name: str, url: str) -> dict:
start = time.monotonic() start = time.monotonic()
try: try:
@@ -151,6 +178,15 @@ def main():
logger.info(f"agent {agent['name']}: {result['status']}") logger.info(f"agent {agent['name']}: {result['status']}")
agents.append(result) agents.append(result)
prev_states = _load_prev_states()
for s in services:
prev = prev_states.get(s["name"])
if prev and prev != "down" and s["status"] == "down":
_notify_raven(f"{s['name']} is DOWN", "critical")
elif prev == "down" and s["status"] not in ("down", "degraded"):
_notify_raven(f"{s['name']} recovered (UP)", "info")
_save_states(services)
out_dir = SITES_DIR / "varys" out_dir = SITES_DIR / "varys"
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "index.html").write_text(render_html(services, agents)) (out_dir / "index.html").write_text(render_html(services, agents))
+1
View File
@@ -12,4 +12,5 @@ docker run --rm \
--volume /opt/agent-os:/opt/agent-os \ --volume /opt/agent-os:/opt/agent-os \
--env SITES_DIR=/opt/sites \ --env SITES_DIR=/opt/sites \
--env AGENT_OS_DIR=/opt/agent-os \ --env AGENT_OS_DIR=/opt/agent-os \
--env RAVEN_URL= \
varys-monitor varys-monitor