feat: add raven stub, state tracking, sam-research + searxng monitoring
- RAVEN_URL env var: sends down/recovery alerts when Raven is live (silent no-op until then)
- service-states.json persists the previous state for change detection
- config.yaml: adds sam-research + searxng services; adds sam/citadel/varys agents

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,10 @@ services:
|
||||
url: http://hodor-gateway:8200/health
|
||||
- name: citadel-mcp
|
||||
url: http://citadel-mcp:8300/sse
|
||||
- name: sam-research
|
||||
url: http://sam-research:8500/health
|
||||
- name: searxng
|
||||
url: http://searxng:8080
|
||||
|
||||
# Core services
|
||||
- name: open-webui
|
||||
@@ -34,3 +38,6 @@ services:
|
||||
# One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json).
|
||||
agents:
|
||||
- name: bran-changelog
|
||||
- name: sam-research
|
||||
- name: citadel-mcp
|
||||
- name: varys-monitor
|
||||
|
||||
@@ -13,9 +13,36 @@ logger = logging.getLogger("varys")
|
||||
|
||||
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
|
||||
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
|
||||
RAVEN_URL = os.getenv("RAVEN_URL", "")
|
||||
CONFIG_FILE = Path(__file__).parent / "config.yaml"
|
||||
|
||||
|
||||
def _load_prev_states() -> dict:
    """Load the service statuses persisted by the previous run.

    Reads ``service-states.json`` from ``AGENT_OS_DIR/logs/varys-monitor``.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not hold a JSON object.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    try:
        states = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # Nothing has been persisted yet (e.g. first run); not worth a warning.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("could not load previous service states from %s: %s", path, e)
        return {}
    if not isinstance(states, dict):
        # The result is queried with .get(); any other JSON type would raise there.
        logger.warning("ignoring previous service states in %s: not a JSON object", path)
        return {}
    return states
|
||||
|
||||
|
||||
def _save_states(services: list) -> None:
    """Persist each service's current status as a name -> status JSON object.

    The file is written to ``service-states.json`` under
    ``AGENT_OS_DIR/logs/varys-monitor``; the directory is created if needed.

    Args:
        services: Check results; every item must provide ``"name"`` and
            ``"status"`` keys.

    Raises:
        KeyError: If an item lacks ``"name"`` or ``"status"``.
        OSError: If the directory or file cannot be written.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({s["name"]: s["status"] for s in services})
    # Write to a sibling temp file and swap it in, so an interrupted write can
    # never leave a truncated state file behind for the next run to read.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(path)
|
||||
|
||||
|
||||
def _notify_raven(message: str, severity: str) -> None:
    """Send a best-effort alert to Raven; do nothing when RAVEN_URL is unset.

    Posts ``{"message", "severity", "source": "varys"}`` as JSON to
    ``<RAVEN_URL>/notify`` with a 5 second timeout. Failures are logged as
    warnings and never raised.

    Args:
        message: Alert text to deliver.
        severity: Severity label forwarded unchanged to Raven.
    """
    if not RAVEN_URL:
        return
    # Tolerate a trailing slash in the configured base URL (avoids "//notify").
    url = RAVEN_URL.rstrip("/") + "/notify"
    payload = {"message": message, "severity": severity, "source": "varys"}
    try:
        response = httpx.post(url, json=payload, timeout=5)
        # An HTTP error response means the alert was not accepted; without this
        # check a failing Raven would still be logged as "notified".
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: alerting is best-effort and must not abort the run.
        logger.warning("raven notify failed (raven not live yet?): %s", e)
    else:
        logger.info("raven notified: %s", message)
|
||||
|
||||
|
||||
def check_service(name: str, url: str) -> dict:
|
||||
start = time.monotonic()
|
||||
try:
|
||||
@@ -151,6 +178,15 @@ def main():
|
||||
logger.info(f"agent {agent['name']}: {result['status']}")
|
||||
agents.append(result)
|
||||
|
||||
prev_states = _load_prev_states()
|
||||
for s in services:
|
||||
prev = prev_states.get(s["name"])
|
||||
if prev and prev != "down" and s["status"] == "down":
|
||||
_notify_raven(f"{s['name']} is DOWN", "critical")
|
||||
elif prev == "down" and s["status"] not in ("down", "degraded"):
|
||||
_notify_raven(f"{s['name']} recovered (UP)", "info")
|
||||
_save_states(services)
|
||||
|
||||
out_dir = SITES_DIR / "varys"
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
(out_dir / "index.html").write_text(render_html(services, agents))
|
||||
|
||||
Reference in New Issue
Block a user