feat: add raven stub, state tracking, sam-research + searxng monitoring
- RAVEN_URL env var: sends down/recovery alerts when Raven is live (silent no-op until then)
- service-states.json persists prev state for change detection
- config.yaml: adds sam-research + searxng services, adds sam/citadel/varys agents

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,10 @@ services:
|
|||||||
url: http://hodor-gateway:8200/health
|
url: http://hodor-gateway:8200/health
|
||||||
- name: citadel-mcp
|
- name: citadel-mcp
|
||||||
url: http://citadel-mcp:8300/sse
|
url: http://citadel-mcp:8300/sse
|
||||||
|
- name: sam-research
|
||||||
|
url: http://sam-research:8500/health
|
||||||
|
- name: searxng
|
||||||
|
url: http://searxng:8080
|
||||||
|
|
||||||
# Core services
|
# Core services
|
||||||
- name: open-webui
|
- name: open-webui
|
||||||
@@ -34,3 +38,6 @@ services:
|
|||||||
# One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json).
|
# One-shot agents to watchdog (reads /opt/agent-os/logs/<name>/last-run.json).
|
||||||
agents:
|
agents:
|
||||||
- name: bran-changelog
|
- name: bran-changelog
|
||||||
|
- name: sam-research
|
||||||
|
- name: citadel-mcp
|
||||||
|
- name: varys-monitor
|
||||||
|
|||||||
@@ -13,9 +13,36 @@ logger = logging.getLogger("varys")
|
|||||||
|
|
||||||
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
|
SITES_DIR = Path(os.getenv("SITES_DIR", "/opt/sites"))
|
||||||
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
|
AGENT_OS_DIR = Path(os.getenv("AGENT_OS_DIR", "/opt/agent-os"))
|
||||||
|
RAVEN_URL = os.getenv("RAVEN_URL", "")
|
||||||
CONFIG_FILE = Path(__file__).parent / "config.yaml"
|
CONFIG_FILE = Path(__file__).parent / "config.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_prev_states() -> dict:
    """Return the service statuses persisted by the previous run.

    Reads ``service-states.json`` from ``AGENT_OS_DIR/logs/varys-monitor``.
    The lookup is best-effort: an empty dict is returned when the file is
    missing, unreadable, not valid JSON, or does not hold a JSON object, so
    callers can always call ``.get()`` on the result.

    Returns:
        The decoded JSON object, or ``{}`` when no usable state is available.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    try:
        states = json.loads(path.read_text())
    except FileNotFoundError:
        # Nothing has been saved yet (e.g. first run); not worth a warning.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both JSON decode errors and undecodable bytes.
        logger.warning(f"could not read previous service states from {path}: {e}")
        return {}
    if not isinstance(states, dict):
        # Valid JSON of the wrong shape (list, string, ...) would break .get().
        logger.warning(f"ignoring previous service states in {path}: not a JSON object")
        return {}
    return states
|
||||||
|
|
||||||
|
|
||||||
|
def _save_states(services: list):
    """Persist the current status of every service for the next run.

    Writes a JSON object mapping each service's ``"name"`` to its
    ``"status"`` into ``service-states.json`` under
    ``AGENT_OS_DIR/logs/varys-monitor``, creating the directory if needed.

    Args:
        services: Dicts that each carry ``"name"`` and ``"status"`` keys.
    """
    path = AGENT_OS_DIR / "logs" / "varys-monitor" / "service-states.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({s["name"]: s["status"] for s in services})
    # Write to a sibling temp file and rename it over the target, so an
    # interrupted write can never leave a truncated state file behind.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload)
    tmp_path.replace(path)
|
||||||
|
|
||||||
|
|
||||||
|
def _notify_raven(message: str, severity: str):
    """Send an alert to Raven, if a Raven endpoint is configured.

    Does nothing when ``RAVEN_URL`` is empty. Otherwise POSTs a JSON payload
    with ``message``, ``severity`` and ``source="varys"`` to
    ``<RAVEN_URL>/notify``. Delivery is best-effort: any failure, including
    a 4xx/5xx response, is logged as a warning and never raised.

    Args:
        message: Human-readable alert text.
        severity: Severity label forwarded to Raven unchanged.
    """
    if not RAVEN_URL:
        return
    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{RAVEN_URL.rstrip('/')}/notify"
    payload = {"message": message, "severity": severity, "source": "varys"}
    try:
        response = httpx.post(endpoint, json=payload, timeout=5)
        # Without this an error response would be reported as a success.
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: alerting must never take the monitor down.
        logger.warning(f"raven notify failed (raven not live yet?): {e}")
    else:
        logger.info(f"raven notified: {message}")
|
||||||
|
|
||||||
|
|
||||||
def check_service(name: str, url: str) -> dict:
|
def check_service(name: str, url: str) -> dict:
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
try:
|
try:
|
||||||
@@ -151,6 +178,15 @@ def main():
|
|||||||
logger.info(f"agent {agent['name']}: {result['status']}")
|
logger.info(f"agent {agent['name']}: {result['status']}")
|
||||||
agents.append(result)
|
agents.append(result)
|
||||||
|
|
||||||
|
prev_states = _load_prev_states()
|
||||||
|
for s in services:
|
||||||
|
prev = prev_states.get(s["name"])
|
||||||
|
if prev and prev != "down" and s["status"] == "down":
|
||||||
|
_notify_raven(f"{s['name']} is DOWN", "critical")
|
||||||
|
elif prev == "down" and s["status"] not in ("down", "degraded"):
|
||||||
|
_notify_raven(f"{s['name']} recovered (UP)", "info")
|
||||||
|
_save_states(services)
|
||||||
|
|
||||||
out_dir = SITES_DIR / "varys"
|
out_dir = SITES_DIR / "varys"
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
(out_dir / "index.html").write_text(render_html(services, agents))
|
(out_dir / "index.html").write_text(render_html(services, agents))
|
||||||
|
|||||||
@@ -12,4 +12,5 @@ docker run --rm \
|
|||||||
--volume /opt/agent-os:/opt/agent-os \
|
--volume /opt/agent-os:/opt/agent-os \
|
||||||
--env SITES_DIR=/opt/sites \
|
--env SITES_DIR=/opt/sites \
|
||||||
--env AGENT_OS_DIR=/opt/agent-os \
|
--env AGENT_OS_DIR=/opt/agent-os \
|
||||||
|
--env RAVEN_URL= \
|
||||||
varys-monitor
|
varys-monitor
|
||||||
|
|||||||
Reference in New Issue
Block a user