feat: LLM task extraction, token tracking, direct Claude brain
- brain.py: prefers direct Anthropic API (ANTHROPIC_API_KEY) over Hermes
for all LLM calls — ~22x cheaper (122 tokens vs 5600+ Hermes overhead).
Falls back to Hermes, then Ollama, if the key is unavailable.
extract_task_fields(): a non-streaming call that returns a clean {title, project}
from any natural-language phrasing — no more regex whack-a-mole.
- token_log.py: appends every LLM call to token-usage.jsonl with intent,
in/out token counts, and USD cost. get_summary() aggregates all-time,
today, and per-intent breakdowns.
- main.py: task handler uses extract_task_fields() with regex fallback;
streaming handler captures usage from final chunk; GET /usage endpoint
returns live cost summary.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+28
-12
@@ -14,7 +14,9 @@ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .approval import cleanup_expired, generate_token, pop_action, queue_action, verify_token
|
||||
from .brain import stream_completion
|
||||
from .brain import extract_task_fields, stream_completion
|
||||
from .token_log import get_summary as get_token_summary
|
||||
from .token_log import log_usage
|
||||
from .intent import (
|
||||
classify_intent,
|
||||
extract_agent_name,
|
||||
@@ -158,24 +160,29 @@ async def _stream_text(text: str) -> AsyncGenerator[str, None]:
|
||||
yield _sse_done(chunk_id)
|
||||
|
||||
|
||||
async def _stream_llm(messages: list[dict], use_smart: bool = False) -> AsyncGenerator[str, None]:
|
||||
async def _stream_llm(
|
||||
messages: list[dict], use_smart: bool = False, intent: str = "planning"
|
||||
) -> AsyncGenerator[str, None]:
|
||||
chunk_id = f"chatcmpl-{int(time.time())}"
|
||||
collected = []
|
||||
prompt_tokens = completion_tokens = 0
|
||||
try:
|
||||
response = await stream_completion(messages, use_smart=use_smart)
|
||||
async for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.content:
|
||||
content = chunk.choices[0].delta.content
|
||||
collected.append(content)
|
||||
yield _sse_chunk(content, chunk_id)
|
||||
# Capture usage from final chunk (stream_options include_usage)
|
||||
if hasattr(chunk, "usage") and chunk.usage:
|
||||
prompt_tokens = chunk.usage.prompt_tokens or 0
|
||||
completion_tokens = chunk.usage.completion_tokens or 0
|
||||
yield _sse_done(chunk_id)
|
||||
except Exception as e:
|
||||
logger.error(f"LLM stream error: {e}")
|
||||
error_msg = f"Error reaching LLM: {e}"
|
||||
async for part in _stream_text(error_msg):
|
||||
async for part in _stream_text(f"Error reaching LLM: {e}"):
|
||||
yield part
|
||||
collected.append(error_msg)
|
||||
return
|
||||
finally:
|
||||
if prompt_tokens or completion_tokens:
|
||||
log_usage(intent, prompt_tokens, completion_tokens)
|
||||
|
||||
|
||||
# --- Routes ---
|
||||
@@ -185,6 +192,11 @@ async def health():
|
||||
return {"status": "ok", "agent": "jon-snow", "version": "0.3.0"}
|
||||
|
||||
|
||||
@app.get("/usage")
|
||||
async def usage():
|
||||
return get_token_summary()
|
||||
|
||||
|
||||
@app.get("/v1/models")
|
||||
async def list_models():
|
||||
return {
|
||||
@@ -503,9 +515,13 @@ async def chat_completions(req: ChatRequest):
|
||||
yield chunk
|
||||
|
||||
elif intent == "task":
|
||||
# Live destination match first, static keyword map as fallback
|
||||
project_hint = extract_task_destination(user_message) or extract_project_name(user_message)
|
||||
title = extract_task_title(user_message)
|
||||
# LLM extraction — handles any natural language phrasing
|
||||
fields, usage = await extract_task_fields(user_message)
|
||||
log_usage("task_extract", usage["prompt_tokens"], usage["completion_tokens"])
|
||||
|
||||
title = fields.get("title") or extract_task_title(user_message)
|
||||
project_hint = fields.get("project") or extract_project_name(user_message)
|
||||
|
||||
try:
|
||||
issue = await create_plane_issue(title, project_hint)
|
||||
response_text = (
|
||||
@@ -521,7 +537,7 @@ async def chat_completions(req: ChatRequest):
|
||||
yield chunk
|
||||
|
||||
else: # planning / general
|
||||
async for chunk in _stream_llm(messages, use_smart=True):
|
||||
async for chunk in _stream_llm(messages, use_smart=True, intent="planning"):
|
||||
yield chunk
|
||||
summary = f"Planning query: {user_message[:100]}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user