diff --git a/AGENT_README.md b/AGENT_README.md
index 9a13da4f7..095d047a5 100644
--- a/AGENT_README.md
+++ b/AGENT_README.md
@@ -111,6 +111,16 @@ Response to User
 ---
+## Latency Notes
+
+Single-tool queries average 5–10 seconds due to Claude Sonnet response generation time.
+The classify step (keyword matching) adds <10ms. Tool execution adds 50–200ms. The majority
+of latency is LLM synthesis. Streaming responses (`/chat/steps`, `/chat/stream`) are
+implemented to improve perceived performance. A startup warmup pre-establishes the LLM
+connection to reduce cold-start latency on the first request.
+
+---
+
 
 ## Verification Strategy
 
 ### 3 Verification Systems Implemented
 
diff --git a/main.py b/main.py
index 1fe7db3f7..e937d3ad0 100644
--- a/main.py
+++ b/main.py
@@ -91,6 +91,28 @@ app.add_middleware(
 )
 
 
+@app.on_event("startup")
+async def warmup():
+    """Pre-warm the LLM connection on startup to reduce first-request latency.
+
+    Best-effort: any failure is logged and ignored so startup never aborts.
+    """
+    try:
+        import anthropic
+        # Use the async client — a synchronous API call here would block the
+        # event loop for the whole multi-second LLM round trip at startup.
+        async with anthropic.AsyncAnthropic() as client:
+            await client.messages.create(
+                model="claude-sonnet-4-20250514",
+                max_tokens=10,
+                messages=[{"role": "user", "content": "hi"}],
+            )
+        logger.info("LLM connection warmed up")
+    except Exception as e:
+        # Warmup is purely an optimization; never let it crash the app.
+        logger.warning(f"Warmup failed: {e}")
+
+
 graph = build_graph()
 
 feedback_log: list[dict] = []