diff --git a/AGENT_README.md b/AGENT_README.md
index 9a13da4f7..095d047a5 100644
--- a/AGENT_README.md
+++ b/AGENT_README.md
@@ -111,6 +111,16 @@ Response to User
 ---
+## Latency Notes
+
+Single-tool queries average 5–10 seconds due to Claude Sonnet response generation time.
+The classify step (keyword matching) adds <10ms. Tool execution adds 50–200ms. The majority
+of latency is LLM synthesis. Streaming responses (`/chat/steps`, `/chat/stream`) are
+implemented to improve perceived performance. A startup warmup pre-establishes the LLM
+connection to reduce cold-start latency on the first request.
+
+---
+
 
 ## Verification Strategy
 
 ### 3 Verification Systems Implemented
 
diff --git a/main.py b/main.py
index 1fe7db3f7..e937d3ad0 100644
--- a/main.py
+++ b/main.py
@@ -91,6 +91,28 @@ app.add_middleware(
 )
 
 
+@app.on_event("startup")
+async def warmup():
+    """Pre-warm the LLM connection on startup to reduce first-request latency.
+
+    Best-effort: any failure is logged and ignored so startup never aborts.
+    """
+    try:
+        import anthropic
+        # Use the async client — a synchronous API call here would block the
+        # event loop for the whole multi-second LLM round trip at startup.
+        async with anthropic.AsyncAnthropic() as client:
+            await client.messages.create(
+                model="claude-sonnet-4-20250514",
+                max_tokens=10,
+                messages=[{"role": "user", "content": "hi"}],
+            )
+        logger.info("LLM connection warmed up")
+    except Exception as e:
+        # Warmup is purely an optimization; never let it crash the app.
+        logger.warning(f"Warmup failed: {e}")
+
+
 graph = build_graph()
 
 feedback_log: list[dict] = []