feat: add eval history storage with regression detection — saves every run to JSON, flags when pass rate drops

Made-with: Cursor
5 months ago · 7a76750cdd
3 changed files with 139 additions and 0 deletions
--- a/agent/evals/results/eval_history.json
+++ b/agent/evals/results/eval_history.json
@ -0,0 +1,22 @@
+[
+  {
+    "timestamp": "2026-02-28T14:47:17.398436Z",
+    "passed": 0,
+    "failed": 0,
+    "errors": 0,
+    "total": 0,
+    "pass_rate_pct": 0,
+    "status": "FAIL",
+    "regression": false
+  },
+  {
+    "timestamp": "2026-02-28T14:48:25.385529Z",
+    "passed": 182,
+    "failed": 0,
+    "errors": 0,
+    "total": 182,
+    "pass_rate_pct": 100.0,
+    "status": "PASS",
+    "regression": false
+  }
+]
--- a/agent/evals/results/latest_run.json
+++ b/agent/evals/results/latest_run.json
@ -0,0 +1,10 @@
+{
+  "timestamp": "2026-02-28T14:48:25.385529Z",
+  "passed": 182,
+  "failed": 0,
+  "errors": 0,
+  "total": 182,
+  "pass_rate_pct": 100.0,
+  "status": "PASS",
+  "regression": false
+}
--- a/agent/evals/save_eval_results.py
+++ b/agent/evals/save_eval_results.py
@ -0,0 +1,107 @@
+"""
+Runs the eval suite and saves results to a JSON history file for regression tracking.
+"""
+import json
+import re
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+
+def _parse_pytest_counts(output):
+    """Return ``(passed, failed, errors)`` parsed from pytest console output.
+
+    The final summary line (e.g. ``3 failed, 179 passed, 1 warning in 30.32s``)
+    is preferred because it is present at every verbosity level and also when
+    no test passed. Per-test ``PASSED``/``FAILED``/``ERROR`` markers are used
+    only when no summary line with counts can be found.
+    """
+    lines = output.splitlines()
+
+    # Scan from the end: the summary is the last thing pytest prints, so the
+    # last line carrying both a duration and at least one count is the summary.
+    for line in reversed(lines):
+        if not re.search(r"\bin\s+\d+(?:\.\d+)?s\b", line):
+            continue
+        counts = {
+            label.lower(): int(number)
+            for number, label in re.findall(
+                r"(\d+)\s+(passed|failed|errors?)\b", line, re.I
+            )
+        }
+        if counts:
+            return (
+                counts.get("passed", 0),
+                counts.get("failed", 0),
+                # pytest prints "1 error" but "2 errors"
+                counts.get("error", 0) + counts.get("errors", 0),
+            )
+
+    # Fallback: count the per-test result markers of verbose output.
+    passed = sum(1 for line in lines if " PASSED" in line)
+    failed = sum(1 for line in lines if " FAILED" in line)
+    errors = sum(1 for line in lines if " ERROR" in line)
+    return passed, failed, errors
+
+
+def _load_history(history_file):
+    """Return the list of earlier run records, or ``[]`` when none is usable.
+
+    Loading stays best-effort (an unreadable history never aborts the run),
+    but the problem is reported on stderr because the caller rewrites the file.
+    """
+    if not history_file.exists():
+        return []
+    try:
+        loaded = json.loads(history_file.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
+        print(
+            f"Warning: ignoring unreadable eval history {history_file}: {exc}",
+            file=sys.stderr,
+        )
+        return []
+    if not isinstance(loaded, list):
+        print(
+            f"Warning: ignoring eval history {history_file}: expected a JSON list",
+            file=sys.stderr,
+        )
+        return []
+    return loaded
+
+
+def run_and_save_evals() -> dict:
+    """Run the eval suite with pytest and record the outcome for regression tracking.
+
+    Runs ``pytest agent/evals`` from the project root, derives pass/fail/error
+    counts from its output, appends a record to ``results/eval_history.json``
+    and writes the same record to ``results/latest_run.json`` (both next to
+    this file). A summary is printed to stdout.
+
+    Returns:
+        The record of this run, with keys ``timestamp``, ``passed``,
+        ``failed``, ``errors``, ``total``, ``pass_rate_pct``, ``status``
+        (``"PASS"`` when the pass rate is at least 80, else ``"FAIL"``) and
+        ``regression``; ``regression_detail`` is added when the pass rate is
+        lower than that of the previous recorded run.
+    """
+    # Local import: the file-level import only binds ``datetime`` itself.
+    from datetime import timezone
+
+    evals_dir = Path(__file__).parent
+    results_dir = evals_dir / "results"
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run pytest from project root (parent of agent) so "agent/evals" resolves.
+    project_root = evals_dir.parent.parent
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pytest",
+            "agent/evals",
+            "-v",
+            "--tb=short",
+            "-q",
+        ],
+        capture_output=True,
+        text=True,
+        cwd=project_root,
+    )
+
+    passed, failed, errors = _parse_pytest_counts(result.stdout + result.stderr)
+    total = passed + failed + errors
+    pass_rate = round(passed / total * 100, 1) if total > 0 else 0
+
+    # Timezone-aware "now" (utcnow() is deprecated); the tzinfo is dropped again
+    # so the stored text keeps the existing "<naive ISO timestamp>Z" format.
+    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
+
+    run_record = {
+        "timestamp": now_utc.isoformat() + "Z",
+        "passed": passed,
+        "failed": failed,
+        "errors": errors,
+        "total": total,
+        "pass_rate_pct": pass_rate,
+        "status": "PASS" if pass_rate >= 80 else "FAIL",
+        "regression": False,
+    }
+
+    # Load history
+    history_file = results_dir / "eval_history.json"
+    history = _load_history(history_file)
+
+    # Check for regression against the most recent recorded run. A record
+    # without a pass rate is treated as 100%, and the same value is used in the
+    # message so a missing key cannot raise KeyError.
+    last = history[-1] if history else None
+    if isinstance(last, dict):
+        previous_rate = last.get("pass_rate_pct", 100)
+        if isinstance(previous_rate, (int, float)) and pass_rate < previous_rate:
+            run_record["regression"] = True
+            run_record["regression_detail"] = (
+                f"Pass rate dropped from "
+                f"{previous_rate}% to {pass_rate}%"
+            )
+
+    history.append(run_record)
+    history_file.write_text(json.dumps(history, indent=2), encoding="utf-8")
+
+    # Also save latest run separately
+    latest_file = results_dir / "latest_run.json"
+    latest_file.write_text(json.dumps(run_record, indent=2), encoding="utf-8")
+
+    print(f"\n{'='*50}")
+    print(f"EVAL RUN: {run_record['timestamp']}")
+    print(f"Passed:   {passed}/{total} ({pass_rate}%)")
+    print(f"Status:   {run_record['status']}")
+    if run_record.get("regression"):
+        print(f"⚠️  REGRESSION: {run_record['regression_detail']}")
+    print(f"History:  {len(history)} runs saved")
+    print(f"{'='*50}\n")
+
+    return run_record
+
+
+if __name__ == "__main__":
+    # Script entry point: run the eval suite once and record the result.
+    run_and_save_evals()