Browse Source

feat: add eval history storage with regression detection — saves every run to JSON, flags when pass rate drops

Made-with: Cursor
pull/6453/head
Priyanka Punukollu 1 month ago
parent
commit
7a76750cdd
  1. 22
      agent/evals/results/eval_history.json
  2. 10
      agent/evals/results/latest_run.json
  3. 107
      agent/evals/save_eval_results.py

22
agent/evals/results/eval_history.json

@ -0,0 +1,22 @@
[
{
"timestamp": "2026-02-28T14:47:17.398436Z",
"passed": 0,
"failed": 0,
"errors": 0,
"total": 0,
"pass_rate_pct": 0,
"status": "FAIL",
"regression": false
},
{
"timestamp": "2026-02-28T14:48:25.385529Z",
"passed": 182,
"failed": 0,
"errors": 0,
"total": 182,
"pass_rate_pct": 100.0,
"status": "PASS",
"regression": false
}
]

10
agent/evals/results/latest_run.json

@ -0,0 +1,10 @@
{
"timestamp": "2026-02-28T14:48:25.385529Z",
"passed": 182,
"failed": 0,
"errors": 0,
"total": 182,
"pass_rate_pct": 100.0,
"status": "PASS",
"regression": false
}

107
agent/evals/save_eval_results.py

@ -0,0 +1,107 @@
"""
Runs the eval suite and saves results to a JSON history file for regression tracking.
"""
import json
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
def run_and_save_evals():
"""Runs the eval suite and saves results to a JSON history file for regression tracking."""
results_dir = Path(__file__).parent / "results"
results_dir.mkdir(exist_ok=True)
# Run pytest from project root (parent of agent)
project_root = Path(__file__).parent.parent.parent
result = subprocess.run(
[
sys.executable,
"-m",
"pytest",
"agent/evals",
"-v",
"--tb=short",
"-q",
],
capture_output=True,
text=True,
cwd=project_root,
)
# Parse results from pytest output
output = result.stdout + result.stderr
lines = output.split("\n")
passed = sum(1 for l in lines if " PASSED" in l)
failed = sum(1 for l in lines if " FAILED" in l)
errors = sum(1 for l in lines if " ERROR" in l)
# Fallback: parse summary line like "182 passed, 1 warning in 30.32s"
if passed == 0 and failed == 0 and "passed" in output.lower():
m = re.search(r"(\d+)\s+passed", output)
if m:
passed = int(m.group(1))
m = re.search(r"(\d+)\s+failed", output)
if m:
failed = int(m.group(1))
m = re.search(r"(\d+)\s+error", output, re.I)
if m:
errors = int(m.group(1))
total = passed + failed + errors
pass_rate = round(passed / total * 100, 1) if total > 0 else 0
run_record = {
"timestamp": datetime.utcnow().isoformat() + "Z",
"passed": passed,
"failed": failed,
"errors": errors,
"total": total,
"pass_rate_pct": pass_rate,
"status": "PASS" if pass_rate >= 80 else "FAIL",
"regression": False,
}
# Load history
history_file = results_dir / "eval_history.json"
history = []
if history_file.exists():
try:
history = json.loads(history_file.read_text())
except Exception:
history = []
# Check for regression
if history:
last = history[-1]
if pass_rate < last.get("pass_rate_pct", 100):
run_record["regression"] = True
run_record["regression_detail"] = (
f"Pass rate dropped from "
f"{last['pass_rate_pct']}% to {pass_rate}%"
)
history.append(run_record)
history_file.write_text(json.dumps(history, indent=2))
# Also save latest run separately
latest_file = results_dir / "latest_run.json"
latest_file.write_text(json.dumps(run_record, indent=2))
print(f"\n{'='*50}")
print(f"EVAL RUN: {run_record['timestamp']}")
print(f"Passed: {passed}/{total} ({pass_rate}%)")
print(f"Status: {run_record['status']}")
if run_record.get("regression"):
print(f"⚠️ REGRESSION: {run_record['regression_detail']}")
print(f"History: {len(history)} runs saved")
print(f"{'='*50}\n")
return run_record
if __name__ == "__main__":
run_and_save_evals()
Loading…
Cancel
Save