From 7a76750cdd7dabb295e682c05f77eced69957f21 Mon Sep 17 00:00:00 2001 From: Priyanka Punukollu Date: Sat, 28 Feb 2026 08:48:43 -0600 Subject: [PATCH] =?UTF-8?q?feat:=20add=20eval=20history=20storage=20with?= =?UTF-8?q?=20regression=20detection=20=E2=80=94=20saves=20every=20run=20t?= =?UTF-8?q?o=20JSON,=20flags=20when=20pass=20rate=20drops?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made-with: Cursor --- agent/evals/results/eval_history.json | 22 ++++++ agent/evals/results/latest_run.json | 10 +++ agent/evals/save_eval_results.py | 107 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 agent/evals/results/eval_history.json create mode 100644 agent/evals/results/latest_run.json create mode 100644 agent/evals/save_eval_results.py diff --git a/agent/evals/results/eval_history.json b/agent/evals/results/eval_history.json new file mode 100644 index 000000000..4b5de0db5 --- /dev/null +++ b/agent/evals/results/eval_history.json @@ -0,0 +1,22 @@ +[ + { + "timestamp": "2026-02-28T14:47:17.398436Z", + "passed": 0, + "failed": 0, + "errors": 0, + "total": 0, + "pass_rate_pct": 0, + "status": "FAIL", + "regression": false + }, + { + "timestamp": "2026-02-28T14:48:25.385529Z", + "passed": 182, + "failed": 0, + "errors": 0, + "total": 182, + "pass_rate_pct": 100.0, + "status": "PASS", + "regression": false + } +] \ No newline at end of file diff --git a/agent/evals/results/latest_run.json b/agent/evals/results/latest_run.json new file mode 100644 index 000000000..a0deea809 --- /dev/null +++ b/agent/evals/results/latest_run.json @@ -0,0 +1,10 @@ +{ + "timestamp": "2026-02-28T14:48:25.385529Z", + "passed": 182, + "failed": 0, + "errors": 0, + "total": 182, + "pass_rate_pct": 100.0, + "status": "PASS", + "regression": false +} \ No newline at end of file diff --git a/agent/evals/save_eval_results.py b/agent/evals/save_eval_results.py new file mode 100644 index 000000000..e3f9e47f2 --- /dev/null +++ 
"""
Runs the eval suite and saves results to a JSON history file for regression tracking.
"""
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

# A run must reach this pass rate (percent) to be reported as PASS.
PASS_THRESHOLD_PCT = 80


def _parse_counts(output):
    """Return (passed, failed, errors) parsed from combined pytest output.

    Primary source: per-test verbose lines (" PASSED" / " FAILED" / " ERROR"),
    which pytest emits under -v. Fallback: the final summary line, e.g.
    "182 passed, 1 warning in 30.32s", used when no per-test lines were found.
    """
    lines = output.split("\n")
    passed = sum(1 for line in lines if " PASSED" in line)
    failed = sum(1 for line in lines if " FAILED" in line)
    errors = sum(1 for line in lines if " ERROR" in line)

    if passed == 0 and failed == 0 and "passed" in output.lower():
        m = re.search(r"(\d+)\s+passed", output)
        if m:
            passed = int(m.group(1))
        m = re.search(r"(\d+)\s+failed", output)
        if m:
            failed = int(m.group(1))
        m = re.search(r"(\d+)\s+error", output, re.I)
        if m:
            errors = int(m.group(1))
    return passed, failed, errors


def _load_history(history_file):
    """Best-effort load of prior runs; missing or corrupt history starts fresh."""
    if history_file.exists():
        try:
            return json.loads(history_file.read_text())
        except (OSError, json.JSONDecodeError):
            pass
    return []


def run_and_save_evals():
    """Runs the eval suite and saves results to a JSON history file for regression tracking.

    Side effects:
        Runs pytest as a subprocess, appends a record to
        results/eval_history.json, and overwrites results/latest_run.json.

    Returns:
        dict: the record saved for this run (counts, pass rate, status,
        regression flag, and regression_detail when a regression is flagged).
    """
    results_dir = Path(__file__).parent / "results"
    # parents=True so a fresh checkout without results/ doesn't crash.
    results_dir.mkdir(parents=True, exist_ok=True)

    # Run pytest from project root (parent of agent). Note: -q is NOT
    # passed — combined with -v it silenced the per-test lines that the
    # primary counter in _parse_counts depends on.
    project_root = Path(__file__).parent.parent.parent
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "pytest",
            "agent/evals",
            "-v",
            "--tb=short",
        ],
        capture_output=True,
        text=True,
        cwd=project_root,
    )

    output = result.stdout + result.stderr
    passed, failed, errors = _parse_counts(output)

    total = passed + failed + errors
    pass_rate = round(passed / total * 100, 1) if total > 0 else 0

    # Timezone-aware "now" (datetime.utcnow() is deprecated); keep the
    # trailing-"Z" format used by existing eval_history.json entries.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    run_record = {
        "timestamp": timestamp,
        "passed": passed,
        "failed": failed,
        "errors": errors,
        "total": total,
        "pass_rate_pct": pass_rate,
        "status": "PASS" if pass_rate >= PASS_THRESHOLD_PCT else "FAIL",
        "regression": False,
    }

    history_file = results_dir / "eval_history.json"
    history = _load_history(history_file)

    # Compare against the most recent run that actually executed tests;
    # a broken run (total == 0, pass rate 0) would otherwise reset the
    # baseline and mask real regressions on the next run.
    baseline = next(
        (r for r in reversed(history) if r.get("total", 0) > 0), None
    )
    if baseline is not None and pass_rate < baseline.get("pass_rate_pct", 100):
        run_record["regression"] = True
        run_record["regression_detail"] = (
            f"Pass rate dropped from "
            f"{baseline['pass_rate_pct']}% to {pass_rate}%"
        )

    history.append(run_record)
    history_file.write_text(json.dumps(history, indent=2))

    # Also save latest run separately for quick consumption.
    latest_file = results_dir / "latest_run.json"
    latest_file.write_text(json.dumps(run_record, indent=2))

    print(f"\n{'='*50}")
    print(f"EVAL RUN: {run_record['timestamp']}")
    print(f"Passed: {passed}/{total} ({pass_rate}%)")
    print(f"Status: {run_record['status']}")
    if run_record.get("regression"):
        print(f"⚠️ REGRESSION: {run_record['regression_detail']}")
    print(f"History: {len(history)} runs saved")
    print(f"{'='*50}\n")

    return run_record


if __name__ == "__main__":
    run_and_save_evals()