mirror of https://github.com/ghostfolio/ghostfolio
3 changed files with 139 additions and 0 deletions
@ -0,0 +1,22 @@ |
|||
[ |
|||
{ |
|||
"timestamp": "2026-02-28T14:47:17.398436Z", |
|||
"passed": 0, |
|||
"failed": 0, |
|||
"errors": 0, |
|||
"total": 0, |
|||
"pass_rate_pct": 0, |
|||
"status": "FAIL", |
|||
"regression": false |
|||
}, |
|||
{ |
|||
"timestamp": "2026-02-28T14:48:25.385529Z", |
|||
"passed": 182, |
|||
"failed": 0, |
|||
"errors": 0, |
|||
"total": 182, |
|||
"pass_rate_pct": 100.0, |
|||
"status": "PASS", |
|||
"regression": false |
|||
} |
|||
] |
|||
@ -0,0 +1,10 @@ |
|||
{ |
|||
"timestamp": "2026-02-28T14:48:25.385529Z", |
|||
"passed": 182, |
|||
"failed": 0, |
|||
"errors": 0, |
|||
"total": 182, |
|||
"pass_rate_pct": 100.0, |
|||
"status": "PASS", |
|||
"regression": false |
|||
} |
|||
@ -0,0 +1,107 @@ |
|||
""" |
|||
Runs the eval suite and saves results to a JSON history file for regression tracking. |
|||
""" |
|||
import json |
|||
import re |
|||
import subprocess |
|||
import sys |
|||
from datetime import datetime |
|||
from pathlib import Path |
|||
|
|||
|
|||
def run_and_save_evals(): |
|||
"""Runs the eval suite and saves results to a JSON history file for regression tracking.""" |
|||
results_dir = Path(__file__).parent / "results" |
|||
results_dir.mkdir(exist_ok=True) |
|||
|
|||
# Run pytest from project root (parent of agent) |
|||
project_root = Path(__file__).parent.parent.parent |
|||
result = subprocess.run( |
|||
[ |
|||
sys.executable, |
|||
"-m", |
|||
"pytest", |
|||
"agent/evals", |
|||
"-v", |
|||
"--tb=short", |
|||
"-q", |
|||
], |
|||
capture_output=True, |
|||
text=True, |
|||
cwd=project_root, |
|||
) |
|||
|
|||
# Parse results from pytest output |
|||
output = result.stdout + result.stderr |
|||
lines = output.split("\n") |
|||
|
|||
passed = sum(1 for l in lines if " PASSED" in l) |
|||
failed = sum(1 for l in lines if " FAILED" in l) |
|||
errors = sum(1 for l in lines if " ERROR" in l) |
|||
|
|||
# Fallback: parse summary line like "182 passed, 1 warning in 30.32s" |
|||
if passed == 0 and failed == 0 and "passed" in output.lower(): |
|||
m = re.search(r"(\d+)\s+passed", output) |
|||
if m: |
|||
passed = int(m.group(1)) |
|||
m = re.search(r"(\d+)\s+failed", output) |
|||
if m: |
|||
failed = int(m.group(1)) |
|||
m = re.search(r"(\d+)\s+error", output, re.I) |
|||
if m: |
|||
errors = int(m.group(1)) |
|||
|
|||
total = passed + failed + errors |
|||
pass_rate = round(passed / total * 100, 1) if total > 0 else 0 |
|||
|
|||
run_record = { |
|||
"timestamp": datetime.utcnow().isoformat() + "Z", |
|||
"passed": passed, |
|||
"failed": failed, |
|||
"errors": errors, |
|||
"total": total, |
|||
"pass_rate_pct": pass_rate, |
|||
"status": "PASS" if pass_rate >= 80 else "FAIL", |
|||
"regression": False, |
|||
} |
|||
|
|||
# Load history |
|||
history_file = results_dir / "eval_history.json" |
|||
history = [] |
|||
if history_file.exists(): |
|||
try: |
|||
history = json.loads(history_file.read_text()) |
|||
except Exception: |
|||
history = [] |
|||
|
|||
# Check for regression |
|||
if history: |
|||
last = history[-1] |
|||
if pass_rate < last.get("pass_rate_pct", 100): |
|||
run_record["regression"] = True |
|||
run_record["regression_detail"] = ( |
|||
f"Pass rate dropped from " |
|||
f"{last['pass_rate_pct']}% to {pass_rate}%" |
|||
) |
|||
|
|||
history.append(run_record) |
|||
history_file.write_text(json.dumps(history, indent=2)) |
|||
|
|||
# Also save latest run separately |
|||
latest_file = results_dir / "latest_run.json" |
|||
latest_file.write_text(json.dumps(run_record, indent=2)) |
|||
|
|||
print(f"\n{'='*50}") |
|||
print(f"EVAL RUN: {run_record['timestamp']}") |
|||
print(f"Passed: {passed}/{total} ({pass_rate}%)") |
|||
print(f"Status: {run_record['status']}") |
|||
if run_record.get("regression"): |
|||
print(f"⚠️ REGRESSION: {run_record['regression_detail']}") |
|||
print(f"History: {len(history)} runs saved") |
|||
print(f"{'='*50}\n") |
|||
|
|||
return run_record |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
run_and_save_evals() |
|||
Loading…
Reference in new issue