# ghostfolio/agent/evals/save_eval_results.py


								"""

								Runs the eval suite and saves results to a JSON history file for regression tracking.

								"""

								import json

								import re

								import subprocess

								import sys

								from datetime import datetime

								from pathlib import Path


								def _parse_pytest_counts(output):
								    """Extract ``(passed, failed, errors)`` counts from captured pytest output."""
								    # Prefer pytest's final summary line, e.g. "1 failed, 182 passed in 30.32s".
								    # It is present in both default and verbose output, whereas per-test
								    # "PASSED"/"FAILED" markers are only printed in verbose mode.
								    for line in reversed(output.splitlines()):
								        if not re.search(r"\bin\s+\d+(?:\.\d+)?s\b", line):
								            continue
								        matches = re.findall(r"(\d+)\s+(passed|failed|error)s?\b", line, re.I)
								        if not matches:
								            # e.g. "no tests ran in 0.01s" -- keep looking, then fall back.
								            continue
								        counts = {"passed": 0, "failed": 0, "error": 0}
								        for number, label in matches:
								            counts[label.lower()] = int(number)
								        return counts["passed"], counts["failed"], counts["error"]

								    # Fallback: no usable summary line, so count per-test result markers.
								    lines = output.splitlines()
								    return (
								        sum(1 for line in lines if " PASSED" in line),
								        sum(1 for line in lines if " FAILED" in line),
								        sum(1 for line in lines if " ERROR" in line),
								    )


								def _load_history(history_file):
								    """Return the saved run history as a list, or ``[]`` if missing or unusable."""
								    if not history_file.exists():
								        return []
								    try:
								        history = json.loads(history_file.read_text(encoding="utf-8"))
								    except (OSError, ValueError) as exc:
								        # ValueError covers both malformed JSON and undecodable bytes.
								        # Stay best-effort (start a fresh history) but say so, because the
								        # unreadable file is about to be overwritten.
								        print(
								            f"Warning: ignoring unreadable eval history {history_file}: {exc}",
								            file=sys.stderr,
								        )
								        return []
								    if not isinstance(history, list):
								        print(
								            f"Warning: ignoring eval history {history_file}: expected a JSON list",
								            file=sys.stderr,
								        )
								        return []
								    return history


								def run_and_save_evals():
								    """Run the eval suite and save the outcome to a JSON history file.

								    Runs ``pytest agent/evals`` in a subprocess from the project root, derives
								    passed/failed/error counts from its output, appends a record to
								    ``results/eval_history.json`` and writes the same record to
								    ``results/latest_run.json`` (both next to this file). A run is marked as a
								    regression when its pass rate is lower than that of the previous record.

								    Returns:
								        dict: The record for this run, with keys ``timestamp``, ``passed``,
								        ``failed``, ``errors``, ``total``, ``pass_rate_pct``, ``status``,
								        ``regression`` and, on regression only, ``regression_detail``.
								    """
								    # Local import: the module header only imports ``datetime`` itself.
								    from datetime import timezone

								    results_dir = Path(__file__).parent / "results"
								    results_dir.mkdir(exist_ok=True)

								    # Run pytest from project root (parent of agent)
								    project_root = Path(__file__).parent.parent.parent
								    result = subprocess.run(
								        [
								            sys.executable,
								            "-m",
								            "pytest",
								            "agent/evals",
								            "-v",
								            "--tb=short",
								            "-q",
								        ],
								        capture_output=True,
								        text=True,
								        cwd=project_root,
								    )

								    passed, failed, errors = _parse_pytest_counts(result.stdout + result.stderr)
								    total = passed + failed + errors
								    pass_rate = round(passed / total * 100, 1) if total > 0 else 0

								    # Aware "now" in UTC, serialized exactly like the former naive
								    # ``utcnow().isoformat() + "Z"`` so existing history stays comparable.
								    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

								    run_record = {
								        "timestamp": timestamp,
								        "passed": passed,
								        "failed": failed,
								        "errors": errors,
								        "total": total,
								        "pass_rate_pct": pass_rate,
								        "status": "PASS" if pass_rate >= 80 else "FAIL",
								        "regression": False,
								    }

								    history_file = results_dir / "eval_history.json"
								    history = _load_history(history_file)

								    # Check for regression against the most recent record. Records without a
								    # numeric pass rate provide no baseline, so they are skipped instead of
								    # raising while building the message.
								    if history:
								        last = history[-1]
								        previous_rate = last.get("pass_rate_pct") if isinstance(last, dict) else None
								        if isinstance(previous_rate, (int, float)) and pass_rate < previous_rate:
								            run_record["regression"] = True
								            run_record["regression_detail"] = (
								                f"Pass rate dropped from "
								                f"{previous_rate}% to {pass_rate}%"
								            )

								    history.append(run_record)
								    history_file.write_text(json.dumps(history, indent=2), encoding="utf-8")

								    # Also save latest run separately
								    latest_file = results_dir / "latest_run.json"
								    latest_file.write_text(json.dumps(run_record, indent=2), encoding="utf-8")

								    print(f"\n{'='*50}")
								    print(f"EVAL RUN: {run_record['timestamp']}")
								    print(f"Passed:   {passed}/{total} ({pass_rate}%)")
								    print(f"Status:   {run_record['status']}")
								    if run_record.get("regression"):
								        print(f"⚠️  REGRESSION: {run_record['regression_detail']}")
								    print(f"History:  {len(history)} runs saved")
								    print(f"{'='*50}\n")

								    return run_record


								# Script entry point: when this file is executed directly (rather than
								# imported), run the eval suite once. The returned run record is not used here.
								if __name__ == "__main__":

								    run_and_save_evals()