import asyncio, yaml, httpx, time, json from datetime import datetime BASE = "http://localhost:8000" async def run_check(client, case, retries=2): if not case.get('query') and case.get('query') != '': return {**case, 'passed': True, 'note': 'skipped'} last_exc = None for attempt in range(1, retries + 1): start = time.time() try: resp = await client.post(f"{BASE}/chat", json={"query": case.get('query', ''), "history": []}, timeout=30.0) data = resp.json() elapsed = time.time() - start break except Exception as e: last_exc = e if attempt < retries: await asyncio.sleep(2) else: return { 'id': case['id'], 'passed': False, 'failures': [f"EXCEPTION (after {retries} attempts): {str(last_exc)}"], 'latency': 0, 'tools_used': [] } response_text = data.get('response', '').lower() tools_used = data.get('tools_used', []) failures = [] # Check 1: Tool selection for tool in case.get('expected_tools', []): if tool not in tools_used: failures.append(f"TOOL SELECTION: Expected '{tool}' — got {tools_used}") # Check 2: Content validation (must_contain) for phrase in case.get('must_contain', []): if phrase.lower() not in response_text: failures.append(f"CONTENT: Missing required phrase '{phrase}'") # Check 3: must_contain_one_of one_of = case.get('must_contain_one_of', []) if one_of and not any(p.lower() in response_text for p in one_of): failures.append(f"CONTENT: Must contain one of {one_of}") # Check 4: Negative validation (must_not_contain) for phrase in case.get('must_not_contain', []): if phrase.lower() in response_text: failures.append(f"NEGATIVE: Contains forbidden phrase '{phrase}'") # Check 5: Latency (30s budget for complex multi-tool queries) limit = 30.0 if elapsed > limit: failures.append(f"LATENCY: {elapsed:.1f}s exceeded {limit}s") passed = len(failures) == 0 return { 'id': case['id'], 'category': case.get('category', ''), 'difficulty': case.get('difficulty', ''), 'subcategory': case.get('subcategory', ''), 'passed': passed, 'latency': round(elapsed, 2), 'tools_used': tools_used, 'failures': failures, 'query': case.get('query', '')[:60] } async def main(): # Load both files with open('evals/golden_sets.yaml') as f: golden = yaml.safe_load(f) with open('evals/labeled_scenarios.yaml') as f: scenarios = yaml.safe_load(f) print("=" * 60) print("GHOSTFOLIO AGENT — GOLDEN SETS") print("=" * 60) async with httpx.AsyncClient() as client: # Run golden sets first golden_results = [] for case in golden: r = await run_check(client, case) golden_results.append(r) status = "✅ PASS" if r['passed'] else "❌ FAIL" print(f"{status} | {r['id']} | {r.get('latency',0):.1f}s | tools: {r.get('tools_used', [])}") if not r['passed']: for f in r['failures']: print(f" → {f}") golden_pass = sum(r['passed'] for r in golden_results) print(f"\nGOLDEN SETS: {golden_pass}/{len(golden_results)} passed") if golden_pass < len(golden_results): print("\n⚠️ GOLDEN SET FAILURES — something is fundamentally broken.") print("Fix these before looking at labeled scenarios.\n") # Still save partial results and continue to scenarios for full picture all_results = { 'timestamp': datetime.utcnow().isoformat(), 'golden_sets': golden_results, 'labeled_scenarios': [], 'summary': { 'golden_pass_rate': f"{golden_pass}/{len(golden_results)}", 'scenario_pass_rate': "not run", } } with open('evals/golden_results.json', 'w') as f: json.dump(all_results, f, indent=2) print(f"Partial results → evals/golden_results.json") return print("\n✅ All golden sets passed. Running labeled scenarios...\n") print("=" * 60) print("LABELED SCENARIOS — COVERAGE ANALYSIS") print("=" * 60) # Run labeled scenarios scenario_results = [] for case in scenarios: r = await run_check(client, case) scenario_results.append(r) status = "✅ PASS" if r['passed'] else "❌ FAIL" diff = case.get('difficulty', '') cat = case.get('subcategory', '') print(f"{status} | {r['id']} | {diff:15} | {cat:30} | {r.get('latency',0):.1f}s") if not r['passed']: for f in r['failures']: print(f" → {f}") scenario_pass = sum(r['passed'] for r in scenario_results) # Results by difficulty print(f"\n{'='*60}") print(f"RESULTS BY DIFFICULTY:") for diff in ['straightforward', 'ambiguous', 'edge_case', 'adversarial']: subset = [r for r in scenario_results if r.get('difficulty') == diff] if subset: p = sum(r['passed'] for r in subset) print(f" {diff:20}: {p}/{len(subset)}") print(f"\nSCENARIOS: {scenario_pass}/{len(scenario_results)} passed") print(f"OVERALL: {golden_pass + scenario_pass}/{len(golden_results) + len(scenario_results)} passed") # Save results all_results = { 'timestamp': datetime.utcnow().isoformat(), 'golden_sets': golden_results, 'labeled_scenarios': scenario_results, 'summary': { 'golden_pass_rate': f"{golden_pass}/{len(golden_results)}", 'scenario_pass_rate': f"{scenario_pass}/{len(scenario_results)}", } } with open('evals/golden_results.json', 'w') as f: json.dump(all_results, f, indent=2) print(f"\nFull results → evals/golden_results.json") asyncio.run(main())