# Evaluation harness for the Ghostfolio agent.
# (Agent mirror: https://github.com/ghostfolio/ghostfolio)
import asyncio
import json
import time
from datetime import datetime, timezone

import httpx
import yaml
BASE = "http://localhost:8000"
async def run_check(client, case, retries=2):
|
|
if not case.get('query') and case.get('query') != '':
|
|
return {**case, 'passed': True, 'note': 'skipped'}
|
|
|
|
last_exc = None
|
|
for attempt in range(1, retries + 1):
|
|
start = time.time()
|
|
try:
|
|
resp = await client.post(f"{BASE}/chat",
|
|
json={"query": case.get('query', ''), "history": []},
|
|
timeout=30.0)
|
|
data = resp.json()
|
|
elapsed = time.time() - start
|
|
break
|
|
except Exception as e:
|
|
last_exc = e
|
|
if attempt < retries:
|
|
await asyncio.sleep(2)
|
|
else:
|
|
return {
|
|
'id': case['id'],
|
|
'passed': False,
|
|
'failures': [f"EXCEPTION (after {retries} attempts): {str(last_exc)}"],
|
|
'latency': 0,
|
|
'tools_used': []
|
|
}
|
|
|
|
response_text = data.get('response', '').lower()
|
|
tools_used = data.get('tools_used', [])
|
|
|
|
failures = []
|
|
|
|
# Check 1: Tool selection
|
|
for tool in case.get('expected_tools', []):
|
|
if tool not in tools_used:
|
|
failures.append(f"TOOL SELECTION: Expected '{tool}' — got {tools_used}")
|
|
|
|
# Check 2: Content validation (must_contain)
|
|
for phrase in case.get('must_contain', []):
|
|
if phrase.lower() not in response_text:
|
|
failures.append(f"CONTENT: Missing required phrase '{phrase}'")
|
|
|
|
# Check 3: must_contain_one_of
|
|
one_of = case.get('must_contain_one_of', [])
|
|
if one_of and not any(p.lower() in response_text for p in one_of):
|
|
failures.append(f"CONTENT: Must contain one of {one_of}")
|
|
|
|
# Check 4: Negative validation (must_not_contain)
|
|
for phrase in case.get('must_not_contain', []):
|
|
if phrase.lower() in response_text:
|
|
failures.append(f"NEGATIVE: Contains forbidden phrase '{phrase}'")
|
|
|
|
# Check 5: Latency (30s budget for complex multi-tool queries)
|
|
limit = 30.0
|
|
if elapsed > limit:
|
|
failures.append(f"LATENCY: {elapsed:.1f}s exceeded {limit}s")
|
|
|
|
passed = len(failures) == 0
|
|
return {
|
|
'id': case['id'],
|
|
'category': case.get('category', ''),
|
|
'difficulty': case.get('difficulty', ''),
|
|
'subcategory': case.get('subcategory', ''),
|
|
'passed': passed,
|
|
'latency': round(elapsed, 2),
|
|
'tools_used': tools_used,
|
|
'failures': failures,
|
|
'query': case.get('query', '')[:60]
|
|
}
|
|
|
|
|
|
|
|
async def main():
|
|
# Load both files
|
|
with open('evals/golden_sets.yaml') as f:
|
|
golden = yaml.safe_load(f)
|
|
with open('evals/labeled_scenarios.yaml') as f:
|
|
scenarios = yaml.safe_load(f)
|
|
|
|
print("=" * 60)
|
|
print("GHOSTFOLIO AGENT — GOLDEN SETS")
|
|
print("=" * 60)
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
# Run golden sets first
|
|
golden_results = []
|
|
for case in golden:
|
|
r = await run_check(client, case)
|
|
golden_results.append(r)
|
|
status = "✅ PASS" if r['passed'] else "❌ FAIL"
|
|
print(f"{status} | {r['id']} | {r.get('latency',0):.1f}s | tools: {r.get('tools_used', [])}")
|
|
if not r['passed']:
|
|
for f in r['failures']:
|
|
print(f" → {f}")
|
|
|
|
golden_pass = sum(r['passed'] for r in golden_results)
|
|
print(f"\nGOLDEN SETS: {golden_pass}/{len(golden_results)} passed")
|
|
|
|
if golden_pass < len(golden_results):
|
|
print("\n⚠️ GOLDEN SET FAILURES — something is fundamentally broken.")
|
|
print("Fix these before looking at labeled scenarios.\n")
|
|
|
|
# Still save partial results and continue to scenarios for full picture
|
|
all_results = {
|
|
'timestamp': datetime.utcnow().isoformat(),
|
|
'golden_sets': golden_results,
|
|
'labeled_scenarios': [],
|
|
'summary': {
|
|
'golden_pass_rate': f"{golden_pass}/{len(golden_results)}",
|
|
'scenario_pass_rate': "not run",
|
|
}
|
|
}
|
|
with open('evals/golden_results.json', 'w') as f:
|
|
json.dump(all_results, f, indent=2)
|
|
print(f"Partial results → evals/golden_results.json")
|
|
return
|
|
|
|
print("\n✅ All golden sets passed. Running labeled scenarios...\n")
|
|
print("=" * 60)
|
|
print("LABELED SCENARIOS — COVERAGE ANALYSIS")
|
|
print("=" * 60)
|
|
|
|
# Run labeled scenarios
|
|
scenario_results = []
|
|
for case in scenarios:
|
|
r = await run_check(client, case)
|
|
scenario_results.append(r)
|
|
status = "✅ PASS" if r['passed'] else "❌ FAIL"
|
|
diff = case.get('difficulty', '')
|
|
cat = case.get('subcategory', '')
|
|
print(f"{status} | {r['id']} | {diff:15} | {cat:30} | {r.get('latency',0):.1f}s")
|
|
if not r['passed']:
|
|
for f in r['failures']:
|
|
print(f" → {f}")
|
|
|
|
scenario_pass = sum(r['passed'] for r in scenario_results)
|
|
|
|
# Results by difficulty
|
|
print(f"\n{'='*60}")
|
|
print(f"RESULTS BY DIFFICULTY:")
|
|
for diff in ['straightforward', 'ambiguous', 'edge_case', 'adversarial']:
|
|
subset = [r for r in scenario_results if r.get('difficulty') == diff]
|
|
if subset:
|
|
p = sum(r['passed'] for r in subset)
|
|
print(f" {diff:20}: {p}/{len(subset)}")
|
|
|
|
print(f"\nSCENARIOS: {scenario_pass}/{len(scenario_results)} passed")
|
|
print(f"OVERALL: {golden_pass + scenario_pass}/{len(golden_results) + len(scenario_results)} passed")
|
|
|
|
# Save results
|
|
all_results = {
|
|
'timestamp': datetime.utcnow().isoformat(),
|
|
'golden_sets': golden_results,
|
|
'labeled_scenarios': scenario_results,
|
|
'summary': {
|
|
'golden_pass_rate': f"{golden_pass}/{len(golden_results)}",
|
|
'scenario_pass_rate': f"{scenario_pass}/{len(scenario_results)}",
|
|
}
|
|
}
|
|
with open('evals/golden_results.json', 'w') as f:
|
|
json.dump(all_results, f, indent=2)
|
|
print(f"\nFull results → evals/golden_results.json")
|
|
|
|
|
|
asyncio.run(main())
|
|
|