mirror of https://github.com/ghostfolio/ghostfolio
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
170 lines
4.2 KiB
170 lines
4.2 KiB
import { createScorer } from 'evalite';
|
|
|
|
/**
 * Output of the agent under evaluation, in the shape the scorer consumes.
 */
interface AgentResponse {
  /** Names of the tools the agent called; compared against the tool expectations. */
  toolCalls: string[];
  /** Response text; inspected by the pattern and structure checks. */
  text: string;
}
|
|
|
|
/**
 * Expectations for a single golden test case.
 *
 * Every field is optional. A field that is absent (or, for the boolean flags,
 * not truthy) produces no check at all.
 */
export interface GoldenExpected {
  /** Tools that must all be among the agent's tool calls; additional calls are allowed. */
  toolsAtLeast?: string[];
  /** The distinct tools called must equal this set; order and repeated calls are ignored. */
  toolsExactly?: string[];
  /** When true, the agent must not have called any tool. */
  noTools?: boolean;
  /** Regular expressions that must each match the response text. */
  containsPattern?: RegExp[];
  /** Substrings that must not occur in the response text (compared case-insensitively). */
  containsNone?: string[];
  /** When true, the response text must contain a markdown table. */
  hasTable?: boolean;
  /** When true, the response text must contain a bullet or numbered list. */
  hasList?: boolean;
  /** When true, the response text must contain at least one non-whitespace character. */
  nonEmpty?: boolean;
}
|
|
|
|
/**
 * Outcome of one individual check.
 */
interface CheckResult {
  /** Label identifying the check, e.g. `ToolsAtLeast` or `Pattern(<source>)`. */
  name: string;
  /** Whether the check passed. */
  pass: boolean;
  /** Optional human-readable explanation of the outcome. */
  detail?: string;
}
|
|
|
|
/**
 * Compares the tools the agent called against the tool expectations.
 *
 * One result is emitted for each of `toolsAtLeast`, `toolsExactly` and
 * `noTools` that is set on `expected`; an absent expectation (or
 * `noTools: false`) produces no result.
 *
 * @param actual - Tool names the agent called; may contain repeats. Not mutated.
 * @param expected - Golden expectations for the test case.
 * @returns Check results in the order ToolsAtLeast, ToolsExactly, NoTools.
 */
function checkToolMatch(
  actual: readonly string[],
  expected: GoldenExpected
): CheckResult[] {
  const results: CheckResult[] = [];
  const called = new Set(actual);
  const { toolsAtLeast, toolsExactly, noTools } = expected;

  if (toolsAtLeast) {
    const missing = toolsAtLeast.filter((tool) => !called.has(tool));
    const pass = missing.length === 0;
    results.push({
      name: 'ToolsAtLeast',
      pass,
      detail: pass
        ? `found: ${toolsAtLeast.join(', ')}`
        : `missing: ${missing.join(', ')}`
    });
  }

  if (toolsExactly) {
    const wanted = new Set(toolsExactly);
    // Compared as sets, so call order and repeated calls do not matter.
    const pass =
      called.size === wanted.size &&
      [...wanted].every((tool) => called.has(tool));
    results.push({
      name: 'ToolsExactly',
      pass,
      detail: pass
        ? `matched: ${[...called].join(', ')}`
        : `expected: ${toolsExactly.join(', ')}, got: ${actual.join(', ')}`
    });
  }

  if (noTools) {
    const pass = actual.length === 0;
    results.push({
      name: 'NoTools',
      pass,
      detail: pass ? 'no tools called' : `unexpected tools: ${actual.join(', ')}`
    });
  }

  return results;
}
|
|
|
|
/**
 * Checks the response text against required patterns and forbidden substrings.
 *
 * Emits one `Pattern(<source>)` result per entry of `expected.containsPattern`
 * followed by one `Forbidden("<text>")` result per entry of
 * `expected.containsNone`.
 *
 * @param text - Response text to inspect.
 * @param expected - Golden expectations for the test case.
 * @returns Check results; empty when neither expectation is set.
 */
function checkPatterns(text: string, expected: GoldenExpected): CheckResult[] {
  const results: CheckResult[] = [];

  for (const re of expected.containsPattern ?? []) {
    // A regex with the `g` or `y` flag keeps its position in `lastIndex`
    // between calls. Evaluate exactly once, from the start of the text, so
    // `pass` and `detail` cannot disagree, and reset afterwards so reusing the
    // same RegExp object for another response starts clean.
    re.lastIndex = 0;
    const matched = re.test(text);
    re.lastIndex = 0;

    results.push({
      name: `Pattern(${re.source})`,
      pass: matched,
      detail: matched ? 'matched' : 'no match'
    });
  }

  if (expected.containsNone) {
    // Forbidden substrings are compared case-insensitively.
    const haystack = text.toLowerCase();
    for (const forbidden of expected.containsNone) {
      const found = haystack.includes(forbidden.toLowerCase());
      results.push({
        name: `Forbidden("${forbidden}")`,
        pass: !found,
        detail: found ? 'FOUND in response' : 'absent'
      });
    }
  }

  return results;
}
|
|
|
|
/**
 * Checks structural properties of the response text: presence of a markdown
 * table, presence of a list, and non-emptiness.
 *
 * Only the flags that are truthy on `expected` produce a result.
 *
 * @param text - Response text to inspect.
 * @param expected - Golden expectations for the test case.
 * @returns Check results in the order HasTable, HasList, NonEmpty.
 */
function checkStructure(text: string, expected: GoldenExpected): CheckResult[] {
  const results: CheckResult[] = [];

  if (expected.hasTable) {
    // A table is recognised by its delimiter row. The first alternative is the
    // compact form (`|---|`, `|:--`). The second accepts a padded delimiter
    // cell (`| --- |`, `| :-: |`, or a trailing `| ---` at end of line), which
    // the compact form alone does not match.
    const delimiterRow = /\|[-:]+|\|[ \t]+:?-+:?[ \t]*(?:\||\r?$)/m;
    const pass = delimiterRow.test(text);
    results.push({
      name: 'HasTable',
      pass,
      detail: pass ? 'table found' : 'no markdown table detected'
    });
  }

  if (expected.hasList) {
    // A line starting (after optional whitespace) with `-`/`*` or `<digits>.`
    // followed by whitespace counts as a list item.
    const bulletItem = /^[\s]*[-*]\s/m;
    const numberedItem = /^[\s]*\d+\.\s/m;
    const pass = bulletItem.test(text) || numberedItem.test(text);
    results.push({
      name: 'HasList',
      pass,
      detail: pass ? 'list found' : 'no bullet or numbered list detected'
    });
  }

  if (expected.nonEmpty) {
    // Whitespace-only responses count as empty.
    const visibleLength = text.trim().length;
    const pass = visibleLength > 0;
    results.push({
      name: 'NonEmpty',
      pass,
      detail: pass ? `${visibleLength} chars` : 'empty response'
    });
  }

  return results;
}
|
|
|
|
/**
 * Deterministic meta-scorer: the score is 1 only when every check requested by
 * `expected` passes, and 0 otherwise.
 *
 * - A missing `expected` scores 0 with an `error` entry in the metadata.
 * - An `expected` that requests no checks scores 1, since nothing failed.
 *
 * The metadata reports the total, passed and failed counts plus the
 * name/pass/detail of each individual check.
 */
export const GoldenCheck = createScorer<string, AgentResponse, GoldenExpected>({
  name: 'Golden Check',
  description: 'Deterministic binary pass/fail — all checks must pass',
  scorer: ({ output, expected }) => {
    if (!expected) {
      return { score: 0, metadata: { error: 'no expected config' } };
    }

    // Order is fixed: tool checks, then pattern checks, then structure checks.
    const checks = [
      ...checkToolMatch(output.toolCalls, expected),
      ...checkPatterns(output.text, expected),
      ...checkStructure(output.text, expected)
    ];

    const failedCount = checks.filter((check) => !check.pass).length;

    return {
      score: failedCount === 0 ? 1 : 0,
      metadata: {
        total: checks.length,
        passed: checks.length - failedCount,
        failed: failedCount,
        checks: checks.map(({ name, pass, detail }) => ({ name, pass, detail }))
      }
    };
  }
});
|
|
|