mirror of https://github.com/ghostfolio/ghostfolio
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
2.4 KiB
86 lines
2.4 KiB
import { createScorer } from 'evalite';
|
|
|
|
import type { AgentResponse } from '../helpers';
|
|
|
|
/**
 * Deterministic scorer that validates an agent response and looks for
 * hallucinated dollar figures.
 *
 * Up to three checks are run against the `AgentResponse` output:
 *  1. The trimmed response text is at least 10 characters long (always run).
 *  2. If any tool was called, the response text contains at least one digit.
 *  3. If tool results exist and both the response text and the serialized
 *     tool results contain dollar amounts, no more than half of the
 *     response amounts may lack an approximate match in the tool results.
 *
 * The score is `passed / checks` rounded to two decimals; the metadata
 * carries the raw counters and a human-readable message per failed check.
 */
export const VerificationCheck = createScorer<string, AgentResponse, string>({
  name: 'Verification',
  description:
    'Checks output validity and hallucination risk using tool results',
  scorer: ({ output }) => {
    const issues: string[] = [];
    let checks = 0;
    let passed = 0;

    // Registers the outcome of one check, remembering the message on failure.
    const tally = (ok: boolean, failureMessage: string): void => {
      checks += 1;
      if (ok) {
        passed += 1;
        return;
      }
      issues.push(failureMessage);
    };

    // Check 1: the response must carry a minimum amount of text.
    const hasEnoughText = output.text.trim().length >= 10;
    tally(hasEnoughText, 'Response too short');

    // Check 2: a response built on tool calls is expected to quote numbers.
    if (output.toolCalls.length > 0) {
      tally(
        /\d/.test(output.text),
        'Tools called but no numeric data in response'
      );
    }

    // Check 3: dollar amounts in the response should trace back to tool data.
    if (output.toolResults.length > 0) {
      const quotedAmounts = extractDollarAmounts(output.text);
      const serializedResults = JSON.stringify(
        output.toolResults.map((entry) => entry.result)
      );
      const sourceAmounts = extractDollarAmounts(serializedResults);

      // Only comparable when both sides actually contain dollar amounts.
      if (quotedAmounts.length > 0 && sourceAmounts.length > 0) {
        const orphans = quotedAmounts.filter(
          (quoted) =>
            !sourceAmounts.some((source) => isApproxMatch(quoted, source))
        );
        const orphanRatio = orphans.length / quotedAmounts.length;
        // Only the first three offenders are listed in the message.
        const preview = orphans
          .slice(0, 3)
          .map((amount) => '$' + amount)
          .join(', ');

        tally(orphanRatio <= 0.5, `Unmatched dollar amounts: ${preview}`);
      }
    }

    // Check 1 always runs, so `checks` is at least 1 and the division is safe.
    const ratio = passed / checks;

    return {
      score: Math.round(ratio * 100) / 100,
      metadata: { checks, passed, issues }
    };
  }
});
|
|
|
|
/**
 * Extracts every dollar amount written as `$` followed by digits (with
 * optional thousands separators and up to two decimals) from a string.
 *
 * @param str Text to scan, e.g. an agent response or serialized tool data.
 * @returns The amounts as numbers, in order of appearance; an empty array
 *   when the text contains none.
 */
function extractDollarAmounts(str: string): number[] {
  // A digit is required right after "$": a token such as "$," would
  // otherwise be captured, reduced to an empty string once "$" and ","
  // are stripped, and parsed into NaN.
  const matches = str.match(/\$\d[\d,]*(?:\.\d{1,2})?/g) ?? [];

  // Drop the currency sign and thousands separators before parsing.
  return matches.map((m) => parseFloat(m.replace(/[$,]/g, '')));
}
|
|
|
|
/**
 * Tells whether two amounts are close enough to count as the same figure.
 *
 * Two values match when they differ by less than 1 in absolute terms, or
 * by less than 5% of the larger magnitude.
 *
 * @param a First amount.
 * @param b Second amount.
 * @returns `true` when the amounts are within tolerance of each other.
 */
function isApproxMatch(a: number, b: number): boolean {
  const gap = Math.abs(a - b);

  // Absolute tolerance first: it also covers the case where both values
  // are zero, so the relative test below never evaluates 0 / 0.
  if (gap < 1) {
    return true;
  }

  const scale = Math.max(Math.abs(a), Math.abs(b));
  return gap / scale < 0.05;
}
|
|
|