You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

221 lines
5.3 KiB

import dataset from './datasets/ghostfolio-finance-agent-evals.v1.json' with {
type: 'json'
};
/**
 * Evaluation dataset, re-exported exactly as loaded by the JSON import at the
 * top of this module. Used as the default `cases` value by the suite runner
 * and by the category counter below.
 */
export const FINANCE_AGENT_EVAL_DATASET = dataset;
/**
 * Category names recognised by this module.
 * `summarizeFinanceAgentEvalByCategory` iterates this list to seed and emit
 * its per-category statistics, so the summary follows this order.
 */
export const FINANCE_AGENT_EVAL_CATEGORIES = [
'happy_path',
'edge_case',
'adversarial',
'multi_step'
];
/**
 * Tells whether a list of recorded verification checks contains the expected one.
 *
 * A recorded entry matches when its `check` equals `expectedCheck.check` and,
 * if `expectedCheck.status` is truthy, its `status` equals that status as well.
 * When the expected status is falsy, any status is accepted.
 *
 * @param {object} args
 * @param {Array<{check: *, status: *}>|null|undefined} args.actualChecks
 *   Recorded checks; `null`/`undefined` is treated as an empty list.
 * @param {{check: *, status?: *}} args.expectedCheck Expectation to look for.
 * @returns {boolean} `true` if at least one recorded entry matches.
 */
function hasExpectedVerification({ actualChecks, expectedCheck }) {
  // The expectation is only read inside the predicate, so an empty list never
  // touches `expectedCheck`.
  const satisfiesExpectation = (recorded) => {
    const { check: recordedName, status: recordedStatus } = recorded;

    return (
      recordedName === expectedCheck.check &&
      (!expectedCheck.status || recordedStatus === expectedCheck.status)
    );
  };

  const recordedChecks = actualChecks ?? [];

  return recordedChecks.some(satisfiesExpectation);
}
/**
 * Compares an agent response against the expectations of one eval case and
 * returns a human-readable message for every expectation that is not met.
 *
 * Expectations are read from `evalCase.expected`; each one is optional and is
 * checked in this order:
 *  1. `requiredTools`      - every listed tool appears in `response.toolCalls`.
 *  2. `forbiddenTools`     - none of the listed tools appears in `response.toolCalls`.
 *  3. `requiredToolCalls`  - a call with the same tool (and status, when given) exists.
 *  4. `minCitations`       - `response.citations` has at least that many entries.
 *  5. `memoryTurnsAtLeast` - `response.memory.turns` (0 when absent) reaches the minimum.
 *  6. `confidenceScoreMin` - `response.confidence.score` (0 when absent) reaches the minimum.
 *  7. `answerIncludes`     - each text is a substring of the stringified answer.
 *  8. `verificationChecks` - each check is present in `response.verification`.
 *
 * @param {object} args
 * @param {{expected: object}} args.evalCase Eval case carrying the `expected` block.
 * @param {object} args.response Agent response; must not be null/undefined.
 * @returns {string[]} Failure messages; an empty array means the case passed.
 * @throws {TypeError} If `response` is null/undefined or `evalCase.expected` is missing.
 */
export function evaluateFinanceAgentResponse({ evalCase, response }) {
  const failures = [];
  const toolCalls = response.toolCalls ?? [];
  const executedTools = new Set(toolCalls.map(({ tool }) => tool));
  const expected = evalCase.expected;

  // Renders "name" or "name:status" for messages about tool calls and checks.
  const withStatus = (name, status) => `${name}${status ? `:${status}` : ''}`;

  for (const toolName of expected.requiredTools ?? []) {
    if (executedTools.has(toolName)) {
      continue;
    }
    failures.push(`Missing required tool: ${toolName}`);
  }

  for (const toolName of expected.forbiddenTools ?? []) {
    if (!executedTools.has(toolName)) {
      continue;
    }
    failures.push(`Forbidden tool executed: ${toolName}`);
  }

  for (const wantedCall of expected.requiredToolCalls ?? []) {
    const wasObserved = toolCalls.some(
      (toolCall) =>
        toolCall.tool === wantedCall.tool &&
        (!wantedCall.status || toolCall.status === wantedCall.status)
    );

    if (!wasObserved) {
      failures.push(
        `Missing required tool call: ${withStatus(wantedCall.tool, wantedCall.status)}`
      );
    }
  }

  // Numeric lower bounds. A bound is only enforced when it is a number, and the
  // observed value is read lazily so that unused bounds never touch the response.
  const lowerBounds = [
    {
      minimum: expected.minCitations,
      observe: () => (response.citations ?? []).length,
      describe: (minimum, observed) =>
        `Expected at least ${minimum} citation(s), got ${observed}`
    },
    {
      minimum: expected.memoryTurnsAtLeast,
      observe: () => response.memory?.turns ?? 0,
      describe: (minimum, observed) =>
        `Expected memory turns >= ${minimum}, got ${observed}`
    },
    {
      minimum: expected.confidenceScoreMin,
      observe: () => response.confidence?.score ?? 0,
      describe: (minimum, observed) =>
        `Expected confidence score >= ${minimum}, got ${observed}`
    }
  ];

  for (const { minimum, observe, describe } of lowerBounds) {
    if (typeof minimum !== 'number') {
      continue;
    }

    const observed = observe();

    if (observed < minimum) {
      failures.push(describe(minimum, observed));
    }
  }

  for (const snippet of expected.answerIncludes ?? []) {
    const answerText = String(response.answer ?? '');

    if (!answerText.includes(snippet)) {
      failures.push(`Answer does not include expected text: "${snippet}"`);
    }
  }

  for (const wantedCheck of expected.verificationChecks ?? []) {
    const isPresent = hasExpectedVerification({
      actualChecks: response.verification ?? [],
      expectedCheck: wantedCheck
    });

    if (!isPresent) {
      failures.push(
        `Missing verification check: ${withStatus(wantedCheck.check, wantedCheck.status)}`
      );
    }
  }

  return failures;
}
/**
 * Aggregates eval results into pass statistics per category.
 *
 * Results are joined to cases by `id`. A case without a result, or whose result
 * has a falsy `passed`, counts as not passed. Cases whose `category` is not in
 * FINANCE_AGENT_EVAL_CATEGORIES are ignored.
 *
 * @param {object} args
 * @param {Array<{id: *, category: string}>} args.cases Eval cases to tally.
 * @param {Array<{id: *, passed: boolean}>} args.results Per-case outcomes.
 * @returns {Array<{category: string, passRate: number, passed: number, total: number}>}
 *   One entry per known category, in FINANCE_AGENT_EVAL_CATEGORIES order;
 *   `passRate` is 0 for a category with no cases.
 */
export function summarizeFinanceAgentEvalByCategory({ cases, results }) {
  // Outcome lookup by case id; a later result with the same id overrides an earlier one.
  const outcomeById = new Map();
  for (const { id, passed } of results) {
    outcomeById.set(id, passed);
  }

  // Seed every known category so empty categories still appear in the output.
  const tallies = new Map();
  for (const category of FINANCE_AGENT_EVAL_CATEGORIES) {
    tallies.set(category, { passed: 0, total: 0 });
  }

  for (const evalCase of cases) {
    const tally = tallies.get(evalCase.category);

    if (tally) {
      tally.total += 1;
      tally.passed += outcomeById.get(evalCase.id) ? 1 : 0;
    }
  }

  return FINANCE_AGENT_EVAL_CATEGORIES.map((category) => {
    const tally = tallies.get(category) ?? { passed: 0, total: 0 };
    const passRate = tally.total > 0 ? tally.passed / tally.total : 0;

    return {
      category,
      passRate,
      passed: tally.passed,
      total: tally.total
    };
  });
}
/**
 * Runs every eval case through `execute`, scores the responses and returns an
 * overall summary.
 *
 * Cases are executed one at a time, in order. If `execute` or the scoring step
 * throws, the case is recorded as failed with the error message as its only
 * failure (or 'unknown eval error' when the thrown value is not an `Error`),
 * and the run continues with the next case.
 *
 * @param {object} args
 * @param {Array<object>} [args.cases=FINANCE_AGENT_EVAL_DATASET] Eval cases to run.
 * @param {(evalCase: object) => Promise<object>|object} args.execute
 *   Produces the agent response for a case.
 * @returns {Promise<{
 *   categorySummaries: *,
 *   passRate: number,
 *   passed: number,
 *   results: Array<object>,
 *   total: number
 * }>} `results` holds one entry per case with `durationInMs`, `failures`, `id`,
 *   `passed` and, for cases that did not throw, the `response`.
 */
export async function runFinanceAgentEvalSuite({
  cases = FINANCE_AGENT_EVAL_DATASET,
  execute
}) {
  const elapsedSince = (startedAt) => Date.now() - startedAt;
  const results = [];

  for (const evalCase of cases) {
    const startedAt = Date.now();
    let outcome;

    try {
      const response = await execute(evalCase);
      const failures = evaluateFinanceAgentResponse({ evalCase, response });

      outcome = {
        durationInMs: elapsedSince(startedAt),
        failures,
        id: evalCase.id,
        passed: failures.length === 0,
        response
      };
    } catch (error) {
      const message =
        error instanceof Error ? error.message : 'unknown eval error';

      // No `response` key here: the case never produced a scorable response.
      outcome = {
        durationInMs: elapsedSince(startedAt),
        failures: [message],
        id: evalCase.id,
        passed: false
      };
    }

    results.push(outcome);
  }

  const total = cases.length;
  const passed = results.reduce(
    (count, result) => (result.passed ? count + 1 : count),
    0
  );
  const categorySummaries = summarizeFinanceAgentEvalByCategory({
    cases,
    results
  });

  return {
    categorySummaries,
    passRate: total > 0 ? passed / total : 0,
    passed,
    results,
    total
  };
}
/**
 * Counts how many eval cases fall into each known category.
 *
 * Only the four known categories are counted. A case with any other category
 * is skipped, the same way `summarizeFinanceAgentEvalByCategory` skips it.
 * Without the own-property guard, an unknown category would evaluate
 * `undefined + 1` and add a `NaN` entry to the result, and a category named
 * like an inherited property (e.g. 'constructor') would overwrite it.
 *
 * @param {Iterable<{category: string}>} [cases=FINANCE_AGENT_EVAL_DATASET]
 *   Eval cases to count.
 * @returns {{adversarial: number, edge_case: number, happy_path: number, multi_step: number}}
 *   Number of cases per known category; categories with no cases are 0.
 */
export function getFinanceAgentEvalCategoryCounts(
  cases = FINANCE_AGENT_EVAL_DATASET
) {
  const counts = {
    adversarial: 0,
    edge_case: 0,
    happy_path: 0,
    multi_step: 0
  };

  for (const { category } of cases) {
    // Own-property check keeps unknown and prototype-named categories out.
    if (Object.hasOwn(counts, category)) {
      counts[category] += 1;
    }
  }

  return counts;
}