ghostfolio/tools/evals/finance-agent-evals/index.mjs


								import dataset from './datasets/ghostfolio-finance-agent-evals.v1.json' with {

								  type: 'json'

								};


								// Re-exports the JSON dataset imported at the top of this module; it is the
								// default `cases` value for the suite runner and the category counter below.
								export const FINANCE_AGENT_EVAL_DATASET = dataset;

								// Category names, in the order category summaries are reported.
								// summarizeFinanceAgentEvalByCategory seeds its statistics from this list and
								// skips cases whose `category` is not listed here.
								export const FINANCE_AGENT_EVAL_CATEGORIES = [

								  'happy_path',

								  'edge_case',

								  'adversarial',

								  'multi_step'

								];


								/**
								 * Tells whether any actual verification entry satisfies an expected one.
								 *
								 * An entry matches when its `check` equals the expected `check` and, if the
								 * expectation carries a truthy `status`, its `status` equals that as well.
								 *
								 * @param {object} params
								 * @param {Array<{check: *, status: *}>} [params.actualChecks] - Verification
								 *   entries to search; a nullish value is treated as an empty list.
								 * @param {{check: *, status?: *}} params.expectedCheck - Expectation to satisfy.
								 * @returns {boolean} `true` when at least one entry matches.
								 */
								function hasExpectedVerification({
								  actualChecks,
								  expectedCheck
								}) {
								  // The status comparison only applies when the expectation names a status.
								  const satisfiesExpectation = ({ check, status }) =>
								    check === expectedCheck.check &&
								    (!expectedCheck.status || status === expectedCheck.status);

								  return (actualChecks ?? []).some(satisfiesExpectation);
								}


								/**
								 * Checks an agent response against the expectations declared on an eval case.
								 *
								 * Each unmet expectation contributes one message; an empty array means every
								 * declared expectation was met. Expectations missing from
								 * `evalCase.expected` are not checked.
								 *
								 * @param {object} params
								 * @param {object} params.evalCase - Eval case whose `expected` object may
								 *   declare `requiredTools`, `forbiddenTools`, `requiredToolCalls`,
								 *   `minCitations`, `memoryTurnsAtLeast`, `confidenceScoreMin`,
								 *   `answerIncludes` and `verificationChecks`.
								 * @param {object} params.response - Agent response. Reads `toolCalls`,
								 *   `citations`, `memory.turns`, `confidence.score`, `answer` and
								 *   `verification`; nullish values fall back to an empty list, `0` or `''`.
								 * @returns {string[]} Failure messages, in the order the checks are run.
								 */
								export function evaluateFinanceAgentResponse({
								  evalCase,
								  response
								}) {
								  const toolCalls = response.toolCalls ?? [];
								  const expected = evalCase.expected;
								  const executedTools = new Set(toolCalls.map(({ tool }) => tool));
								  const failures = [];

								  // Renders `name` alone, or `name:status` when a status is given.
								  const withStatus = (name, status) => `${name}${status ? `:${status}` : ''}`;

								  // Tool names that must appear among the executed tool calls.
								  for (const tool of expected.requiredTools ?? []) {
								    if (executedTools.has(tool)) {
								      continue;
								    }

								    failures.push(`Missing required tool: ${tool}`);
								  }

								  // Tool names that must not appear among the executed tool calls.
								  for (const tool of expected.forbiddenTools ?? []) {
								    if (!executedTools.has(tool)) {
								      continue;
								    }

								    failures.push(`Forbidden tool executed: ${tool}`);
								  }

								  // Tool calls that must be present, optionally with a specific status.
								  for (const { status, tool } of expected.requiredToolCalls ?? []) {
								    const isMatch = (call) =>
								      call.tool === tool && (!status || call.status === status);

								    if (!toolCalls.some(isMatch)) {
								      failures.push(`Missing required tool call: ${withStatus(tool, status)}`);
								    }
								  }

								  // Numeric thresholds are only enforced when declared as numbers.
								  const { confidenceScoreMin, memoryTurnsAtLeast, minCitations } = expected;

								  if (typeof minCitations === 'number') {
								    const citationCount = (response.citations ?? []).length;

								    if (citationCount < minCitations) {
								      failures.push(
								        `Expected at least ${minCitations} citation(s), got ${citationCount}`
								      );
								    }
								  }

								  if (typeof memoryTurnsAtLeast === 'number') {
								    const memoryTurns = response.memory?.turns ?? 0;

								    if (memoryTurns < memoryTurnsAtLeast) {
								      failures.push(
								        `Expected memory turns >= ${memoryTurnsAtLeast}, got ${memoryTurns}`
								      );
								    }
								  }

								  if (typeof confidenceScoreMin === 'number') {
								    const confidenceScore = response.confidence?.score ?? 0;

								    if (confidenceScore < confidenceScoreMin) {
								      failures.push(
								        `Expected confidence score >= ${confidenceScoreMin}, got ${confidenceScore}`
								      );
								    }
								  }

								  // Case-sensitive substring checks against the stringified answer.
								  for (const snippet of expected.answerIncludes ?? []) {
								    const answerText = String(response.answer ?? '');

								    if (answerText.includes(snippet)) {
								      continue;
								    }

								    failures.push(`Answer does not include expected text: "${snippet}"`);
								  }

								  const actualChecks = response.verification ?? [];

								  for (const expectedCheck of expected.verificationChecks ?? []) {
								    if (hasExpectedVerification({ actualChecks, expectedCheck })) {
								      continue;
								    }

								    failures.push(
								      `Missing verification check: ${withStatus(expectedCheck.check, expectedCheck.status)}`
								    );
								  }

								  return failures;
								}


								/**
								 * Aggregates pass/fail outcomes per category.
								 *
								 * Produces one summary per entry of FINANCE_AGENT_EVAL_CATEGORIES, in that
								 * order. Cases whose `category` is not in that list are left out, and a case
								 * counts as passed only when a result with the same `id` has a truthy
								 * `passed` value.
								 *
								 * @param {object} params
								 * @param {Iterable<{id: *, category: string}>} params.cases - Eval cases.
								 * @param {Array<{id: *, passed: boolean}>} params.results - Per-case results;
								 *   when an id occurs more than once the last entry wins.
								 * @returns {Array<{category: string, passRate: number, passed: number, total: number}>}
								 *   Per-category totals; `passRate` is 0 for a category without cases.
								 */
								export function summarizeFinanceAgentEvalByCategory({
								  cases,
								  results
								}) {
								  const outcomeById = new Map(results.map(({ id, passed }) => [id, passed]));

								  const tallies = new Map();

								  for (const category of FINANCE_AGENT_EVAL_CATEGORIES) {
								    tallies.set(category, { passed: 0, total: 0 });
								  }

								  for (const { category, id } of cases) {
								    const tally = tallies.get(category);

								    // Only categories seeded above are tracked.
								    if (tally) {
								      tally.total += 1;

								      if (outcomeById.get(id)) {
								        tally.passed += 1;
								      }
								    }
								  }

								  return FINANCE_AGENT_EVAL_CATEGORIES.map((category) => {
								    const { passed, total } = tallies.get(category);
								    const passRate = total === 0 ? 0 : passed / total;

								    return { category, passRate, passed, total };
								  });
								}


								/**
								 * Runs every eval case through `execute`, one after another, and scores the
								 * responses.
								 *
								 * A case passes when evaluateFinanceAgentResponse reports no failures. If
								 * `execute` or the evaluation throws, the case is recorded as failed with the
								 * error message (or 'unknown eval error' for non-Error values) and no
								 * `response` field.
								 *
								 * @param {object} params
								 * @param {Array<object>} [params.cases=FINANCE_AGENT_EVAL_DATASET] - Eval
								 *   cases to run; `cases.length` is reported as `total`.
								 * @param {(evalCase: object) => (object|Promise<object>)} params.execute -
								 *   Produces the agent response for one case.
								 * @returns {Promise<{categorySummaries: *, passRate: number, passed: number, results: object[], total: number}>}
								 *   Overall totals, per-case results in input order, and the per-category
								 *   summaries from summarizeFinanceAgentEvalByCategory.
								 */
								export async function runFinanceAgentEvalSuite({
								  cases = FINANCE_AGENT_EVAL_DATASET,
								  execute
								}) {
								  // Executes and scores a single case; never rejects for execute/eval errors.
								  const runCase = async (evalCase) => {
								    const startedAt = Date.now();

								    try {
								      const response = await execute(evalCase);
								      const failures = evaluateFinanceAgentResponse({ evalCase, response });

								      return {
								        durationInMs: Date.now() - startedAt,
								        failures,
								        id: evalCase.id,
								        passed: failures.length === 0,
								        response
								      };
								    } catch (error) {
								      return {
								        durationInMs: Date.now() - startedAt,
								        failures: [error instanceof Error ? error.message : 'unknown eval error'],
								        id: evalCase.id,
								        passed: false
								      };
								    }
								  };

								  const results = [];

								  // Cases are awaited in sequence, so `execute` is never called concurrently.
								  for (const evalCase of cases) {
								    results.push(await runCase(evalCase));
								  }

								  const passedCount = results.reduce(
								    (count, { passed: isPassed }) => (isPassed ? count + 1 : count),
								    0
								  );
								  const total = cases.length;
								  const categorySummaries = summarizeFinanceAgentEvalByCategory({
								    cases,
								    results
								  });

								  return {
								    categorySummaries,
								    passRate: total > 0 ? passedCount / total : 0,
								    passed: passedCount,
								    results,
								    total
								  };
								}


								/**
								 * Counts eval cases per known category.
								 *
								 * Only the four known categories are counted. Cases with a missing or
								 * unrecognized `category` are skipped — the same policy
								 * summarizeFinanceAgentEvalByCategory applies — so the result never gains
								 * extra keys or non-numeric values such as `NaN`.
								 *
								 * @param {Iterable<{category?: string}>} [cases=FINANCE_AGENT_EVAL_DATASET] -
								 *   Eval cases to count.
								 * @returns {{adversarial: number, edge_case: number, happy_path: number, multi_step: number}}
								 *   Number of cases in each known category.
								 */
								export function getFinanceAgentEvalCategoryCounts(
								  cases = FINANCE_AGENT_EVAL_DATASET
								) {
								  const counts = {
								    adversarial: 0,
								    edge_case: 0,
								    happy_path: 0,
								    multi_step: 0
								  };

								  for (const { category } of cases) {
								    // Own-key check: `counts[category] += 1` on an unknown key would store NaN,
								    // and inherited names like 'toString' would be string-concatenated.
								    if (Object.hasOwn(counts, category)) {
								      counts[category] += 1;
								    }
								  }

								  return counts;
								}