mirror of https://github.com/ghostfolio/ghostfolio
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
106 lines
2.4 KiB
106 lines
2.4 KiB
/**
 * Category label carried by every eval case (`FinanceEvalCase.category`) and
 * by each per-category summary.
 *
 * Exactly four string literals are allowed.
 */
export type FinanceEvalCategory =
  | 'happy_path'
  | 'edge_case'
  | 'adversarial'
  | 'multi_step';

/**
 * One tool call that an eval case expects to see, used in
 * `FinanceEvalCase.expected.requiredToolCalls`.
 */
export interface FinanceEvalExpectedToolCall {
  /**
   * Expected outcome of the call. Optional — presumably the status is not
   * checked when omitted (confirm against the evaluator implementation).
   */
  status?: 'success' | 'failed';
  /** Name of the tool. */
  tool: string;
}

/**
 * One verification check that an eval case expects to see, used in
 * `FinanceEvalCase.expected.verificationChecks`.
 */
export interface FinanceEvalExpectedVerification {
  /** Identifier of the verification check. */
  check: string;
  /**
   * Expected outcome of the check. Optional — presumably the status is not
   * compared when omitted (confirm against the evaluator implementation).
   */
  status?: 'passed' | 'warning' | 'failed';
}

/**
 * A single evaluation case: the request input, the expectations placed on
 * the agent's response, and free-form setup data.
 */
export interface FinanceEvalCase {
  /** Category this case belongs to. */
  category: FinanceEvalCategory;
  /**
   * Expectations on the response. Every member is optional.
   * NOTE(review): the exact matching rules (case sensitivity, substring vs.
   * exact match, inclusive bounds) live in the evaluator implementation and
   * are not visible from these declarations.
   */
  expected: {
    /** Strings the answer is expected to include. */
    answerIncludes?: string[];
    /** Lower bound for the confidence score. */
    confidenceScoreMin?: number;
    /** Tool names that are expected not to be used. */
    forbiddenTools?: string[];
    /** Lower bound for the number of memory turns. */
    memoryTurnsAtLeast?: number;
    /** Lower bound for the number of citations. */
    minCitations?: number;
    /** Tool calls (name plus optional status) that are expected. */
    requiredToolCalls?: FinanceEvalExpectedToolCall[];
    /** Tool names that are expected to be used. */
    requiredTools?: string[];
    /** Verification checks (name plus optional status) that are expected. */
    verificationChecks?: FinanceEvalExpectedVerification[];
  };
  /** Identifier of the case. */
  id: string;
  /** Request input for the case. */
  input: {
    languageCode?: string;
    query: string;
    sessionId: string;
    symbols?: string[];
    userCurrency?: string;
    userId: string;
  };
  /** Intent label of the case. */
  intent: string;
  /** Free-form setup data; its schema is not declared here. */
  setup: Record<string, unknown>;
}

/**
 * Shape of an agent response as consumed by the eval helpers. Only `answer`
 * is required; every other section is optional.
 */
export interface FinanceEvalResponse {
  /** Answer text. */
  answer: string;
  /** Citations; element shape is not declared here. */
  citations?: unknown[];
  /** Confidence information with an optional numeric score. */
  confidence?: { score?: number };
  /** Memory information with an optional turn count. */
  memory?: { turns?: number };
  /** Tool calls with their status and tool name. */
  toolCalls?: { status: 'success' | 'failed'; tool: string }[];
  /** Verification checks with their name and status. */
  verification?: {
    check: string;
    status: 'passed' | 'warning' | 'failed';
  }[];
}

/** Outcome of a single eval case. */
export interface FinanceEvalResult {
  /** Duration in milliseconds. */
  durationInMs: number;
  /** Failure messages collected for the case. */
  failures: string[];
  /** Identifier — presumably the `id` of the evaluated case (confirm). */
  id: string;
  /** Whether the case passed. */
  passed: boolean;
  /**
   * The response, when one is available. Optional — presumably absent when
   * execution did not produce a response (confirm against the runner).
   */
  response?: FinanceEvalResponse;
}

/** Pass statistics for one eval category. */
export interface FinanceEvalCategorySummary {
  /** Category being summarized. */
  category: FinanceEvalCategory;
  /**
   * Pass rate for the category.
   * NOTE(review): scale (0–1 vs. 0–100) is not visible here — confirm.
   */
  passRate: number;
  /** Number of passed cases. */
  passed: number;
  /** Total number of cases. */
  total: number;
}

/** Aggregate outcome of an eval suite run. */
export interface FinanceEvalSuiteResult {
  /** Per-category pass statistics. */
  categorySummaries: FinanceEvalCategorySummary[];
  /**
   * Overall pass rate.
   * NOTE(review): scale (0–1 vs. 0–100) is not visible here — confirm.
   */
  passRate: number;
  /** Number of passed cases. */
  passed: number;
  /** Individual case results. */
  results: FinanceEvalResult[];
  /** Total number of cases. */
  total: number;
}

/** Eval cases exported by the module (contents defined by the implementation). */
export const FINANCE_AGENT_EVAL_DATASET: FinanceEvalCase[];

/** Eval categories exported by the module (contents defined by the implementation). */
export const FINANCE_AGENT_EVAL_CATEGORIES: FinanceEvalCategory[];

/**
 * Evaluates one response against one eval case.
 *
 * @param evalCase - The case supplying the expectations.
 * @param response - The response to evaluate.
 * @returns A list of strings — presumably one failure message per unmet
 *   expectation, empty when everything passes (confirm against the
 *   implementation).
 */
export function evaluateFinanceAgentResponse({
  evalCase,
  response
}: {
  evalCase: FinanceEvalCase;
  response: FinanceEvalResponse;
}): string[];

/**
 * Builds per-category summaries from eval cases and their results.
 *
 * @param cases - The eval cases.
 * @param results - The results; presumably matched to cases by `id`
 *   (confirm against the implementation).
 * @returns One summary entry per category.
 */
export function summarizeFinanceAgentEvalByCategory({
  cases,
  results
}: {
  cases: FinanceEvalCase[];
  results: FinanceEvalResult[];
}): FinanceEvalCategorySummary[];

/**
 * Runs an eval suite and resolves with the aggregate result.
 *
 * @param cases - Optional list of cases; presumably falls back to
 *   `FINANCE_AGENT_EVAL_DATASET` when omitted (confirm against the
 *   implementation).
 * @param execute - Callback that takes an eval case and resolves with the
 *   response for it.
 * @returns A promise of the suite result.
 */
export function runFinanceAgentEvalSuite({
  cases,
  execute
}: {
  cases?: FinanceEvalCase[];
  execute: (evalCase: FinanceEvalCase) => Promise<FinanceEvalResponse>;
}): Promise<FinanceEvalSuiteResult>;

/**
 * Returns a number for every eval category — presumably the count of cases
 * in that category (confirm against the implementation).
 *
 * @param cases - Optional list of cases; presumably falls back to
 *   `FINANCE_AGENT_EVAL_DATASET` when omitted (confirm).
 * @returns A record keyed by every `FinanceEvalCategory`.
 */
export function getFinanceAgentEvalCategoryCounts(
  cases?: FinanceEvalCase[]
): Record<FinanceEvalCategory, number>;