mirror of https://github.com/ghostfolio/ghostfolio
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
2.3 KiB
68 lines
2.3 KiB
import { createAnthropic } from '@ai-sdk/anthropic';
|
|
import { generateText } from 'ai';
|
|
import { createScorer } from 'evalite';
|
|
|
|
interface AgentResponse {
|
|
toolCalls: string[];
|
|
text: string;
|
|
}
|
|
|
|
/**
|
|
* LLM-judged scorer that evaluates response quality on a 0-1 scale.
|
|
* Uses Haiku for fast, cheap scoring.
|
|
* Checks: relevance, data-groundedness, conciseness, formatting.
|
|
*/
|
|
export const ResponseQuality = createScorer<string, AgentResponse, string>({
|
|
name: 'Response Quality',
|
|
description:
|
|
'LLM-judged score for relevance, accuracy, and helpfulness of the agent response',
|
|
scorer: async ({ input, output }) => {
|
|
if (!output.text.trim()) {
|
|
return { score: 0, metadata: { reason: 'Empty response' } };
|
|
}
|
|
|
|
const { text: judgment } = await generateText({
|
|
model: createAnthropic()('claude-haiku-4-5-20251001'),
|
|
prompt: `You are evaluating a financial AI assistant's response quality.
|
|
|
|
USER QUERY: "${input}"
|
|
TOOLS CALLED: ${output.toolCalls.length > 0 ? output.toolCalls.join(', ') : 'none'}
|
|
ASSISTANT RESPONSE:
|
|
${output.text}
|
|
|
|
Score the response on these criteria (each 0-1):
|
|
1. RELEVANCE: Does the response address the user's query?
|
|
2. DATA_GROUNDED: Does it reference specific data (numbers, holdings, dates) rather than vague generalities? Score 0.5 if no data tools were called (conversational).
|
|
3. CONCISENESS: Is it appropriately concise without unnecessary filler?
|
|
4. FORMATTING: Does it use structured formatting (tables, bullets) when presenting data? Score 0.5 if response is conversational.
|
|
|
|
Respond with ONLY a JSON object, no markdown:
|
|
{"relevance": 0.0, "data_grounded": 0.0, "conciseness": 0.0, "formatting": 0.0, "reason": "brief explanation"}`
|
|
});
|
|
|
|
try {
|
|
// Strip markdown code fences if present (e.g. ```json ... ```)
|
|
const cleaned = judgment
|
|
.replace(/^```(?:json)?\s*/i, '')
|
|
.replace(/\s*```\s*$/, '')
|
|
.trim();
|
|
const scores = JSON.parse(cleaned);
|
|
const avg =
|
|
(scores.relevance +
|
|
scores.data_grounded +
|
|
scores.conciseness +
|
|
scores.formatting) /
|
|
4;
|
|
|
|
return {
|
|
score: Math.round(avg * 100) / 100,
|
|
metadata: scores
|
|
};
|
|
} catch {
|
|
return {
|
|
score: 0.5,
|
|
metadata: { reason: 'Failed to parse LLM judgment', raw: judgment }
|
|
};
|
|
}
|
|
}
|
|
});
|
|
|