You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

68 lines
2.3 KiB

import { createAnthropic } from '@ai-sdk/anthropic';
import { generateText } from 'ai';
import { createScorer } from 'evalite';
/**
 * Shape of the agent output evaluated by the scorer.
 */
interface AgentResponse {
/** Names of the tools the agent invoked while producing the response (may be empty). */
toolCalls: string[];
/** The agent's final text reply shown to the user. */
text: string;
}
/**
 * LLM-judged scorer that evaluates response quality on a 0-1 scale.
 * Uses Haiku for fast, cheap scoring.
 * Checks: relevance, data-groundedness, conciseness, formatting.
 *
 * Scoring behavior:
 * - Empty response text short-circuits to 0 (no LLM call is made).
 * - The judge's JSON is validated: each criterion must be a finite number
 *   and is clamped to [0, 1] before averaging, so a malformed or
 *   out-of-range judgment can never produce a NaN or out-of-scale score.
 * - Any parse/validation failure falls back to a neutral 0.5 with the raw
 *   judgment preserved in metadata for debugging.
 */
export const ResponseQuality = createScorer<string, AgentResponse, string>({
  name: 'Response Quality',
  description:
    'LLM-judged score for relevance, accuracy, and helpfulness of the agent response',
  scorer: async ({ input, output }) => {
    // An empty reply can never be helpful — skip the (paid) judge call entirely.
    if (!output.text.trim()) {
      return { score: 0, metadata: { reason: 'Empty response' } };
    }
    const { text: judgment } = await generateText({
      model: createAnthropic()('claude-haiku-4-5-20251001'),
      prompt: `You are evaluating a financial AI assistant's response quality.
USER QUERY: "${input}"
TOOLS CALLED: ${output.toolCalls.length > 0 ? output.toolCalls.join(', ') : 'none'}
ASSISTANT RESPONSE:
${output.text}
Score the response on these criteria (each 0-1):
1. RELEVANCE: Does the response address the user's query?
2. DATA_GROUNDED: Does it reference specific data (numbers, holdings, dates) rather than vague generalities? Score 0.5 if no data tools were called (conversational).
3. CONCISENESS: Is it appropriately concise without unnecessary filler?
4. FORMATTING: Does it use structured formatting (tables, bullets) when presenting data? Score 0.5 if response is conversational.
Respond with ONLY a JSON object, no markdown:
{"relevance": 0.0, "data_grounded": 0.0, "conciseness": 0.0, "formatting": 0.0, "reason": "brief explanation"}`
    });
    try {
      // Strip markdown code fences if present (e.g. ```json ... ```)
      const cleaned = judgment
        .replace(/^```(?:json)?\s*/i, '')
        .replace(/\s*```\s*$/, '')
        .trim();
      // Treat the parse result as unknown: the judge is an LLM and may emit
      // anything. Previously missing/non-numeric fields silently produced a
      // NaN score; now they throw into the 0.5 fallback below.
      const parsed: unknown = JSON.parse(cleaned);
      if (typeof parsed !== 'object' || parsed === null) {
        throw new Error('Judgment is not a JSON object');
      }
      const scores = parsed as Record<string, unknown>;
      const criteria = [
        'relevance',
        'data_grounded',
        'conciseness',
        'formatting'
      ] as const;
      const values = criteria.map((key) => {
        const v = scores[key];
        if (typeof v !== 'number' || !Number.isFinite(v)) {
          throw new Error(`Missing or non-numeric criterion: ${key}`);
        }
        // Clamp so an out-of-range judge value cannot push the average
        // outside the documented 0-1 scale.
        return Math.min(1, Math.max(0, v));
      });
      const avg = values.reduce((sum, v) => sum + v, 0) / criteria.length;
      return {
        score: Math.round(avg * 100) / 100,
        metadata: scores
      };
    } catch {
      // Neutral fallback; keep the raw judgment so failures are debuggable.
      return {
        score: 0.5,
        metadata: { reason: 'Failed to parse LLM judgment', raw: judgment }
      };
    }
  }
});