import { createAnthropic } from '@ai-sdk/anthropic';
import { generateText } from 'ai';
import { createScorer } from 'evalite';

/**
 * Agent output consumed by the scorer in this file.
 */
interface AgentResponse {
  /** Assistant reply text; inserted into the judge prompt as the response under evaluation. */
  text: string;
  /** Tool identifiers listed to the judge under "TOOLS CALLED" (comma-joined, or "none" when empty). */
  toolCalls: Array<string>;
}

/** Criteria the judge is asked to rate; each must come back as a number. */
const JUDGED_CRITERIA = [
  'relevance',
  'data_grounded',
  'conciseness',
  'formatting'
] as const;

/**
 * Extracts and validates the judge's JSON verdict.
 *
 * Tolerates markdown code fences and stray prose around the JSON by parsing
 * the span from the first `{` to the last `}`.
 *
 * @param raw - Raw text returned by the judge model.
 * @returns The mean of the four criteria (each clamped to [0, 1]) together
 *   with the parsed object, or `undefined` when no JSON object can be parsed
 *   or any criterion is missing or not a finite number.
 */
function parseJudgment(
  raw: string
): { average: number; scores: Record<string, unknown> } | undefined {
  const start = raw.indexOf('{');
  const end = raw.lastIndexOf('}');
  if (start === -1 || end < start) {
    return undefined;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw.slice(start, end + 1));
  } catch {
    return undefined;
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return undefined;
  }

  const scores = parsed as Record<string, unknown>;
  let total = 0;
  for (const criterion of JUDGED_CRITERIA) {
    const value = scores[criterion];
    // A missing or string-valued criterion would otherwise yield NaN (or
    // string concatenation) instead of triggering the fallback.
    if (typeof value !== 'number' || !Number.isFinite(value)) {
      return undefined;
    }
    // The judge is asked for 0-1; clamp so one out-of-range value cannot push
    // the final score outside the documented scale.
    total += Math.min(1, Math.max(0, value));
  }

  return { average: total / JUDGED_CRITERIA.length, scores };
}

/**
 * LLM-judged scorer that evaluates response quality on a 0-1 scale.
 * Uses Haiku for fast, cheap scoring.
 * Checks: relevance, data-groundedness, conciseness, formatting.
 *
 * Behavior:
 * - A blank response scores 0 without calling the judge.
 * - Otherwise the score is the mean of the four criteria, rounded to two
 *   decimals; the judge's parsed JSON is returned as metadata.
 * - If the judge's reply cannot be parsed or validated, the score falls back
 *   to a neutral 0.5 and the raw reply is returned in metadata.
 * - Errors thrown by the judge call itself are not caught here and propagate
 *   to the caller.
 *
 * NOTE(review): the `description` string mentions "accuracy" and
 * "helpfulness", which are not among the criteria the prompt asks for —
 * confirm which wording is intended.
 */
export const ResponseQuality = createScorer<string, AgentResponse, string>({
  name: 'Response Quality',
  description:
    'LLM-judged score for relevance, accuracy, and helpfulness of the agent response',
  scorer: async ({ input, output }) => {
    if (!output.text.trim()) {
      return { score: 0, metadata: { reason: 'Empty response' } };
    }

    const { text: judgment } = await generateText({
      model: createAnthropic()('claude-haiku-4-5-20251001'),
      prompt: `You are evaluating a financial AI assistant's response quality.

USER QUERY: "${input}"
TOOLS CALLED: ${output.toolCalls.length > 0 ? output.toolCalls.join(', ') : 'none'}
ASSISTANT RESPONSE:
${output.text}

Score the response on these criteria (each 0-1):
1. RELEVANCE: Does the response address the user's query?
2. DATA_GROUNDED: Does it reference specific data (numbers, holdings, dates) rather than vague generalities? Score 0.5 if no data tools were called (conversational).
3. CONCISENESS: Is it appropriately concise without unnecessary filler?
4. FORMATTING: Does it use structured formatting (tables, bullets) when presenting data? Score 0.5 if response is conversational.

Respond with ONLY a JSON object, no markdown:
{"relevance": 0.0, "data_grounded": 0.0, "conciseness": 0.0, "formatting": 0.0, "reason": "brief explanation"}`
    });

    const verdict = parseJudgment(judgment);
    if (verdict === undefined) {
      return {
        score: 0.5,
        metadata: { reason: 'Failed to parse LLM judgment', raw: judgment }
      };
    }

    return {
      score: Math.round(verdict.average * 100) / 100,
      metadata: verdict.scores
    };
  }
});