You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

93 lines
2.0 KiB

import {
AiAgentMvpEvalCase,
AiAgentMvpEvalResult,
AiAgentMvpEvalVerificationExpectation
} from './mvp-eval.interfaces';
/**
 * Tells whether any of the actual verification checks satisfies an expectation.
 *
 * An actual check satisfies the expectation when its `check` name is strictly
 * equal to `expectedCheck.check` and either the expectation carries no truthy
 * `status` (any status is accepted) or both statuses are strictly equal.
 *
 * @param actualChecks - Verification checks to search through.
 * @param expectedCheck - The expectation to look for.
 * @returns `true` as soon as one satisfying check is found, otherwise `false`
 *   (including when `actualChecks` is empty).
 */
function matchesExpectedVerification({
  actualChecks,
  expectedCheck
}: {
  actualChecks: { check: string; status: 'passed' | 'warning' | 'failed' }[];
  expectedCheck: AiAgentMvpEvalVerificationExpectation;
}) {
  for (const candidate of actualChecks) {
    // Only checks carrying the expected name are eligible.
    if (candidate.check !== expectedCheck.check) {
      continue;
    }

    // A missing expected status acts as a wildcard for the status.
    const statusIsAcceptable =
      !expectedCheck.status || candidate.status === expectedCheck.status;

    if (statusIsAcceptable) {
      return true;
    }
  }

  return false;
}
/**
 * Computes the share of responses whose `citation_coverage` verification check
 * did not pass cleanly.
 *
 * Results without a (truthy) `response` are left out of the calculation
 * entirely. A response counts as flagged when the first verification entry
 * named `citation_coverage` has status `failed` or `warning`; a response with
 * no such entry is not flagged.
 *
 * @param results - Evaluation results to inspect.
 * @returns Flagged responses divided by responses present, or `0` when no
 *   result carries a response.
 */
export function calculateHallucinationRate({
  results
}: {
  results: AiAgentMvpEvalResult[];
}) {
  let respondedCount = 0;
  let flaggedCount = 0;

  for (const { response } of results) {
    // Results that produced no response do not affect the rate.
    if (!response) {
      continue;
    }

    respondedCount += 1;

    const coverageStatus = response.verification.find(
      (entry) => entry.check === 'citation_coverage'
    )?.status;

    if (coverageStatus === 'failed' || coverageStatus === 'warning') {
      flaggedCount += 1;
    }
  }

  // Guard against dividing by zero when nothing responded.
  return respondedCount === 0 ? 0 : flaggedCount / respondedCount;
}
/**
 * Computes the fraction of expected verification checks that were satisfied by
 * the corresponding results.
 *
 * Results are looked up by `id`; if several results share an id, the last one
 * wins. For each case, every entry in `expected.verificationChecks` counts
 * towards the total and counts as matched when `matchesExpectedVerification`
 * accepts it against the result's `response.verification` (an empty list is
 * used when the result, its response, or the verification list is missing).
 * Cases without expected checks contribute nothing.
 *
 * @param cases - Evaluation cases carrying the expectations.
 * @param results - Evaluation results to compare against.
 * @returns Matched expectations divided by total expectations, or `1` when
 *   there are no expectations at all.
 */
export function calculateVerificationAccuracy({
  cases,
  results
}: {
  cases: AiAgentMvpEvalCase[];
  results: AiAgentMvpEvalResult[];
}) {
  // Index results by id; later duplicates overwrite earlier ones.
  const resultLookup = new Map<
    AiAgentMvpEvalResult['id'],
    AiAgentMvpEvalResult
  >();

  for (const result of results) {
    resultLookup.set(result.id, result);
  }

  let satisfiedCount = 0;
  let expectationCount = 0;

  for (const evalCase of cases) {
    const expectations = evalCase.expected.verificationChecks ?? [];

    if (expectations.length === 0) {
      continue;
    }

    // A missing result or response is treated as having produced no checks.
    const actualChecks =
      resultLookup.get(evalCase.id)?.response?.verification ?? [];

    expectationCount += expectations.length;
    satisfiedCount += expectations.filter((expectedCheck) =>
      matchesExpectedVerification({ actualChecks, expectedCheck })
    ).length;
  }

  // With nothing to verify, accuracy is reported as perfect.
  if (expectationCount === 0) {
    return 1;
  }

  return satisfiedCount / expectationCount;
}