mirror of https://github.com/ghostfolio/ghostfolio
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
2.0 KiB
93 lines
2.0 KiB
import {
|
|
AiAgentMvpEvalCase,
|
|
AiAgentMvpEvalResult,
|
|
AiAgentMvpEvalVerificationExpectation
|
|
} from './mvp-eval.interfaces';
|
|
|
|
/**
 * Tells whether an expected verification entry is satisfied by the checks
 * that were actually reported.
 *
 * An actual check satisfies the expectation when its name equals
 * `expectedCheck.check` and, if the expectation carries a truthy `status`,
 * its status equals that value as well. An expectation without a status is
 * matched by name alone.
 *
 * @param actualChecks - Verification checks reported for a response.
 * @param expectedCheck - The expectation to look for.
 * @returns `true` when at least one actual check satisfies the expectation.
 */
function matchesExpectedVerification({
  actualChecks,
  expectedCheck
}: {
  actualChecks: { check: string; status: 'passed' | 'warning' | 'failed' }[];
  expectedCheck: AiAgentMvpEvalVerificationExpectation;
}) {
  return actualChecks.some(
    (actual) =>
      // The name must always agree; the status only matters when one is expected.
      actual.check === expectedCheck.check &&
      (!expectedCheck.status || actual.status === expectedCheck.status)
  );
}
|
|
|
|
/**
 * Computes the share of responses whose `citation_coverage` verification
 * check did not pass.
 *
 * Only results that carry a truthy `response` are counted. A response is
 * flagged when the first verification entry named `citation_coverage` has
 * status `failed` or `warning`; a response without such an entry is not
 * flagged.
 *
 * @param results - Evaluation results to inspect.
 * @returns Flagged responses divided by counted responses, or `0` when no
 * result has a response.
 */
export function calculateHallucinationRate({
  results
}: {
  results: AiAgentMvpEvalResult[];
}) {
  let answered = 0;
  let flagged = 0;

  results.forEach(({ response }) => {
    // Results without a response do not contribute to either count.
    if (!response) {
      return;
    }

    answered += 1;

    const coverageStatus = response.verification.find(
      (entry) => entry.check === 'citation_coverage'
    )?.status;

    if (coverageStatus === 'failed' || coverageStatus === 'warning') {
      flagged += 1;
    }
  });

  // Avoid dividing by zero when nothing was answered.
  return answered === 0 ? 0 : flagged / answered;
}
|
|
|
|
/**
 * Computes the fraction of expected verification checks that the matching
 * results actually satisfied.
 *
 * Results are indexed by `id`; when several results share an id, the last
 * one is used. Every entry in a case's `expected.verificationChecks` counts
 * as one expectation and is compared, via `matchesExpectedVerification`,
 * against the `response.verification` list of the result with the same id.
 * A case whose result or response is missing is compared against an empty
 * list, so its expectations count as unmatched.
 *
 * @param cases - Evaluation cases that declare the expected checks.
 * @param results - Evaluation results, paired with cases by `id`.
 * @returns Matched expectations divided by all expectations, or `1` when no
 * case declares any expectation.
 */
export function calculateVerificationAccuracy({
  cases,
  results
}: {
  cases: AiAgentMvpEvalCase[];
  results: AiAgentMvpEvalResult[];
}) {
  const resultLookup = new Map(results.map((result) => [result.id, result]));

  let expectationCount = 0;
  let satisfiedCount = 0;

  for (const evalCase of cases) {
    const expectations = evalCase.expected.verificationChecks ?? [];

    // Cases without expectations contribute nothing to the score.
    if (!expectations.length) {
      continue;
    }

    const reportedChecks =
      resultLookup.get(evalCase.id)?.response?.verification ?? [];

    for (const expectedCheck of expectations) {
      expectationCount += 1;

      const isSatisfied = matchesExpectedVerification({
        actualChecks: reportedChecks,
        expectedCheck
      });

      if (!isSatisfied) {
        continue;
      }

      satisfiedCount += 1;
    }
  }

  // With nothing to verify, accuracy is reported as perfect.
  if (expectationCount === 0) {
    return 1;
  }

  return satisfiedCount / expectationCount;
}
|
|
|