You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

111 lines
3.6 KiB

import type { K1ExtractedField } from '@ghostfolio/common/interfaces';
import { Injectable } from '@nestjs/common';
/**
 * K-1 confidence scoring service.
 *
 * Translates the numeric confidence carried by each extracted field into a
 * three-level display value (HIGH / MEDIUM / LOW) and lowers the confidence
 * of fields that fail simple cross-field consistency checks
 * (see research.md, Decision 5).
 */
@Injectable()
export class K1ConfidenceService {
  /** Smallest confidence that is displayed as HIGH. */
  private static readonly HIGH_THRESHOLD = 0.85;

  /** Smallest confidence that is displayed as MEDIUM. */
  private static readonly MEDIUM_THRESHOLD = 0.6;

  /** Confidence deducted from Box 6b when it exceeds Box 6a. */
  private static readonly DIVIDEND_PENALTY = 0.2;

  /** Confidence deducted from Box 4b when it does not match Box 4 + Box 4a. */
  private static readonly GUARANTEED_TOTAL_PENALTY = 0.15;

  /**
   * Attach a confidence level to every field and apply cross-field checks.
   *
   * The input array and its elements are left untouched: each field is
   * shallow-copied before any confidence adjustment is made.
   *
   * @param fields Extracted fields, each already carrying a numeric confidence.
   * @param method Extraction method that produced the fields. It is currently
   *   not consulted; the confidence present on each field is used as-is.
   *   NOTE(review): the design notes describe tier-specific scoring (base 0.90
   *   for pdf-parse, provider-native score for Azure/Tesseract) — presumably
   *   the extractors apply that before calling this method; confirm.
   * @returns New field objects with `confidenceLevel` set and validation
   *   penalties applied.
   */
  public scoreFields(
    fields: K1ExtractedField[],
    method: 'pdf-parse' | 'azure' | 'tesseract'
  ): K1ExtractedField[] {
    const scored = fields.map((field) => ({
      ...field,
      confidenceLevel: this.getConfidenceLevel(field.confidence)
    }));

    return this.applyCrossFieldValidation(scored);
  }

  /**
   * Map a numeric confidence to the three-level display value.
   *
   * - HIGH (>= 0.85): green, no user attention needed
   * - MEDIUM (0.60–0.84): yellow, optional review
   * - LOW (< 0.60): red, requires manual review
   *
   * A non-numeric confidence (NaN) fails both comparisons and is reported
   * as LOW.
   *
   * @param confidence Numeric confidence score.
   * @returns The display level for that score.
   */
  public getConfidenceLevel(
    confidence: number
  ): 'HIGH' | 'MEDIUM' | 'LOW' {
    if (confidence >= K1ConfidenceService.HIGH_THRESHOLD) {
      return 'HIGH';
    }

    if (confidence >= K1ConfidenceService.MEDIUM_THRESHOLD) {
      return 'MEDIUM';
    }

    return 'LOW';
  }

  /**
   * Calculate the overall extraction confidence as the mean of all field
   * confidences, rounded to two decimal places.
   *
   * Fields whose confidence is not a finite number contribute 0 to the mean
   * (matching their LOW display level) instead of turning the result into NaN.
   *
   * @param fields Fields to average over.
   * @returns The rounded mean, or 0 when `fields` is empty.
   */
  public calculateOverallConfidence(fields: K1ExtractedField[]): number {
    if (fields.length === 0) {
      return 0;
    }

    const total = fields.reduce(
      (acc, { confidence }) =>
        acc + (Number.isFinite(confidence) ? confidence : 0),
      0
    );

    return Math.round((total / fields.length) * 100) / 100;
  }

  /**
   * Apply cross-field validation heuristics and reduce the confidence of
   * fields that fail them:
   *
   * - Box 6b (qualified dividends) must not exceed Box 6a (ordinary dividends).
   * - Box 4b must approximately equal Box 4 + Box 4a when all three are present.
   *
   * The passed field objects are modified in place, so callers must hand in
   * copies they own. When several fields share a box number, only the last
   * one is checked.
   *
   * @param fields Fields to validate (mutated).
   * @returns The same array that was passed in.
   */
  private applyCrossFieldValidation(
    fields: K1ExtractedField[]
  ): K1ExtractedField[] {
    const byBox = new Map(fields.map((f) => [f.boxNumber, f]));

    // Rule: Box 6b <= Box 6a. A larger 6b points at an extraction error, and
    // only 6b is penalised.
    const box6a = byBox.get('6a');
    const box6b = byBox.get('6b');

    if (
      box6a?.numericValue != null &&
      box6b?.numericValue != null &&
      box6b.numericValue > box6a.numericValue
    ) {
      this.penalize(box6b, K1ConfidenceService.DIVIDEND_PENALTY);
    }

    // Rule: Box 4b (total guaranteed payments) should approximately equal
    // Box 4 (services) + Box 4a (capital) when all three are present.
    // NOTE(review): recent IRS Schedule K-1 (Form 1065) revisions label these
    // boxes 4a / 4b / 4c — confirm the keys match the extractor's numbering.
    const box4 = byBox.get('4');
    const box4a = byBox.get('4a');
    const box4b = byBox.get('4b');

    if (
      box4?.numericValue != null &&
      box4a?.numericValue != null &&
      box4b?.numericValue != null
    ) {
      const expectedTotal = box4.numericValue + box4a.numericValue;
      const diff = Math.abs(box4b.numericValue - expectedTotal);

      // Tolerance is 1% of the expected total plus an absolute allowance of 1,
      // so small totals are not flagged for ordinary rounding.
      const tolerance = Math.abs(expectedTotal * 0.01) + 1;

      if (diff > tolerance) {
        this.penalize(box4b, K1ConfidenceService.GUARANTEED_TOTAL_PENALTY);
      }
    }

    return fields;
  }

  /**
   * Lower a field's confidence by `penalty` (never below 0) and refresh its
   * confidence level accordingly. Mutates `field`.
   */
  private penalize(field: K1ExtractedField, penalty: number): void {
    field.confidence = Math.max(field.confidence - penalty, 0);
    field.confidenceLevel = this.getConfidenceLevel(field.confidence);
  }

  /**
   * Auto-set `isReviewed` for high-confidence fields (Decision 12).
   *
   * HIGH fields are auto-accepted (pre-checked). MEDIUM/LOW fields stay
   * unreviewed unless they were already marked as reviewed, so a review the
   * user has already given is never discarded by re-running this method.
   *
   * @param fields Fields whose `confidenceLevel` has already been set.
   * @returns New field objects with `isReviewed` populated; the input is not
   *   modified.
   */
  public applyAutoReview(fields: K1ExtractedField[]): K1ExtractedField[] {
    return fields.map((field) => ({
      ...field,
      isReviewed: field.isReviewed === true || field.confidenceLevel === 'HIGH'
    }));
  }
}