mirror of https://github.com/ghostfolio/ghostfolio
16 changed files with 1875 additions and 24 deletions
@ -0,0 +1,10 @@ |
|||
import { IsInt, IsString, Min } from 'class-validator'; |
|||
|
|||
/**
 * Validated payload for a K-1 upload request.
 *
 * Validation is declared through class-validator decorators; nothing else
 * is enforced by this class itself.
 */
export class UploadK1Dto {
  /** Partnership identifier; validated only as being a string. */
  @IsString()
  partnershipId: string;

  /** Tax year; validated as an integer that is at least 1900 (no upper bound is declared). */
  @IsInt()
  @Min(1900)
  taxYear: number;
}
|||
@ -0,0 +1,302 @@ |
|||
import { ConfigurationService } from '@ghostfolio/api/services/configuration/configuration.service'; |
|||
import type { K1ExtractionResult, K1ExtractedField } from '@ghostfolio/common/interfaces'; |
|||
|
|||
import { Injectable, Logger } from '@nestjs/common'; |
|||
|
|||
import type { K1Extractor } from './k1-extractor.interface'; |
|||
|
|||
/** |
|||
* Tier 2 extractor using Azure AI Document Intelligence (Layout model). |
|||
* Primary cloud OCR for scanned K-1 PDFs. |
|||
* Requires AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and KEY to be configured. |
|||
*/ |
|||
@Injectable() |
|||
export class AzureExtractor implements K1Extractor { |
|||
private readonly logger = new Logger(AzureExtractor.name); |
|||
|
|||
/**
 * @param configurationService Source of the `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`
 * and `AZURE_DOCUMENT_INTELLIGENCE_KEY` settings read by this extractor.
 */
public constructor(private readonly configurationService: ConfigurationService) {}
|||
|
|||
/**
 * Reports whether this extractor can be used.
 *
 * @returns `true` only when both the endpoint and the key settings are truthy.
 */
public isAvailable(): boolean {
  const hasEndpoint = Boolean(
    this.configurationService.get('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
  );
  const hasKey = Boolean(
    this.configurationService.get('AZURE_DOCUMENT_INTELLIGENCE_KEY')
  );

  return hasEndpoint && hasKey;
}
|||
|
|||
/**
 * Analyzes a document with Azure AI Document Intelligence and converts the
 * result into K-1 fields and metadata.
 *
 * Key-value pairs are mapped first, then table rows are added for boxes that
 * were not already found, then metadata is read from the full text.
 *
 * @param buffer Raw bytes of the document to analyze.
 * @param fileName File name, used only in the log message.
 * @returns Metadata, extracted fields, the mean field confidence rounded to
 * two decimals, and the number of pages in the analysis result.
 * @throws Error when the endpoint or key setting is missing. Errors raised by
 * the Azure SDK are not caught here and propagate to the caller.
 */
public async extract(
  buffer: Buffer,
  fileName: string
): Promise<K1ExtractionResult> {
  this.logger.log(`Extracting from scanned PDF via Azure DI: ${fileName}`);

  const endpoint = this.configurationService.get(
    'AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT'
  );
  const key = this.configurationService.get('AZURE_DOCUMENT_INTELLIGENCE_KEY');

  if (!endpoint || !key) {
    throw new Error('Azure Document Intelligence credentials not configured');
  }

  // The SDK is imported lazily so it is only loaded once credentials exist
  const { AzureKeyCredential, DocumentAnalysisClient } = await import(
    '@azure/ai-form-recognizer'
  );
  const client = new DocumentAnalysisClient(
    endpoint,
    new AzureKeyCredential(key)
  );

  // NOTE(review): confirm that the 'prebuilt-layout' model actually populates
  // `keyValuePairs` for the SDK / API version in use; if it does not, only the
  // table pass below contributes fields.
  const poller = await client.beginAnalyzeDocument('prebuilt-layout', buffer);
  const result = await poller.pollUntilDone();

  const fields: K1ExtractedField[] = [];
  const pageCount = result.pages?.length ?? 0;

  // Pass 1: key-value pairs whose key maps to a K-1 box
  for (const pair of result.keyValuePairs ?? []) {
    const keyText = pair.key?.content?.trim();
    const valueText = pair.value?.content?.trim();
    const rawConfidence = pair.confidence || 0;

    if (!keyText || !valueText) {
      continue;
    }

    const boxNumber = this.matchKeyToBoxNumber(keyText);
    if (!boxNumber) {
      continue;
    }

    const numericValue = this.parseNumericValue(valueText);
    const confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW' =
      rawConfidence >= 0.85
        ? 'HIGH'
        : rawConfidence >= 0.6
          ? 'MEDIUM'
          : 'LOW';

    fields.push({
      boxNumber,
      label: '', // Will be filled by field mapper
      customLabel: null,
      rawValue: valueText,
      numericValue,
      confidence: Math.round(rawConfidence * 100) / 100,
      confidenceLevel,
      isUserEdited: false,
      isReviewed: false
    });
  }

  // Pass 2: table rows (only boxes not already present are added)
  for (const table of result.tables ?? []) {
    this.extractFieldsFromTable(table, fields);
  }

  // Pass 3: metadata from the concatenated document text
  const metadata = this.extractMetadata(result.content || '');

  const confidenceSum = fields.reduce(
    (running, field) => running + field.confidence,
    0
  );
  const meanConfidence = fields.length > 0 ? confidenceSum / fields.length : 0;

  return {
    metadata,
    fields,
    unmappedItems: [],
    overallConfidence: Math.round(meanConfidence * 100) / 100,
    method: 'azure',
    pagesProcessed: pageCount
  };
}
|||
|
|||
/** Box identifiers that may appear as the leading token of a key. */
private static readonly KNOWN_BOX_NUMBERS: ReadonlySet<string> = new Set([
  '1', '2', '3', '4', '4a', '4b', '5', '6a', '6b', '6c', '7', '8',
  '9a', '9b', '9c', '10', '11', '12', '13', '14', '15', '16', '17', '18',
  '19a', '19b', '20', '21'
]);

/**
 * Captures the first word-character run of a key, after an optional
 * "Box" / "Line" prefix. Because `\w+` is greedy, the captured token always
 * ends at a word boundary.
 */
private static readonly LEADING_BOX_TOKEN = /^(?:(?:box|line)\s*)?(\w+)/i;

/** Label keywords tried, in order, when the key has no usable leading box token. */
private static readonly LABEL_PATTERNS: ReadonlyArray<{
  pattern: RegExp;
  box: string;
}> = [
  { pattern: /ordinary\s+business\s+income/i, box: '1' },
  { pattern: /net\s+rental\s+real\s+estate/i, box: '2' },
  { pattern: /other\s+net\s+rental/i, box: '3' },
  { pattern: /guaranteed\s+payments?\s+for\s+services/i, box: '4' },
  { pattern: /guaranteed\s+payments?\s+for\s+capital/i, box: '4a' },
  { pattern: /total\s+guaranteed\s+payments/i, box: '4b' },
  { pattern: /interest\s+income/i, box: '5' },
  { pattern: /ordinary\s+dividends/i, box: '6a' },
  { pattern: /qualified\s+dividends/i, box: '6b' },
  { pattern: /dividend\s+equivalents/i, box: '6c' },
  { pattern: /royalties/i, box: '7' },
  { pattern: /net\s+short[- ]term\s+capital/i, box: '8' },
  { pattern: /net\s+long[- ]term\s+capital/i, box: '9a' },
  { pattern: /collectibles.*28%/i, box: '9b' },
  { pattern: /unrecaptured\s+section\s*1250/i, box: '9c' },
  { pattern: /net\s+section\s*1231/i, box: '10' },
  { pattern: /section\s+179\s+deduction/i, box: '12' },
  { pattern: /self[- ]employment\s+earnings/i, box: '14' },
  { pattern: /foreign\s+taxes\s+paid/i, box: '21' }
];

/**
 * Maps the key text of an extracted pair or table row to a K-1 box number.
 *
 * The key is first checked for a leading box token such as "1", "6a",
 * "Box 1" or "Line 1" (case-insensitive); the token must be a complete word,
 * so "10" never matches box "1". If that fails, the key is searched for
 * known label keywords.
 *
 * @param key Key text, expected to be trimmed by the caller (matching is anchored at the start).
 * @returns The lower-case box identifier, or `null` when nothing matches.
 */
private matchKeyToBoxNumber(key: string): string | null {
  const leading = AzureExtractor.LEADING_BOX_TOKEN.exec(key);
  const token = leading?.[1]?.toLowerCase();

  if (token && AzureExtractor.KNOWN_BOX_NUMBERS.has(token)) {
    return token;
  }

  const byLabel = AzureExtractor.LABEL_PATTERNS.find(({ pattern }) =>
    pattern.test(key)
  );

  return byLabel ? byLabel.box : null;
}
|||
|
|||
/**
 * Appends fields found in one analyzed table to `fields` (mutated in place).
 *
 * Cells are grouped by `rowIndex`; within a row the cell with the lowest
 * `columnIndex` is treated as the key and the one with the highest as the
 * value. Rows with fewer than two cells, with empty key/value text, or whose
 * box already exists in `fields` are skipped.
 *
 * @param table Table object from the analysis result; only `cells` is read.
 * @param fields Accumulator that receives the new fields.
 */
private extractFieldsFromTable(table: any, fields: K1ExtractedField[]) {
  if (!table.cells) {
    return;
  }

  const seenBoxes = new Set(fields.map(({ boxNumber }) => boxNumber));

  // Bucket cells per row, keeping rows in first-seen order
  const cellsByRow = new Map<number, any[]>();
  for (const cell of table.cells) {
    const bucket = cellsByRow.get(cell.rowIndex);
    if (bucket) {
      bucket.push(cell);
    } else {
      cellsByRow.set(cell.rowIndex, [cell]);
    }
  }

  for (const rowCells of cellsByRow.values()) {
    if (rowCells.length < 2) {
      continue;
    }

    rowCells.sort((left: any, right: any) => left.columnIndex - right.columnIndex);

    const keyText = rowCells[0]?.content?.trim();
    const valueText = rowCells[rowCells.length - 1]?.content?.trim();
    if (!keyText || !valueText) {
      continue;
    }

    const boxNumber = this.matchKeyToBoxNumber(keyText);
    if (!boxNumber || seenBoxes.has(boxNumber)) {
      continue;
    }

    const numericValue = this.parseNumericValue(valueText);
    fields.push({
      boxNumber,
      label: '',
      customLabel: null,
      rawValue: valueText,
      numericValue,
      confidence: 0.7, // Fixed score: table extraction is less reliable
      confidenceLevel: 'MEDIUM',
      isUserEdited: false,
      isReviewed: false
    });
    seenBoxes.add(boxNumber);
  }
}
|||
|
|||
/**
 * Reads partnership / partner metadata from the full document text.
 *
 * The possessive in "partnership's" / "partner's" may be written with either
 * a straight (') or a typographic (\u2019) apostrophe.
 *
 * NOTE(review): `isAmended` and `isFinal` are plain text searches. If the
 * text includes the printed form captions (e.g. an "Amended K-1" checkbox
 * label), they will be true regardless of the checkbox state — confirm.
 *
 * @param text Full text content of the analyzed document.
 * @returns Metadata with `null` for every value that could not be found.
 */
private extractMetadata(text: string): K1ExtractionResult['metadata'] {
  return {
    partnershipName: this.extractPattern(
      text,
      /partnership['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
    ),
    partnershipEin: this.extractPattern(
      text,
      /partnership['\u2019]s?\s+(?:employer\s+identification\s+number|EIN)[^:\n]*[:\s]+(\d{2}[- ]\d{7})/i
    ),
    partnerName: this.extractPattern(
      text,
      /partner['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
    ),
    partnerEin: this.extractPattern(
      text,
      /partner['\u2019]s?\s+(?:identifying|social\s+security)\s+number[^:\n]*[:\s]+(\d{2}[- ]\d{7}|\d{3}[- ]\d{2}[- ]\d{4})/i
    ),
    taxYear: this.extractTaxYear(text),
    isAmended: /amended/i.test(text),
    isFinal: /final\s+k-?1/i.test(text) || /final\s+return/i.test(text)
  };
}
|||
|
|||
/**
 * Returns the trimmed first capture group of `pattern` in `text`.
 *
 * @param text Text to search.
 * @param pattern Expression whose first capture group holds the wanted value.
 * @returns The trimmed capture, or `null` when the pattern does not match.
 */
private extractPattern(text: string, pattern: RegExp): string | null {
  const found = text.match(pattern);
  if (!found) {
    return null;
  }

  return found[1].trim();
}
|||
|
|||
/**
 * Finds a four-digit year following "calendar year" or "tax year".
 *
 * Each pattern contributes only its first match; a year outside 1900–2100
 * is ignored and the next pattern is tried.
 *
 * @param text Text to search.
 * @returns The first in-range year, or `null` when none is found.
 */
private extractTaxYear(text: string): number | null {
  const candidates = [
    /(?:calendar\s+year|tax\s+year)\s*(\d{4})/i,
    /for\s+(?:calendar\s+year|tax\s+year)\s*(\d{4})/i
  ];

  for (const candidate of candidates) {
    const hit = text.match(candidate);
    if (!hit) {
      continue;
    }

    const parsedYear = parseInt(hit[1], 10);
    if (parsedYear >= 1900 && parsedYear <= 2100) {
      return parsedYear;
    }
  }

  return null;
}
|||
|
|||
/**
 * Parses a dollar amount string into a number.
 *
 * Whitespace, "$" and "," are removed before the sign is detected, so the
 * negative markers are recognised with or without a leading currency symbol:
 * `($1,200)`, `$(1,200)`, `-$500` and `$-500` are all negative.
 *
 * @param raw Raw value text.
 * @returns The parsed number, or `null` for empty input or text that does not
 * start with a finite number after cleaning.
 */
private parseNumericValue(raw: string): number | null {
  if (!raw) return null;

  // Strip formatting first so a sign marker, if any, ends up at index 0
  const compact = raw.replace(/[\s$,]/g, '');
  const isNegative = compact.startsWith('(') || compact.startsWith('-');

  const magnitude = parseFloat(compact.replace(/[()]/g, '').replace(/^-/, ''));
  if (!Number.isFinite(magnitude)) return null;

  return isNegative ? -magnitude : magnitude;
}
|||
} |
|||
@ -0,0 +1,408 @@ |
|||
import type { K1ExtractionResult, K1ExtractedField, K1UnmappedItem } from '@ghostfolio/common/interfaces'; |
|||
|
|||
import { Injectable, Logger } from '@nestjs/common'; |
|||
import * as pdfParse from 'pdf-parse'; |
|||
|
|||
import type { K1Extractor } from './k1-extractor.interface'; |
|||
|
|||
/** |
|||
* Tier 1 extractor for digitally-generated K-1 PDFs. |
|||
* Uses pdf-parse to extract embedded text and regex-based box extraction. |
|||
*/ |
|||
@Injectable() |
|||
export class PdfParseExtractor implements K1Extractor { |
|||
private readonly logger = new Logger(PdfParseExtractor.name); |
|||
|
|||
/**
 * Extraction patterns per K-1 box.
 *
 * For every box the patterns are tried in the listed order and the first
 * match wins. Capture group 1 is the raw amount text (digits plus any of
 * `$ , . - ( )`). Where two patterns exist, the first one requires a box/line
 * number in front of the label and the second matches the label alone.
 */
private readonly BOX_PATTERNS: Array<{
  boxNumber: string;
  patterns: RegExp[];
}> = [
  { boxNumber: '1', patterns: [
    /(?:box\s*1|line\s*1)[^a-z0-9]*ordinary\s+business\s+income[^$\d-]*([($\d,.\-)]+)/i,
    /ordinary\s+business\s+income\s*\(loss\)[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '2', patterns: [
    /(?:box\s*2|line\s*2)[^a-z0-9]*net\s+rental\s+real\s+estate[^$\d-]*([($\d,.\-)]+)/i,
    /net\s+rental\s+real\s+estate\s+income[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '3', patterns: [
    /(?:box\s*3|line\s*3)[^a-z0-9]*other\s+net\s+rental[^$\d-]*([($\d,.\-)]+)/i,
    /other\s+net\s+rental\s+income[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '4', patterns: [
    /guaranteed\s+payments?\s+for\s+services[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '4a', patterns: [
    /guaranteed\s+payments?\s+for\s+capital[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '4b', patterns: [
    /total\s+guaranteed\s+payments?[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '5', patterns: [
    /(?:box\s*5|line\s*5)[^a-z0-9]*interest\s+income[^$\d-]*([($\d,.\-)]+)/i,
    /interest\s+income[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '6a', patterns: [
    /(?:6a|box\s*6a)[^a-z0-9]*ordinary\s+dividends[^$\d-]*([($\d,.\-)]+)/i,
    /ordinary\s+dividends[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '6b', patterns: [
    /(?:6b|box\s*6b)[^a-z0-9]*qualified\s+dividends[^$\d-]*([($\d,.\-)]+)/i,
    /qualified\s+dividends[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '6c', patterns: [
    /(?:6c|box\s*6c)[^a-z0-9]*dividend\s+equivalents[^$\d-]*([($\d,.\-)]+)/i,
    /dividend\s+equivalents[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '7', patterns: [
    /(?:box\s*7|line\s*7)[^a-z0-9]*royalties[^$\d-]*([($\d,.\-)]+)/i,
    /royalties[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '8', patterns: [
    /(?:box\s*8|line\s*8)[^a-z0-9]*net\s+short[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i,
    /net\s+short[- ]term\s+capital\s+gain[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '9a', patterns: [
    /(?:9a|box\s*9a)[^a-z0-9]*net\s+long[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i,
    /net\s+long[- ]term\s+capital\s+gain[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '9b', patterns: [
    /(?:9b|box\s*9b)[^a-z0-9]*collectibles[^$\d-]*([($\d,.\-)]+)/i,
    /collectibles\s*\(28%\)\s*gain[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '9c', patterns: [
    /(?:9c|box\s*9c)[^a-z0-9]*unrecaptured\s+section\s*1250[^$\d-]*([($\d,.\-)]+)/i,
    /unrecaptured\s+section\s*1250\s+gain[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '10', patterns: [
    /(?:box\s*10|line\s*10)[^a-z0-9]*net\s+section\s*1231[^$\d-]*([($\d,.\-)]+)/i,
    /net\s+section\s*1231\s+gain[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '11', patterns: [
    /(?:box\s*11|line\s*11)[^a-z0-9]*other\s+income[^$\d-]*([($\d,.\-)]+)/i,
    /other\s+income\s*\(loss\)[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '12', patterns: [
    /(?:box\s*12|line\s*12)[^a-z0-9]*section\s*179[^$\d-]*([($\d,.\-)]+)/i,
    /section\s*179\s+deduction[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '13', patterns: [
    /(?:box\s*13|line\s*13)[^a-z0-9]*other\s+deductions[^$\d-]*([($\d,.\-)]+)/i,
    /other\s+deductions[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '14', patterns: [
    /(?:box\s*14|line\s*14)[^a-z0-9]*self[- ]employment[^$\d-]*([($\d,.\-)]+)/i,
    /self[- ]employment\s+earnings[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '15', patterns: [
    /(?:box\s*15|line\s*15)[^a-z0-9]*credits[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '16', patterns: [
    /(?:box\s*16|line\s*16)[^a-z0-9]*foreign\s+transactions[^$\d-]*([($\d,.\-)]+)/i,
    /foreign\s+transactions[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '17', patterns: [
    /(?:box\s*17|line\s*17)[^a-z0-9]*alternative\s+minimum\s+tax[^$\d-]*([($\d,.\-)]+)/i,
    /alternative\s+minimum\s+tax[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '18', patterns: [
    /(?:box\s*18|line\s*18)[^a-z0-9]*tax[- ]exempt[^$\d-]*([($\d,.\-)]+)/i,
    /tax[- ]exempt\s+income[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '19a', patterns: [
    /(?:19a|box\s*19a)[^a-z0-9]*distributions[^$\d-]*cash[^$\d-]*([($\d,.\-)]+)/i,
    /distributions.*cash\s+and\s+marketable[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '19b', patterns: [
    /(?:19b|box\s*19b)[^a-z0-9]*distributions[^$\d-]*other\s+property[^$\d-]*([($\d,.\-)]+)/i,
    /distributions.*other\s+property[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '20', patterns: [
    /(?:box\s*20|line\s*20)[^a-z0-9]*other\s+information[^$\d-]*([($\d,.\-)]+)/i,
    /other\s+information[^$\d-]*([($\d,.\-)]+)/i
  ] },
  { boxNumber: '21', patterns: [
    /(?:box\s*21|line\s*21)[^a-z0-9]*foreign\s+taxes[^$\d-]*([($\d,.\-)]+)/i,
    /foreign\s+taxes\s+paid[^$\d-]*([($\d,.\-)]+)/i
  ] }
];
|||
|
|||
/**
 * Metadata extraction patterns, tried in the listed order per field.
 *
 * Capture group 1 holds the value. The possessive in "partnership's" /
 * "partner's" accepts both the straight (') and the typographic (\u2019)
 * apostrophe.
 */
private readonly METADATA_PATTERNS = {
  partnershipName: [
    /partnership['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i,
    /name\s+of\s+partnership[^:\n]*[:\s]+([^\n]{3,80})/i
  ],
  partnershipEin: [
    /partnership['\u2019]s?\s+(?:employer\s+identification\s+number|EIN)[^:\n]*[:\s]+(\d{2}[- ]\d{7})/i
  ],
  partnerName: [
    /partner['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i,
    /name\s+of\s+partner[^:\n]*[:\s]+([^\n]{3,80})/i
  ],
  partnerEin: [
    /partner['\u2019]s?\s+(?:identifying|social\s+security)\s+number[^:\n]*[:\s]+(\d{2}[- ]\d{7}|\d{3}[- ]\d{2}[- ]\d{4})/i
  ],
  taxYear: [
    /(?:calendar\s+year|tax\s+year)\s*(\d{4})/i,
    /for\s+(?:calendar\s+year|tax\s+year)\s*(\d{4})/i,
    /(?:beginning|ending)\s+.*?(\d{4})/i
  ]
};
|||
|
|||
/**
 * Reports whether this extractor can be used.
 *
 * @returns Always `true`; no configuration is consulted.
 */
public isAvailable(): boolean {
  // Unconditionally available — nothing external has to be configured
  return true;
}
|||
|
|||
/**
 * Extracts K-1 metadata and box values from the embedded text of a PDF.
 *
 * @param buffer Raw PDF bytes handed to pdf-parse.
 * @param fileName File name, used only in the log message.
 * @returns Metadata, extracted fields, the mean field confidence rounded to
 * two decimals, and the page count reported by pdf-parse.
 */
public async extract(
  buffer: Buffer,
  fileName: string
): Promise<K1ExtractionResult> {
  this.logger.log(`Extracting from digital PDF: ${fileName}`);

  const { text, numpages } = await pdfParse(buffer);

  const metadata = this.extractMetadata(text);
  const fields = this.extractBoxValues(text);

  // Mean of the per-field confidences; 0 when nothing was extracted
  const confidenceSum = fields.reduce(
    (running, field) => running + field.confidence,
    0
  );
  const meanConfidence = fields.length > 0 ? confidenceSum / fields.length : 0;

  return {
    metadata,
    fields,
    unmappedItems: [],
    overallConfidence: Math.round(meanConfidence * 100) / 100,
    method: 'pdf-parse',
    pagesProcessed: numpages
  };
}
|||
|
|||
/**
 * Reads partnership / partner metadata from the extracted text using
 * `METADATA_PATTERNS`.
 *
 * For text fields the first matching pattern supplies the trimmed value.
 * For the tax year, a pattern only counts when its captured year lies in
 * 1900–2100; otherwise the next pattern is tried.
 *
 * @param text Text extracted from the PDF.
 * @returns Metadata with `null` for every value that could not be found.
 */
private extractMetadata(text: string): K1ExtractionResult['metadata'] {
  // Trimmed first capture of the first pattern that matches, else null
  const firstCapture = (patterns: RegExp[]): string | null => {
    for (const pattern of patterns) {
      const hit = text.match(pattern);
      if (hit) {
        return hit[1].trim();
      }
    }
    return null;
  };

  let taxYear: number | null = null;
  for (const pattern of this.METADATA_PATTERNS.taxYear) {
    const hit = text.match(pattern);
    if (!hit) {
      continue;
    }

    const candidateYear = parseInt(hit[1], 10);
    if (candidateYear >= 1900 && candidateYear <= 2100) {
      taxYear = candidateYear;
      break;
    }
  }

  return {
    partnershipName: firstCapture(this.METADATA_PATTERNS.partnershipName),
    partnershipEin: firstCapture(this.METADATA_PATTERNS.partnershipEin),
    partnerName: firstCapture(this.METADATA_PATTERNS.partnerName),
    partnerEin: firstCapture(this.METADATA_PATTERNS.partnerEin),
    taxYear,
    isAmended: /amended/i.test(text),
    isFinal: /final\s+k-?1/i.test(text) || /final\s+return/i.test(text)
  };
}
|||
|
|||
/**
 * Applies `BOX_PATTERNS` to the text and builds one field per box that
 * matched.
 *
 * For each box only the first matching pattern is used. Confidence is an
 * exact two-decimal value: 0.95 for a regex hit (0.90 base + 0.05), raised
 * to 1.0 when the captured text also parses as a number. Both values map to
 * the 'HIGH' level.
 *
 * @param text Text extracted from the PDF.
 * @returns Fields in `BOX_PATTERNS` order; empty when nothing matched.
 */
private extractBoxValues(text: string): K1ExtractedField[] {
  const fields: K1ExtractedField[] = [];

  for (const { boxNumber, patterns } of this.BOX_PATTERNS) {
    for (const pattern of patterns) {
      const match = text.match(pattern);
      if (!match) {
        continue;
      }

      const rawValue = match[1].trim();
      const numericValue = this.parseNumericValue(rawValue);

      // Literal values instead of 0.9 + 0.05 (+ 0.05), which would store
      // floating-point noise such as 0.9500000000000001
      const confidence = numericValue !== null ? 1 : 0.95;

      fields.push({
        boxNumber,
        label: '', // Will be filled by field mapper
        customLabel: null,
        rawValue,
        numericValue,
        confidence,
        confidenceLevel: 'HIGH',
        isUserEdited: false,
        isReviewed: false
      });
      break; // Use first matching pattern
    }
  }

  return fields;
}
|||
|
|||
/**
 * Parse a K-1 dollar value string to a number.
 *
 * Handles: $52,340  (52340)  ($1,200)  $(1,200)  -$500  $-500  1200.50
 *
 * Whitespace, "$" and "," are removed before the sign is detected, so a
 * leading currency symbol no longer hides a "(" or "-" marker.
 *
 * @param raw Raw value text.
 * @returns The parsed number, or `null` for empty input or text that does not
 * start with a finite number after cleaning.
 */
public parseNumericValue(raw: string): number | null {
  if (!raw) return null;

  // Strip formatting first so a sign marker, if any, ends up at index 0
  const compact = raw.replace(/[\s$,]/g, '');

  // Negative when wrapped in parentheses or prefixed with a minus sign
  const isNegative = compact.startsWith('(') || compact.startsWith('-');

  const magnitude = parseFloat(compact.replace(/[()]/g, '').replace(/^-/, ''));
  if (!Number.isFinite(magnitude)) return null;

  return isNegative ? -magnitude : magnitude;
}
|||
|
|||
/**
 * Checks whether a PDF carries embedded text that looks like a K-1.
 *
 * @param buffer Raw PDF bytes handed to pdf-parse.
 * @returns `true` when the extracted text is at least 100 characters long
 * and contains one of the marker phrases (case-sensitive); `false`
 * otherwise, including when pdf-parse throws.
 */
public async isDigitalK1(buffer: Buffer): Promise<boolean> {
  const markers = ['Schedule K-1', 'Form 1065', "Partner's Share"];

  try {
    const { text } = await pdfParse(buffer);
    const content = text || '';

    return (
      content.length >= 100 &&
      markers.some((marker) => content.includes(marker))
    );
  } catch {
    // A parse failure is treated as "not a digital K-1"
    return false;
  }
}
|||
} |
|||
@ -0,0 +1,204 @@ |
|||
import type { K1ExtractionResult, K1ExtractedField } from '@ghostfolio/common/interfaces'; |
|||
|
|||
import { Injectable, Logger } from '@nestjs/common'; |
|||
|
|||
import { PdfParseExtractor } from './pdf-parse-extractor'; |
|||
import type { K1Extractor } from './k1-extractor.interface'; |
|||
|
|||
/** |
|||
* Tier 2 fallback extractor using tesseract.js (WASM-based OCR). |
|||
* Self-hosted, zero-config — no external API keys required. |
|||
* Lower accuracy (~75%) compared to Azure DI (~95%). |
|||
*/ |
|||
@Injectable() |
|||
export class TesseractExtractor implements K1Extractor { |
|||
private readonly logger = new Logger(TesseractExtractor.name); |
|||
private worker: any = null; |
|||
|
|||
/**
 * @param pdfParseExtractor Used by this class for its `parseNumericValue` helper.
 */
public constructor(private readonly pdfParseExtractor: PdfParseExtractor) {}
|||
|
|||
/**
 * Reports whether this extractor can be used.
 *
 * @returns Always `true`; no configuration is consulted.
 */
public isAvailable(): boolean {
  // Unconditionally available — nothing external has to be configured
  return true;
}
|||
|
|||
/**
 * Runs OCR on the document with tesseract.js and extracts K-1 fields and
 * metadata from the recognized text.
 *
 * If recognition throws, the embedded text from pdf-parse is used instead;
 * if that fails too, extraction continues with empty text and yields no
 * fields.
 *
 * NOTE(review): the raw buffer is passed straight to `worker.recognize`.
 * tesseract.js is an image OCR library, so PDF input presumably always takes
 * the fallback path — confirm and consider rasterizing pages first.
 *
 * @param buffer Raw bytes of the uploaded document.
 * @param fileName File name, used only in the log message.
 * @returns Metadata, extracted fields, the mean field confidence rounded to
 * two decimals, and the page count (1 unless the pdf-parse fallback ran).
 * @throws Whatever loading tesseract.js or creating the worker throws.
 */
public async extract(
  buffer: Buffer,
  fileName: string
): Promise<K1ExtractionResult> {
  this.logger.log(`Extracting from scanned PDF via Tesseract.js: ${fileName}`);

  const worker = await this.getWorker();

  let text = '';
  let pageCount = 1;

  try {
    const result = await worker.recognize(buffer);
    text = result.data.text;
  } catch (error) {
    this.logger.warn(
      `Tesseract direct PDF recognition failed, trying alternative approach: ${error}`
    );

    // Fallback: try pdf-parse to at least get any embedded text
    try {
      const pdfParseModule: any = await import('pdf-parse');
      // Depending on module interop the parser is either the default export
      // or the module itself
      const parsePdf = pdfParseModule.default ?? pdfParseModule;
      const parsed = await parsePdf(buffer);
      text = parsed.text;
      pageCount = parsed.numpages;
    } catch (parseError) {
      this.logger.error(`Both Tesseract and pdf-parse failed: ${parseError}`);
      text = '';
    }
  }

  // Same regex-based extraction as for digital PDFs, scored with a lower
  // base confidence because OCR text is less reliable
  const fields = this.extractBoxValues(text);
  const metadata = this.extractMetadata(text);

  const totalConfidence = fields.reduce((sum, f) => sum + f.confidence, 0);
  const overallConfidence =
    fields.length > 0 ? totalConfidence / fields.length : 0;

  return {
    metadata,
    fields,
    unmappedItems: [],
    overallConfidence: Math.round(overallConfidence * 100) / 100,
    method: 'tesseract',
    pagesProcessed: pageCount
  };
}

/**
 * Returns the shared worker, creating it on first use.
 *
 * The pending creation promise (not the resolved worker) is cached in
 * `this.worker`, so concurrent callers share one worker instead of each
 * creating their own. A failed creation clears the cache so a later call
 * can retry.
 */
private getWorker(): Promise<any> {
  if (!this.worker) {
    this.worker = import('tesseract.js')
      .then((Tesseract) => Tesseract.createWorker('eng'))
      .catch((error: unknown) => {
        this.worker = null;
        throw error;
      });
  }

  return this.worker;
}

/**
 * NestJS shutdown hook: terminates the OCR worker, if one was created, so
 * it does not outlive the module. Termination errors are logged, not thrown.
 */
public async onModuleDestroy(): Promise<void> {
  const pending = this.worker;
  this.worker = null;

  if (!pending) {
    return;
  }

  try {
    const worker = await pending;
    await worker.terminate();
  } catch (error) {
    this.logger.warn(`Failed to terminate Tesseract worker: ${error}`);
  }
}
|||
|
|||
/**
 * Extracts box values from OCR text using one label-based pattern per box.
 *
 * Capture group 1 of each pattern is the raw amount text. Confidence starts
 * at 0.65 and gains 0.1 when the captured text parses as a number; the level
 * is derived from the usual 0.85 / 0.6 thresholds.
 *
 * @param text OCR (or fallback) text; empty text yields no fields.
 * @returns Fields in table order, one per box whose pattern matched.
 */
private extractBoxValues(text: string): K1ExtractedField[] {
  if (!text) {
    return [];
  }

  // [box number, label pattern] — label-only variants, no box/line prefix required
  const boxTable: Array<[string, RegExp]> = [
    ['1', /ordinary\s+business\s+income[^$\d-]*([($\d,.\-)]+)/i],
    ['2', /net\s+rental\s+real\s+estate[^$\d-]*([($\d,.\-)]+)/i],
    ['3', /other\s+net\s+rental[^$\d-]*([($\d,.\-)]+)/i],
    ['4', /guaranteed\s+payments?\s+for\s+services[^$\d-]*([($\d,.\-)]+)/i],
    ['4a', /guaranteed\s+payments?\s+for\s+capital[^$\d-]*([($\d,.\-)]+)/i],
    ['4b', /total\s+guaranteed\s+payments?[^$\d-]*([($\d,.\-)]+)/i],
    ['5', /interest\s+income[^$\d-]*([($\d,.\-)]+)/i],
    ['6a', /ordinary\s+dividends[^$\d-]*([($\d,.\-)]+)/i],
    ['6b', /qualified\s+dividends[^$\d-]*([($\d,.\-)]+)/i],
    ['6c', /dividend\s+equivalents[^$\d-]*([($\d,.\-)]+)/i],
    ['7', /royalties[^$\d-]*([($\d,.\-)]+)/i],
    ['8', /net\s+short[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i],
    ['9a', /net\s+long[- ]term\s+capital[^$\d-]*([($\d,.\-)]+)/i],
    ['9b', /collectibles.*28%[^$\d-]*([($\d,.\-)]+)/i],
    ['9c', /unrecaptured\s+section\s*1250[^$\d-]*([($\d,.\-)]+)/i],
    ['10', /net\s+section\s*1231[^$\d-]*([($\d,.\-)]+)/i],
    ['11', /other\s+income[^$\d-]*([($\d,.\-)]+)/i],
    ['12', /section\s*179\s+deduction[^$\d-]*([($\d,.\-)]+)/i],
    ['13', /other\s+deductions[^$\d-]*([($\d,.\-)]+)/i],
    ['14', /self[- ]employment\s+earnings[^$\d-]*([($\d,.\-)]+)/i],
    ['15', /credits[^$\d-]*([($\d,.\-)]+)/i],
    ['16', /foreign\s+transactions[^$\d-]*([($\d,.\-)]+)/i],
    ['17', /alternative\s+minimum\s+tax[^$\d-]*([($\d,.\-)]+)/i],
    ['18', /tax[- ]exempt\s+income[^$\d-]*([($\d,.\-)]+)/i],
    ['19a', /distributions.*cash\s+and\s+marketable[^$\d-]*([($\d,.\-)]+)/i],
    ['19b', /distributions.*other\s+property[^$\d-]*([($\d,.\-)]+)/i],
    ['20', /other\s+information[^$\d-]*([($\d,.\-)]+)/i],
    ['21', /foreign\s+taxes\s+paid[^$\d-]*([($\d,.\-)]+)/i]
  ];

  const extracted: K1ExtractedField[] = [];

  for (const [boxNumber, labelPattern] of boxTable) {
    const hit = text.match(labelPattern);
    if (!hit) {
      continue;
    }

    const rawValue = hit[1].trim();
    const numericValue = this.pdfParseExtractor.parseNumericValue(rawValue);

    // Tesseract: lower base confidence of 0.65, +0.1 for a parsable value
    const score = numericValue !== null ? 0.65 + 0.1 : 0.65;
    const confidenceLevel: 'HIGH' | 'MEDIUM' | 'LOW' =
      score >= 0.85 ? 'HIGH' : score >= 0.6 ? 'MEDIUM' : 'LOW';

    extracted.push({
      boxNumber,
      label: '',
      customLabel: null,
      rawValue,
      numericValue,
      confidence: Math.round(score * 100) / 100,
      confidenceLevel,
      isUserEdited: false,
      isReviewed: false
    });
  }

  return extracted;
}
|||
|
|||
/**
 * Reads partnership / partner metadata from the recognized text.
 *
 * The possessive in "partnership's" / "partner's" may be written with either
 * a straight (') or a typographic (\u2019) apostrophe.
 *
 * NOTE(review): `isAmended` and `isFinal` are plain text searches. If the
 * text includes the printed form captions (e.g. an "Amended K-1" checkbox
 * label), they will be true regardless of the checkbox state — confirm.
 *
 * @param text OCR (or fallback) text.
 * @returns Metadata with `null` for every value that could not be found.
 */
private extractMetadata(text: string): K1ExtractionResult['metadata'] {
  return {
    partnershipName: this.extractPattern(
      text,
      /partnership['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
    ),
    partnershipEin: this.extractPattern(
      text,
      /partnership['\u2019]s?\s+(?:employer\s+identification\s+number|EIN)[^:\n]*[:\s]+(\d{2}[- ]\d{7})/i
    ),
    partnerName: this.extractPattern(
      text,
      /partner['\u2019]s?\s+name[^:\n]*[:\s]+([^\n]{3,80})/i
    ),
    partnerEin: this.extractPattern(
      text,
      /partner['\u2019]s?\s+(?:identifying|social\s+security)\s+number[^:\n]*[:\s]+(\d{2}[- ]\d{7}|\d{3}[- ]\d{2}[- ]\d{4})/i
    ),
    taxYear: this.extractTaxYear(text),
    isAmended: /amended/i.test(text),
    isFinal: /final\s+k-?1/i.test(text) || /final\s+return/i.test(text)
  };
}
|||
|
|||
/**
 * Returns the trimmed first capture group of `pattern` in `text`.
 *
 * @param text Text to search.
 * @param pattern Expression whose first capture group holds the wanted value.
 * @returns The trimmed capture, or `null` when the pattern does not match.
 */
private extractPattern(text: string, pattern: RegExp): string | null {
  const found = text.match(pattern);
  if (!found) {
    return null;
  }

  return found[1].trim();
}
|||
|
|||
/**
 * Finds the first four-digit year following "calendar year" or "tax year".
 *
 * @param text Text to search.
 * @returns The year when it lies in 1900–2100, otherwise `null`.
 */
private extractTaxYear(text: string): number | null {
  const hit = text.match(/(?:calendar\s+year|tax\s+year)\s*(\d{4})/i);
  if (!hit) {
    return null;
  }

  const parsedYear = parseInt(hit[1], 10);
  const inRange = parsedYear >= 1900 && parsedYear <= 2100;

  return inRange ? parsedYear : null;
}
|||
} |
|||
@ -0,0 +1,8 @@ |
|||
import { Injectable } from '@nestjs/common'; |
|||
|
|||
/**
 * Service intended to compute dynamic aggregation totals from
 * CellAggregationRule records.
 *
 * Currently an empty injectable placeholder; the original note ties the
 * implementation to Phase 4 (US2).
 */
@Injectable()
export class K1AggregationService {}
|||
@ -0,0 +1,8 @@ |
|||
import { Injectable } from '@nestjs/common'; |
|||
|
|||
/**
 * Placeholder for the service that is meant to allocate K-1 line items to
 * partnership members by ownership percentage.
 *
 * The class body is currently empty; per the original note the
 * implementation belongs to Phase 5 (US3).
 */ |
|||
@Injectable() |
|||
export class K1AllocationService {} |
|||
@ -0,0 +1,111 @@ |
|||
import type { K1ExtractedField } from '@ghostfolio/common/interfaces'; |
|||
|
|||
import { Injectable } from '@nestjs/common'; |
|||
|
|||
/** |
|||
* K-1 confidence scoring service. |
|||
* Assigns three-level confidence (HIGH/MEDIUM/LOW) based on extraction method |
|||
* and validation heuristics per research.md Decision 5. |
|||
*/ |
|||
@Injectable() |
|||
export class K1ConfidenceService {
  /**
   * Score a list of extracted fields.
   *
   * Every field is shallow-copied, its `confidenceLevel` is recomputed from
   * its numeric `confidence`, and the cross-field validation penalties are
   * then applied to the copies. The objects passed in are not modified.
   *
   * @param fields Extracted fields carrying a numeric confidence.
   * @param method Extraction method that produced the fields. Accepted for
   *   API compatibility; the current implementation does not consult it.
   * @returns New field objects with updated confidence information.
   */
  public scoreFields(
    fields: K1ExtractedField[],
    method: 'pdf-parse' | 'azure' | 'tesseract'
  ): K1ExtractedField[] {
    const scored = fields.map((field) => ({
      ...field,
      confidenceLevel: this.getConfidenceLevel(field.confidence)
    }));

    return this.applyCrossFieldValidation(scored);
  }

  /**
   * Map a numeric confidence to the three-level display value.
   *
   * - HIGH:   confidence >= 0.85
   * - MEDIUM: 0.60 <= confidence < 0.85
   * - LOW:    everything else (including NaN)
   *
   * @param confidence Numeric confidence score.
   * @returns The matching confidence level.
   */
  public getConfidenceLevel(confidence: number): 'HIGH' | 'MEDIUM' | 'LOW' {
    if (confidence >= 0.85) return 'HIGH';
    if (confidence >= 0.6) return 'MEDIUM';
    return 'LOW';
  }

  /**
   * Average the confidence of all fields, rounded to two decimals.
   *
   * @param fields Fields to average.
   * @returns The rounded mean confidence, or 0 for an empty list.
   */
  public calculateOverallConfidence(fields: K1ExtractedField[]): number {
    if (fields.length === 0) return 0;

    const sum = fields.reduce((acc, f) => acc + f.confidence, 0);
    return this.roundToTwoDecimals(sum / fields.length);
  }

  /**
   * Mark fields as reviewed according to their confidence level.
   *
   * HIGH fields are returned with `isReviewed: true`; all other fields are
   * returned with `isReviewed: false`, whatever the flag was before.
   *
   * @param fields Fields whose `confidenceLevel` is already set.
   * @returns Shallow copies of the fields with `isReviewed` recomputed.
   */
  public applyAutoReview(fields: K1ExtractedField[]): K1ExtractedField[] {
    return fields.map((field) => ({
      ...field,
      isReviewed: field.confidenceLevel === 'HIGH'
    }));
  }

  /**
   * Apply cross-field plausibility rules and penalise fields that fail.
   *
   * - Box 6b must not exceed box 6a; otherwise 6b loses 0.20 confidence.
   * - When boxes 4, 4a and 4b all carry a value, 4b must equal 4 + 4a
   *   within a tolerance of 1% of that sum plus 1; otherwise 4b loses 0.15.
   *
   * The given field objects are updated in place.
   *
   * @param fields Fields to validate (mutated).
   * @returns The same array that was passed in.
   */
  private applyCrossFieldValidation(
    fields: K1ExtractedField[]
  ): K1ExtractedField[] {
    const fieldMap = new Map(fields.map((f) => [f.boxNumber, f]));

    // Rule: Box 6b <= Box 6a
    const box6a = fieldMap.get('6a');
    const box6b = fieldMap.get('6b');
    if (
      box6a?.numericValue != null &&
      box6b?.numericValue != null &&
      box6b.numericValue > box6a.numericValue
    ) {
      this.applyPenalty(box6b, 0.2);
    }

    // Rule: Box 4b should approximately equal Box 4 + Box 4a
    const box4 = fieldMap.get('4');
    const box4a = fieldMap.get('4a');
    const box4b = fieldMap.get('4b');
    if (
      box4?.numericValue != null &&
      box4a?.numericValue != null &&
      box4b?.numericValue != null
    ) {
      const expectedTotal = box4.numericValue + box4a.numericValue;
      const diff = Math.abs(box4b.numericValue - expectedTotal);

      // Allow 1% of the expected total plus 1 for rounding differences.
      if (diff > Math.abs(expectedTotal * 0.01) + 1) {
        this.applyPenalty(box4b, 0.15);
      }
    }

    return fields;
  }

  /**
   * Lower a field's confidence by `penalty` (never below 0) and refresh its
   * confidence level. The result is rounded to two decimals so that binary
   * floating-point drift (e.g. 0.8 - 0.2 = 0.6000000000000001) neither
   * leaks into stored values nor skews the level thresholds.
   */
  private applyPenalty(field: K1ExtractedField, penalty: number): void {
    const reduced = Math.max(field.confidence - penalty, 0);

    field.confidence = this.roundToTwoDecimals(reduced);
    field.confidenceLevel = this.getConfidenceLevel(field.confidence);
  }

  /** Round a number to two decimal places. */
  private roundToTwoDecimals(value: number): number {
    return Math.round(value * 100) / 100;
  }
}
|||
@ -0,0 +1,146 @@ |
|||
import type { K1ExtractedField, K1ExtractionResult, K1UnmappedItem } from '@ghostfolio/common/interfaces'; |
|||
|
|||
import { Injectable, Logger } from '@nestjs/common'; |
|||
|
|||
import { CellMappingService } from '../cell-mapping/cell-mapping.service'; |
|||
import { K1ConfidenceService } from './k1-confidence.service'; |
|||
|
|||
/** |
|||
* Maps raw extraction results to K-1 box fields using cell mapping configuration. |
|||
* Applies labels from cell mappings, scoring, and auto-review logic. |
|||
*/ |
|||
@Injectable() |
|||
export class K1FieldMapperService {
  private readonly logger = new Logger(K1FieldMapperService.name);

  public constructor(
    private readonly cellMappingService: CellMappingService,
    private readonly confidenceService: K1ConfidenceService
  ) {}

  /**
   * Turn a raw extraction result into labelled, scored and pre-reviewed
   * K-1 fields.
   *
   * Fields whose box number has a cell mapping receive that mapping's label
   * (and custom label when the mapping is custom) and are ordered by the
   * mapping's sort order. Fields without a mapping are appended to the
   * unmapped items instead. The mapped fields are then scored, auto-reviewed
   * and used to recompute the overall confidence.
   *
   * @param extractionResult Result produced by an extractor.
   * @param partnershipId Partnership whose cell mappings are loaded.
   * @returns A copy of the result with `fields`, `unmappedItems` and
   *   `overallConfidence` replaced.
   */
  public async mapFields(
    extractionResult: K1ExtractionResult,
    partnershipId: string
  ): Promise<K1ExtractionResult> {
    const mappings = await this.cellMappingService.getMappings(partnershipId);
    const mappingByBox = new Map(mappings.map((m) => [m.boxNumber, m]));

    const labelled: K1ExtractedField[] = [];
    const unmappedItems: K1UnmappedItem[] = Array.from(
      extractionResult.unmappedItems
    );

    extractionResult.fields.forEach((field) => {
      const mapping = mappingByBox.get(field.boxNumber);

      if (!mapping) {
        // A box number was extracted, but no cell mapping describes it.
        this.logger.debug(
          `No cell mapping for box ${field.boxNumber}, adding to unmapped items`
        );
        unmappedItems.push({
          rawLabel: field.label || `Box ${field.boxNumber}`,
          rawValue: field.rawValue,
          numericValue: field.numericValue,
          confidence: field.confidence,
          pageNumber: 1, // page is unknown at this point, default to 1
          resolution: null,
          assignedBoxNumber: null
        });
        return;
      }

      const customLabel = mapping.isCustom ? mapping.label : field.customLabel;
      labelled.push({ ...field, label: mapping.label, customLabel });
    });

    // Order by the sort order configured on the cell mapping.
    const rankOf = (boxNumber: string) =>
      mappingByBox.get(boxNumber)?.sortOrder ?? 999;
    labelled.sort(
      (left, right) => rankOf(left.boxNumber) - rankOf(right.boxNumber)
    );

    const scored = this.confidenceService.scoreFields(
      labelled,
      extractionResult.method
    );
    // High-confidence fields are accepted automatically.
    const reviewed = this.confidenceService.applyAutoReview(scored);
    const overallConfidence =
      this.confidenceService.calculateOverallConfidence(reviewed);

    return {
      ...extractionResult,
      fields: reviewed,
      unmappedItems,
      overallConfidence
    };
  }

  /**
   * Add an empty field for every mapped box that is absent from `result`,
   * then order all fields by natural box number.
   *
   * The added fields have an empty raw value, no numeric value, full
   * confidence and are already marked as reviewed.
   *
   * @param result Extraction result to complete.
   * @param partnershipId Partnership whose cell mappings are loaded.
   * @returns A copy of the result with the completed, sorted field list.
   */
  public async fillMissingBoxes(
    result: K1ExtractionResult,
    partnershipId: string
  ): Promise<K1ExtractionResult> {
    const mappings = await this.cellMappingService.getMappings(partnershipId);
    const presentBoxes = new Set(result.fields.map((f) => f.boxNumber));

    const placeholders: K1ExtractedField[] = mappings
      .filter((mapping) => !presentBoxes.has(mapping.boxNumber))
      .map((mapping) => ({
        boxNumber: mapping.boxNumber,
        label: mapping.label,
        customLabel: mapping.isCustom ? mapping.label : null,
        rawValue: '',
        numericValue: null,
        confidence: 1.0, // nothing was extracted, so nothing is uncertain
        confidenceLevel: 'HIGH',
        isUserEdited: false,
        isReviewed: true // empty fields need no review
      }));

    const allFields = [...result.fields, ...placeholders];
    allFields.sort((left, right) =>
      this.compareBoxNumbers(left.boxNumber, right.boxNumber)
    );

    return { ...result, fields: allFields };
  }

  /**
   * Comparator for natural box ordering (1, 2, 3, 4, 4a, 4b, 5, 6a, ...).
   *
   * A box number made of digits with an optional single lowercase letter is
   * compared by number first, then by letter. Anything else is treated as
   * number 999 and compared by its full text.
   */
  private compareBoxNumbers(a: string, b: string): number {
    const split = (box: string): [number, string] => {
      const parts = /^(\d+)([a-z]?)$/.exec(box);

      return parts === null
        ? [999, box]
        : [parseInt(parts[1], 10), parts[2] || ''];
    };

    const [leftNumber, leftSuffix] = split(a);
    const [rightNumber, rightSuffix] = split(b);

    return leftNumber === rightNumber
      ? leftSuffix.localeCompare(rightSuffix)
      : leftNumber - rightNumber;
  }
}
|||
@ -0,0 +1,202 @@ |
|||
import { K1ImportDataService } from '@ghostfolio/client/services/k1-import-data.service';
import { FamilyOfficeDataService } from '@ghostfolio/client/services/family-office-data.service';

import { CommonModule } from '@angular/common';
import {
  ChangeDetectionStrategy,
  ChangeDetectorRef,
  Component,
  CUSTOM_ELEMENTS_SCHEMA,
  DestroyRef,
  OnInit
} from '@angular/core';
import { takeUntilDestroyed } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { MatButtonModule } from '@angular/material/button';
import { MatFormFieldModule } from '@angular/material/form-field';
import { MatIconModule } from '@angular/material/icon';
import { MatProgressBarModule } from '@angular/material/progress-bar';
import { MatSelectModule } from '@angular/material/select';
import { Router } from '@angular/router';
import { addIcons } from 'ionicons';
import {
  cloudUploadOutline,
  documentTextOutline
} from 'ionicons/icons';
|||
|
|||
@Component({
  changeDetection: ChangeDetectionStrategy.OnPush,
  host: { class: 'page' },
  imports: [
    CommonModule,
    FormsModule,
    MatButtonModule,
    MatFormFieldModule,
    MatIconModule,
    MatProgressBarModule,
    MatSelectModule
  ],
  // The template renders <ion-icon>, a custom element that is not part of
  // the Angular imports above; the schema lets the template compile.
  schemas: [CUSTOM_ELEMENTS_SCHEMA],
  selector: 'gf-k1-import-page',
  styleUrls: ['./k1-import-page.scss'],
  templateUrl: './k1-import-page.html'
})
/**
 * Page that uploads a K-1 PDF for a partnership / tax year and then polls
 * the created import session until extraction has finished or failed.
 */
export class K1ImportPageComponent implements OnInit {
  public error: string | null = null;
  public extractionStatus: string | null = null;
  public isUploading = false;
  public partnerships: Array<{ id: string; name: string }> = [];
  public selectedFile: File | null = null;
  public selectedPartnershipId = '';
  public sessionId: string | null = null;
  public taxYear: number;
  public taxYearOptions: number[] = [];
  public uploadProgress = 0;

  /** Largest accepted upload, in bytes (25 MB). */
  private static readonly MAX_FILE_SIZE_BYTES = 25 * 1024 * 1024;
  /** Delay between two session status requests, in milliseconds. */
  private static readonly POLL_INTERVAL_MS = 2000;

  private pollingInterval: ReturnType<typeof setInterval> | null = null;

  public constructor(
    private readonly changeDetectorRef: ChangeDetectorRef,
    private readonly destroyRef: DestroyRef,
    private readonly familyOfficeDataService: FamilyOfficeDataService,
    private readonly k1ImportDataService: K1ImportDataService,
    private readonly router: Router
  ) {
    addIcons({ cloudUploadOutline, documentTextOutline });

    // Default to the previous year and offer the current year plus the ten
    // years before it.
    const currentYear = new Date().getFullYear();
    this.taxYear = currentYear - 1;
    for (let y = currentYear; y >= currentYear - 10; y--) {
      this.taxYearOptions.push(y);
    }

    // Without this the interval keeps firing after the page is left.
    this.destroyRef.onDestroy(() => this.stopPolling());
  }

  public ngOnInit(): void {
    this.fetchPartnerships();
  }

  /**
   * Validate and remember the file chosen through the file input or dropped
   * onto the dropzone.
   *
   * Only `application/pdf` files of at most 25 MB are accepted; any other
   * file clears the selection and sets `error`. Events that carry no file
   * are ignored.
   *
   * @param event `change` event of the file input or `drop` event.
   */
  public onFileSelected(event: Event): void {
    const file = this.getFileFromEvent(event);

    if (!file) {
      return;
    }

    if (file.type !== 'application/pdf') {
      this.error = 'Please select a valid PDF file.';
      this.selectedFile = null;
    } else if (file.size > K1ImportPageComponent.MAX_FILE_SIZE_BYTES) {
      this.error = 'File exceeds 25 MB size limit.';
      this.selectedFile = null;
    } else {
      this.error = null;
      this.selectedFile = file;
    }

    this.changeDetectorRef.markForCheck();
  }

  /**
   * Upload the selected PDF together with the partnership and tax year and,
   * on success, start polling the created import session.
   *
   * Sets `error` instead of uploading when a required input is missing.
   */
  public uploadK1(): void {
    if (!this.selectedFile || !this.selectedPartnershipId || !this.taxYear) {
      this.error = 'Please select a partnership, tax year, and PDF file.';
      this.changeDetectorRef.markForCheck();
      return;
    }

    this.isUploading = true;
    this.error = null;
    this.extractionStatus = 'Uploading...';
    this.changeDetectorRef.markForCheck();

    const formData = new FormData();
    formData.append('file', this.selectedFile);
    formData.append('partnershipId', this.selectedPartnershipId);
    formData.append('taxYear', this.taxYear.toString());

    this.k1ImportDataService
      .uploadK1(formData)
      .pipe(takeUntilDestroyed(this.destroyRef))
      .subscribe({
        next: (result) => {
          this.sessionId = result.id;
          this.extractionStatus = 'Processing...';
          this.isUploading = false;
          this.changeDetectorRef.markForCheck();

          // Start polling for extraction completion
          this.startPolling(result.id);
        },
        error: (err) => {
          this.isUploading = false;
          this.error =
            err?.error?.message || err?.message || 'Upload failed.';
          this.extractionStatus = null;
          this.changeDetectorRef.markForCheck();
        }
      });
  }

  /** Drop the current file / session, stop polling and show the form again. */
  public resetForm(): void {
    this.selectedFile = null;
    this.sessionId = null;
    this.extractionStatus = null;
    this.error = null;
    this.stopPolling();
    this.changeDetectorRef.markForCheck();
  }

  /**
   * Return the first file carried by a drop event (`dataTransfer.files`) or
   * by a file input's change event (`target.files`), or null if there is
   * none.
   */
  private getFileFromEvent(event: Event): File | null {
    const files =
      'dataTransfer' in event
        ? (event as DragEvent).dataTransfer?.files
        : (event.target as HTMLInputElement | null)?.files;

    return files && files.length > 0 ? files[0] : null;
  }

  /** Load the partnerships offered in the partnership select. */
  private fetchPartnerships(): void {
    this.familyOfficeDataService
      .fetchPartnerships()
      .pipe(takeUntilDestroyed(this.destroyRef))
      .subscribe({
        next: (partnerships) => {
          this.partnerships = partnerships.map((p) => ({
            id: p.id,
            name: p.name
          }));
          this.changeDetectorRef.markForCheck();
        }
      });
  }

  /**
   * Request the import session every two seconds until its status is
   * EXTRACTED (navigate to the verification page) or FAILED (show the
   * error). Any polling already running is stopped first.
   */
  private startPolling(sessionId: string): void {
    this.stopPolling();

    this.pollingInterval = setInterval(() => {
      this.k1ImportDataService
        .fetchImportSession(sessionId)
        .pipe(takeUntilDestroyed(this.destroyRef))
        .subscribe({
          next: (session) => {
            // A response that was still in flight when the user cancelled
            // (or started another upload) must not touch the UI.
            if (this.sessionId !== sessionId) {
              return;
            }

            this.extractionStatus = session.status;

            if (session.status === 'EXTRACTED') {
              this.stopPolling();
              // Navigate to verification page (to be created in Phase 4)
              this.router.navigate(['/k1-import', sessionId, 'verify']);
            } else if (session.status === 'FAILED') {
              this.stopPolling();
              this.error = session.errorMessage || 'Extraction failed.';
              this.extractionStatus = 'FAILED';
            }

            this.changeDetectorRef.markForCheck();
          },
          error: () => {
            // Continue polling on transient errors
          }
        });
    }, K1ImportPageComponent.POLL_INTERVAL_MS);
  }

  /** Clear the polling timer, if one is running. */
  private stopPolling(): void {
    if (this.pollingInterval !== null) {
      clearInterval(this.pollingInterval);
      this.pollingInterval = null;
    }
  }
}
|||
@ -0,0 +1,99 @@ |
|||
<div class="container">
  <div class="row">
    <div class="col">
      <h1 class="d-none d-sm-block h3 mb-4 text-center">K-1 PDF Import</h1>

      @if (error) {
        <div class="alert alert-danger mb-3" role="alert">
          {{ error }}
        </div>
      }

      @if (!sessionId) {
        <!-- Upload form: shown until an import session has been created -->
        <div class="upload-form mx-auto">
          <div class="mb-3">
            <mat-form-field class="w-100">
              <mat-label>Partnership</mat-label>
              <mat-select [(ngModel)]="selectedPartnershipId">
                @for (p of partnerships; track p.id) {
                  <mat-option [value]="p.id">{{ p.name }}</mat-option>
                }
              </mat-select>
            </mat-form-field>
          </div>

          <div class="mb-3">
            <mat-form-field class="w-100">
              <mat-label>Tax Year</mat-label>
              <mat-select [(ngModel)]="taxYear">
                @for (y of taxYearOptions; track y) {
                  <mat-option [value]="y">{{ y }}</mat-option>
                }
              </mat-select>
            </mat-form-field>
          </div>

          <div class="mb-3">
            <!--
              The real file input is hidden, so the dropzone itself has to be
              focusable and react to Enter / Space for keyboard users.
            -->
            <div
              class="upload-dropzone text-center p-4"
              role="button"
              tabindex="0"
              (click)="fileInput.click()"
              (dragover)="$event.preventDefault()"
              (drop)="$event.preventDefault(); onFileSelected($event)"
              (keydown.enter)="fileInput.click()"
              (keydown.space)="$event.preventDefault(); fileInput.click()"
            >
              <input
                #fileInput
                accept="application/pdf"
                hidden
                type="file"
                (change)="onFileSelected($event)"
              />

              @if (selectedFile) {
                <ion-icon name="document-text-outline" size="large"></ion-icon>
                <p class="mt-2 mb-0">{{ selectedFile.name }}</p>
                <small class="text-muted"
                  >{{ (selectedFile.size / 1024 / 1024).toFixed(2) }} MB</small
                >
              } @else {
                <ion-icon name="cloud-upload-outline" size="large"></ion-icon>
                <p class="mt-2 mb-0">Click or drag a K-1 PDF file here</p>
                <small class="text-muted">Maximum 25 MB</small>
              }
            </div>
          </div>

          <button
            class="w-100"
            color="primary"
            mat-flat-button
            [disabled]="
              !selectedFile || !selectedPartnershipId || !taxYear || isUploading
            "
            (click)="uploadK1()"
          >
            @if (isUploading) {
              Uploading...
            } @else {
              Upload &amp; Scan K-1
            }
          </button>
        </div>
      } @else {
        <!-- Extraction progress: shown once an import session exists -->
        <div class="processing-status text-center mx-auto">
          <h3>Processing K-1</h3>

          @if (
            extractionStatus === 'Processing...' ||
            extractionStatus === 'PROCESSING'
          ) {
            <mat-progress-bar mode="indeterminate"></mat-progress-bar>
            <p class="mt-3">Extracting data from your K-1 PDF...</p>
            <p class="text-muted">This usually takes less than 30 seconds.</p>
          } @else if (extractionStatus === 'EXTRACTED') {
            <p class="text-success">
              Extraction complete! Redirecting to verification...
            </p>
          } @else if (extractionStatus === 'FAILED') {
            <p class="text-danger">Extraction failed.</p>
          }

          <button
            class="mt-3"
            color="warn"
            mat-stroked-button
            (click)="resetForm()"
          >
            Cancel &amp; Start Over
          </button>
        </div>
      }
    </div>
  </div>
</div>
|||
@ -0,0 +1,47 @@ |
|||
// Styles for the K-1 import page (upload form and extraction progress).

:host {
  display: block;
}

// Upload form and progress panel share the same maximum width.
.upload-form {
  max-width: 480px;
}

// Clickable / droppable area for the PDF file.
.upload-dropzone {
  cursor: pointer;
  border-radius: 8px;
  border: 2px dashed var(--border-color, #ccc);
  transition: border-color 0.2s ease;
}

.upload-dropzone:hover {
  border-color: var(--primary-color, #1976d2);
}

.upload-dropzone ion-icon {
  color: var(--text-muted, #999);
  font-size: 48px;
}

.processing-status {
  max-width: 480px;
}

.processing-status mat-progress-bar {
  margin-top: 1rem;
}

// Status message colours.
.text-success {
  color: #4caf50;
}

.text-danger {
  color: #f44336;
}

// Error banner shown above the form.
.alert-danger {
  padding: 12px 16px;
  color: #f44336;
  background-color: rgba(244, 67, 54, 0.1);
  border: 1px solid rgba(244, 67, 54, 0.3);
  border-radius: 4px;
}
|||
Loading…
Reference in new issue