mirror of https://github.com/ghostfolio/ghostfolio
Browse Source
- Reduce left column xMax from 445 to 435 to prevent right-column subtype codes (at x~455) from being captured via 15pt tolerance - Process subtype regions before simple regions in extractPartIII to ensure right-column codes are consumed by their correct boxes first - Add assignItemsToRegions() closest-center matching algorithm to resolve contention when adjacent/overlapping regions compete for the same item - Refactor extractCheckboxes to use closest-center: fixes G_LIMITED being detected as G_GENERAL (x=180.3 at boundary x=178) and M_NO as M_YES - Refactor extractSections to use closest-center: fixes Section L rows (12pt spacing < 15pt tolerance) being assigned to wrong fields - Lower tax year y-threshold from 745 to 710 to capture year fragments at y=727.7, enabling proper 20+25 -> 2025 combination - Add tools/test-k1-parse.mjs for standalone PDF parsing validation Verified against actual K-1 PDF: all fields match expected values, unmapped items reduced from many to 3 (noise only).pull/6701/head
3 changed files with 574 additions and 71 deletions
@ -0,0 +1,411 @@ |
|||||
|
/** |
||||
|
* Test script: runs the PdfParseExtractor logic directly on a K-1 PDF |
||||
|
* and prints all extracted fields, metadata, and unmapped items. |
||||
|
* |
||||
|
* Usage: node tools/test-k1-parse.mjs <path-to-pdf> |
||||
|
*/ |
||||
|
import { readFileSync } from 'fs'; |
||||
|
import { resolve } from 'path'; |
||||
|
|
||||
|
// ── pdfjs-dist setup ──
|
||||
|
const { getDocument, GlobalWorkerOptions } = await import( |
||||
|
'pdfjs-dist/legacy/build/pdf.mjs' |
||||
|
); |
||||
|
|
||||
|
const workerPath = |
||||
|
'file:///' + |
||||
|
resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs').replace( |
||||
|
/\\/g, |
||||
|
'/' |
||||
|
); |
||||
|
GlobalWorkerOptions.workerSrc = workerPath; |
||||
|
|
||||
|
// ── Load k1-position-regions (need TS compilation) ──
|
||||
|
// For simplicity, inline the region definitions from the compiled output.
|
||||
|
// Instead, we'll replicate the core extraction logic here using the raw
|
||||
|
// coordinates from the TypeScript file.
|
||||
|
|
||||
|
// Actually, let's just load the TS file via tsx or esbuild-register...
|
||||
|
// Simplest approach: read the compiled JS from dist or use a bundler.
|
||||
|
// For now, let's inline the critical logic.
|
||||
|
|
||||
|
const POSITION_TOLERANCE = 15; |
||||
|
const SUBTYPE_Y_TOLERANCE = 8; |
||||
|
|
||||
|
// ── Import the regions by dynamically compiling the TS ──
|
||||
|
// We'll use a quick inline approach: load the raw TS and eval via esbuild
|
||||
|
|
||||
|
import { execSync } from 'child_process'; |
||||
|
import { writeFileSync, unlinkSync, existsSync } from 'fs'; |
||||
|
|
||||
|
// Build a temp bundle of just the regions file
|
||||
|
const regionsTsPath = resolve( |
||||
|
'apps/api/src/app/k1-import/extractors/k1-position-regions.ts' |
||||
|
); |
||||
|
const regionsTmpPath = resolve('tools/_tmp_regions.mjs'); |
||||
|
|
||||
|
try { |
||||
|
execSync( |
||||
|
`npx esbuild "${regionsTsPath}" --bundle --format=esm --outfile="${regionsTmpPath}" --platform=node`, |
||||
|
{ stdio: 'pipe' } |
||||
|
); |
||||
|
} catch (e) { |
||||
|
console.error('Failed to compile regions file:', e.stderr?.toString()); |
||||
|
process.exit(1); |
||||
|
} |
||||
|
|
||||
|
const regionsModule = await import('file:///' + regionsTmpPath.replace(/\\/g, '/')); |
||||
|
const K1_POSITION_REGIONS = regionsModule.K1_POSITION_REGIONS; |
||||
|
|
||||
|
// Clean up
|
||||
|
try { unlinkSync(regionsTmpPath); } catch {} |
||||
|
|
||||
|
// ── PDF parsing ──
|
||||
|
const pdfPath = process.argv[2]; |
||||
|
if (!pdfPath) { |
||||
|
console.error('Usage: node tools/test-k1-parse.mjs <path-to-pdf>'); |
||||
|
process.exit(1); |
||||
|
} |
||||
|
|
||||
|
const buffer = readFileSync(pdfPath); |
||||
|
const loadingTask = getDocument({ |
||||
|
data: new Uint8Array(buffer), |
||||
|
standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/', |
||||
|
cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/', |
||||
|
cMapPacked: true, |
||||
|
isEvalSupported: false, |
||||
|
disableFontFace: true |
||||
|
}); |
||||
|
|
||||
|
const pdfDoc = await loadingTask.promise; |
||||
|
const page = await pdfDoc.getPage(1); |
||||
|
const textContent = await page.getTextContent({ includeMarkedContent: false }); |
||||
|
const items = textContent.items; |
||||
|
const styles = textContent.styles; |
||||
|
|
||||
|
// Filter data items (non-serif)
|
||||
|
const dataItems = []; |
||||
|
for (const item of items) { |
||||
|
const text = item.str.trim(); |
||||
|
if (!text) continue; |
||||
|
const style = styles[item.fontName]; |
||||
|
if (!style) continue; |
||||
|
const fontFamily = style.fontFamily.toLowerCase(); |
||||
|
if (fontFamily === 'serif') continue; |
||||
|
dataItems.push({ |
||||
|
text, |
||||
|
x: item.transform[4], |
||||
|
y: item.transform[5], |
||||
|
fontName: item.fontName, |
||||
|
fontFamily, |
||||
|
matched: false |
||||
|
}); |
||||
|
} |
||||
|
|
||||
|
console.log(`Total data items: ${dataItems.length}\n`); |
||||
|
|
||||
|
// ── Parsing logic (mirrors PdfParseExtractor) ──
|
||||
|
function parseNumericValue(raw) { |
||||
|
if (!raw) return null; |
||||
|
const trimmed = raw.trim(); |
||||
|
if (!trimmed) return null; |
||||
|
const upper = trimmed.toUpperCase(); |
||||
|
if (['SEE STMT', 'STMT', 'SEE STATEMENT', 'X', 'E-FILE', 'YES', 'NO'].includes(upper)) |
||||
|
return null; |
||||
|
let cleaned = trimmed; |
||||
|
const isParenNeg = /^\(.*\)$/.test(cleaned); |
||||
|
cleaned = cleaned.replace(/[$,()]/g, ''); |
||||
|
const isMinusNeg = cleaned.startsWith('-'); |
||||
|
if (isMinusNeg) cleaned = cleaned.substring(1); |
||||
|
const num = parseFloat(cleaned); |
||||
|
if (isNaN(num)) return null; |
||||
|
return isParenNeg || isMinusNeg ? -num : num; |
||||
|
} |
||||
|
|
||||
|
function findBestItemInRegion(items, region) { |
||||
|
let bestItem = null; |
||||
|
let bestDist = Infinity; |
||||
|
const cx = (region.xMin + region.xMax) / 2; |
||||
|
const cy = (region.yMin + region.yMax) / 2; |
||||
|
for (const item of items) { |
||||
|
if (item.matched) continue; |
||||
|
if ( |
||||
|
item.x >= region.xMin - POSITION_TOLERANCE && |
||||
|
item.x <= region.xMax + POSITION_TOLERANCE && |
||||
|
item.y >= region.yMin - POSITION_TOLERANCE && |
||||
|
item.y <= region.yMax + POSITION_TOLERANCE |
||||
|
) { |
||||
|
const dx = Math.abs(item.x - cx); |
||||
|
const dy = Math.abs(item.y - cy); |
||||
|
const d = Math.sqrt(dx * dx + dy * dy); |
||||
|
if (d < bestDist) { |
||||
|
bestDist = d; |
||||
|
bestItem = item; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
return bestItem; |
||||
|
} |
||||
|
|
||||
|
const fields = []; |
||||
|
const metadata = { |
||||
|
partnershipName: null, |
||||
|
partnershipEin: null, |
||||
|
partnerName: null, |
||||
|
partnerEin: null, |
||||
|
taxYear: null, |
||||
|
isAmended: false, |
||||
|
isFinal: false |
||||
|
}; |
||||
|
|
||||
|
// Closest-center assignment helper
|
||||
|
function assignItemsToRegions(items, regions) { |
||||
|
const candidates = []; |
||||
|
for (const item of items) { |
||||
|
if (item.matched) continue; |
||||
|
for (const region of regions) { |
||||
|
if ( |
||||
|
item.x >= region.xMin - POSITION_TOLERANCE && |
||||
|
item.x <= region.xMax + POSITION_TOLERANCE && |
||||
|
item.y >= region.yMin - POSITION_TOLERANCE && |
||||
|
item.y <= region.yMax + POSITION_TOLERANCE |
||||
|
) { |
||||
|
const cx = (region.xMin + region.xMax) / 2; |
||||
|
const cy = (region.yMin + region.yMax) / 2; |
||||
|
const dx = Math.abs(item.x - cx); |
||||
|
const dy = Math.abs(item.y - cy); |
||||
|
candidates.push({ item, region, distance: Math.sqrt(dx*dx + dy*dy) }); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
candidates.sort((a, b) => a.distance - b.distance); |
||||
|
const result = new Map(); |
||||
|
const usedItems = new Set(); |
||||
|
for (const { item, region } of candidates) { |
||||
|
if (usedItems.has(item) || result.has(region)) continue; |
||||
|
result.set(region, item); |
||||
|
usedItems.add(item); |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
// 1. Checkboxes (closest-center assignment)
|
||||
|
const checkboxRegions = K1_POSITION_REGIONS.filter(r => r.valueType === 'checkbox'); |
||||
|
const cbAssignments = assignItemsToRegions(dataItems, checkboxRegions); |
||||
|
for (const [region, item] of cbAssignments) { |
||||
|
const isChecked = ['X', '✓', '✗'].includes(item.text.toUpperCase()); |
||||
|
if (!isChecked) continue; |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: 'X', |
||||
|
numericValue: null, |
||||
|
fieldCategory: 'CHECKBOX', |
||||
|
isCheckbox: true, |
||||
|
subtype: null |
||||
|
}); |
||||
|
item.matched = true; |
||||
|
if (region.fieldId === 'FINAL_K1') metadata.isFinal = true; |
||||
|
if (region.fieldId === 'AMENDED_K1') metadata.isAmended = true; |
||||
|
} |
||||
|
|
||||
|
// 2. Part III — subtype regions first, then simple
|
||||
|
const partIIIRegions = K1_POSITION_REGIONS.filter( |
||||
|
r => r.fieldCategory === 'PART_III' && r.valueType !== 'checkbox' |
||||
|
); |
||||
|
const subtypeRegions = partIIIRegions.filter(r => r.hasSubtype); |
||||
|
const simpleRegions = partIIIRegions.filter(r => !r.hasSubtype); |
||||
|
|
||||
|
function extractSubtypeField(region) { |
||||
|
const codes = []; |
||||
|
const values = []; |
||||
|
for (const item of dataItems) { |
||||
|
if (item.matched) continue; |
||||
|
const inY = item.y >= region.yMin - POSITION_TOLERANCE && |
||||
|
item.y <= region.yMax + POSITION_TOLERANCE; |
||||
|
if (!inY) continue; |
||||
|
if (region.subtypeXMin !== null && region.subtypeXMax !== null && |
||||
|
item.x >= region.subtypeXMin - POSITION_TOLERANCE && |
||||
|
item.x <= region.subtypeXMax + POSITION_TOLERANCE) { |
||||
|
codes.push(item); |
||||
|
} else if (item.x >= region.xMin - POSITION_TOLERANCE && |
||||
|
item.x <= region.xMax + POSITION_TOLERANCE) { |
||||
|
values.push(item); |
||||
|
} |
||||
|
} |
||||
|
if (codes.length > 0) { |
||||
|
for (const code of codes) { |
||||
|
const paired = values.find(v => !v.matched && Math.abs(v.y - code.y) <= SUBTYPE_Y_TOLERANCE); |
||||
|
const raw = paired ? paired.text : ''; |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: raw, |
||||
|
numericValue: parseNumericValue(raw), |
||||
|
fieldCategory: region.fieldCategory, |
||||
|
isCheckbox: false, |
||||
|
subtype: code.text.trim() |
||||
|
}); |
||||
|
code.matched = true; |
||||
|
if (paired) paired.matched = true; |
||||
|
} |
||||
|
} else if (values.length > 0) { |
||||
|
const item = values[0]; |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: item.text, |
||||
|
numericValue: parseNumericValue(item.text), |
||||
|
fieldCategory: region.fieldCategory, |
||||
|
isCheckbox: false, |
||||
|
subtype: null |
||||
|
}); |
||||
|
item.matched = true; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
for (const region of subtypeRegions) { |
||||
|
extractSubtypeField(region); |
||||
|
} |
||||
|
for (const region of simpleRegions) { |
||||
|
const item = findBestItemInRegion(dataItems, region); |
||||
|
if (!item) continue; |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: item.text, |
||||
|
numericValue: parseNumericValue(item.text), |
||||
|
fieldCategory: region.fieldCategory, |
||||
|
isCheckbox: false, |
||||
|
subtype: null |
||||
|
}); |
||||
|
item.matched = true; |
||||
|
} |
||||
|
|
||||
|
// 3. Metadata — tax year (lowered threshold from 745 to 710)
|
||||
|
const taxYearItems = []; |
||||
|
for (const item of dataItems) { |
||||
|
if (item.matched) continue; |
||||
|
if (item.y > 710 && item.x > 200 && item.x < 350) { |
||||
|
if (/^\d{2,4}$/.test(item.text)) { |
||||
|
taxYearItems.push(item); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
if (taxYearItems.length >= 2) { |
||||
|
taxYearItems.sort((a, b) => a.x - b.x); |
||||
|
const combined = taxYearItems.map(i => i.text).join(''); |
||||
|
const year = parseInt(combined, 10); |
||||
|
if (year >= 1900 && year <= 2100) { |
||||
|
metadata.taxYear = year; |
||||
|
for (const item of taxYearItems) item.matched = true; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Text metadata
|
||||
|
function extractTextMetadata(regionFieldId, metadataKey) { |
||||
|
const region = K1_POSITION_REGIONS.find(r => r.fieldId === regionFieldId); |
||||
|
if (!region) return; |
||||
|
const matches = []; |
||||
|
for (const item of dataItems) { |
||||
|
if (item.matched) continue; |
||||
|
if ( |
||||
|
item.x >= region.xMin - POSITION_TOLERANCE && |
||||
|
item.x <= region.xMax + POSITION_TOLERANCE && |
||||
|
item.y >= region.yMin - POSITION_TOLERANCE && |
||||
|
item.y <= region.yMax + POSITION_TOLERANCE |
||||
|
) { |
||||
|
matches.push(item); |
||||
|
} |
||||
|
} |
||||
|
if (matches.length === 0) return; |
||||
|
matches.sort((a, b) => b.y - a.y); |
||||
|
const combined = matches.map(m => m.text).join(' ').trim(); |
||||
|
if (metadataKey && combined) { |
||||
|
metadata[metadataKey] = combined; |
||||
|
} |
||||
|
for (const item of matches) item.matched = true; |
||||
|
} |
||||
|
|
||||
|
extractTextMetadata('A_EIN', 'partnershipEin'); |
||||
|
extractTextMetadata('B_NAME', 'partnershipName'); |
||||
|
extractTextMetadata('C_IRS_CENTER', null); |
||||
|
extractTextMetadata('E_TIN', 'partnerEin'); |
||||
|
extractTextMetadata('F_NAME_ADDR', 'partnerName'); |
||||
|
|
||||
|
// Remaining metadata regions
|
||||
|
const metadataRegions = K1_POSITION_REGIONS.filter( |
||||
|
r => r.fieldCategory === 'METADATA' && r.valueType === 'text' |
||||
|
); |
||||
|
for (const region of metadataRegions) { |
||||
|
const item = findBestItemInRegion(dataItems, region); |
||||
|
if (!item) continue; |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: item.text, |
||||
|
numericValue: parseNumericValue(item.text), |
||||
|
fieldCategory: region.fieldCategory, |
||||
|
isCheckbox: false, |
||||
|
subtype: null |
||||
|
}); |
||||
|
item.matched = true; |
||||
|
} |
||||
|
|
||||
|
// 4. Sections J/K/L/M/N (closest-center assignment)
|
||||
|
for (const cat of ['SECTION_J', 'SECTION_K', 'SECTION_L', 'SECTION_M', 'SECTION_N']) { |
||||
|
const regions = K1_POSITION_REGIONS.filter(r => r.fieldCategory === cat && r.valueType !== 'checkbox'); |
||||
|
const assignments = assignItemsToRegions(dataItems, regions); |
||||
|
for (const [region, item] of assignments) { |
||||
|
fields.push({ |
||||
|
fieldId: region.fieldId, |
||||
|
boxNumber: region.boxNumber, |
||||
|
label: region.label, |
||||
|
rawValue: item.text, |
||||
|
numericValue: parseNumericValue(item.text), |
||||
|
fieldCategory: region.fieldCategory, |
||||
|
isCheckbox: false, |
||||
|
subtype: null |
||||
|
}); |
||||
|
item.matched = true; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ── Print results ──
|
||||
|
console.log('=== METADATA ==='); |
||||
|
console.log(JSON.stringify(metadata, null, 2)); |
||||
|
|
||||
|
console.log('\n=== EXTRACTED FIELDS ==='); |
||||
|
// Group by category
|
||||
|
const byCategory = {}; |
||||
|
for (const f of fields) { |
||||
|
const cat = f.fieldCategory; |
||||
|
if (!byCategory[cat]) byCategory[cat] = []; |
||||
|
byCategory[cat].push(f); |
||||
|
} |
||||
|
|
||||
|
for (const [cat, catFields] of Object.entries(byCategory)) { |
||||
|
console.log(`\n--- ${cat} ---`); |
||||
|
for (const f of catFields) { |
||||
|
const sub = f.subtype ? ` [${f.subtype}]` : ''; |
||||
|
const num = f.numericValue !== null ? ` (=${f.numericValue})` : ''; |
||||
|
console.log(` ${f.fieldId || f.boxNumber}: "${f.rawValue}"${sub}${num}`); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Unmapped
|
||||
|
const unmapped = dataItems.filter(i => !i.matched && (i.text.length > 1 || /\d/.test(i.text) || i.text === 'X')); |
||||
|
console.log(`\n=== UNMAPPED ITEMS (${unmapped.length}) ===`); |
||||
|
for (const u of unmapped) { |
||||
|
const x = Math.round(u.x * 10) / 10; |
||||
|
const y = Math.round(u.y * 10) / 10; |
||||
|
console.log(` "${u.text}" at (${x}, ${y}) font=${u.fontFamily}`); |
||||
|
} |
||||
|
|
||||
|
await pdfDoc.destroy(); |
||||
|
console.log('\nDone.'); |
||||
Loading…
Reference in new issue