diff --git a/apps/api/src/app/k1-import/extractors/k1-position-regions.ts b/apps/api/src/app/k1-import/extractors/k1-position-regions.ts index eae8aedf4..39f997602 100644 --- a/apps/api/src/app/k1-import/extractors/k1-position-regions.ts +++ b/apps/api/src/app/k1-import/extractors/k1-position-regions.ts @@ -756,7 +756,8 @@ const SECTION_N_REGIONS: K1PositionRegion[] = [ // Part III -- Left Column: Boxes 1-13 // // Row spacing: 24pt. Label y-positions measured from template text. -// Value column: x=370-445 (between label text and right-column boundary). +// Value column: x=370-435 (right edge reduced from 445 to prevent +// cross-column contamination with right column subtypes at x~455). // Subtype code column (boxes 11-13): x=305-370. // // Verified: BOX_11 subtype 'ZZ*' at (314.2, 314.4), @@ -770,7 +771,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 696, yMax: 720, hasSubtype: false, @@ -784,7 +785,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 672, yMax: 696, hasSubtype: false, @@ -798,7 +799,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 648, yMax: 672, hasSubtype: false, @@ -812,7 +813,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 624, yMax: 648, hasSubtype: false, @@ -826,7 +827,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 600, yMax: 624, hasSubtype: false, @@ -840,7 +841,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 576, yMax: 600, hasSubtype: false, @@ 
-854,7 +855,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 552, yMax: 576, hasSubtype: false, @@ -868,7 +869,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 528, yMax: 552, hasSubtype: false, @@ -882,7 +883,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 504, yMax: 528, hasSubtype: false, @@ -896,7 +897,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 480, yMax: 504, hasSubtype: false, @@ -910,7 +911,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 456, yMax: 480, hasSubtype: false, @@ -924,7 +925,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 432, yMax: 456, hasSubtype: false, @@ -938,7 +939,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 408, yMax: 432, hasSubtype: false, @@ -952,7 +953,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 384, yMax: 408, hasSubtype: false, @@ -966,7 +967,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 360, yMax: 384, hasSubtype: false, @@ -980,7 +981,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 336, yMax: 360, hasSubtype: false, @@ -994,7 
+995,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 288, yMax: 336, hasSubtype: true, @@ -1008,7 +1009,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 264, yMax: 288, hasSubtype: true, @@ -1022,7 +1023,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [ fieldCategory: 'PART_III', valueType: 'numeric', xMin: 370, - xMax: 445, + xMax: 435, yMin: 240, yMax: 264, hasSubtype: true, diff --git a/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts b/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts index 835c6aec3..cc6db3e82 100644 --- a/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts +++ b/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts @@ -339,14 +339,17 @@ export class PdfParseExtractor implements K1Extractor { r.valueType !== 'checkbox' ); - for (const region of partIIIRegions) { - if (region.hasSubtype) { - // T008-T009: Subtype pairing - this.extractSubtypeField(dataItems, fields, region); - } else { - // Simple value matching - this.extractSimpleField(dataItems, fields, region); - } + // CRITICAL: Process subtype regions FIRST (right column boxes 14-21 + // and left column boxes 11-13). This prevents left-column simple + // regions from stealing right-column subtype codes at x~455. 
+ const subtypeRegions = partIIIRegions.filter((r) => r.hasSubtype); + const simpleRegions = partIIIRegions.filter((r) => !r.hasSubtype); + + for (const region of subtypeRegions) { + this.extractSubtypeField(dataItems, fields, region); + } + for (const region of simpleRegions) { + this.extractSimpleField(dataItems, fields, region); } } @@ -508,8 +511,8 @@ export class PdfParseExtractor implements K1Extractor { const taxYearItems: DataItem[] = []; for (const item of dataItems) { if (item.matched) continue; - // Tax year region: near top of page (y > 760), x around 245-310 - if (item.y > 745 && item.x > 200 && item.x < 350) { + // Tax year region: near top of page, x around 200-350 + if (item.y > 710 && item.x > 200 && item.x < 350) { // Look for 2-digit or 4-digit year fragments if (/^\d{2,4}$/.test(item.text)) { taxYearItems.push(item); @@ -617,6 +620,8 @@ export class PdfParseExtractor implements K1Extractor { // ========================================================================== // T015-T018 (US3): Section J/K/L/M/N extraction + // Uses closest-center assignment so closely-spaced rows (Section L has + // 12pt row spacing, smaller than POSITION_TOLERANCE=15) get correct mapping. 
// ========================================================================== private extractSections( dataItems: DataItem[], @@ -632,39 +637,15 @@ export class PdfParseExtractor implements K1Extractor { for (const category of sectionCategories) { const regions = K1_POSITION_REGIONS.filter( - (r) => r.fieldCategory === category + (r) => + r.fieldCategory === category && + r.valueType !== 'checkbox' ); - for (const region of regions) { - if (region.valueType === 'checkbox') { - // Handled in extractCheckboxes - continue; - } - this.extractSimpleField(dataItems, fields, region); - } - } - } - - // ========================================================================== - // T019-T020 (US4): Checkbox extraction - // ========================================================================== - private extractCheckboxes( - dataItems: DataItem[], - fields: K1ExtractedField[], - metadata: K1ExtractionResult['metadata'] - ): void { - const checkboxRegions = K1_POSITION_REGIONS.filter( - (r) => r.valueType === 'checkbox' - ); - for (const region of checkboxRegions) { - const item = this.findBestItemInRegion(dataItems, region); - const isChecked = - item !== null && - (item.text.toUpperCase() === 'X' || - item.text.toUpperCase() === '✓' || - item.text.toUpperCase() === '✗'); + const assignments = this.assignItemsToRegions(dataItems, regions); - if (isChecked && item) { + for (const [region, item] of assignments) { + const numericValue = this.parseNumericValue(item.text); const { confidence, confidenceLevel } = this.computeConfidence( item.x, item.y, @@ -675,25 +656,74 @@ export class PdfParseExtractor implements K1Extractor { boxNumber: region.boxNumber, label: region.label, customLabel: null, - rawValue: 'X', - numericValue: null, + rawValue: item.text, + numericValue, confidence, confidenceLevel, isUserEdited: false, isReviewed: false, subtype: null, - fieldCategory: 'CHECKBOX', - isCheckbox: true + fieldCategory: region.fieldCategory, + isCheckbox: false }); item.matched 
= true; + } + } + } - - // Set metadata flags for known checkboxes - if (region.fieldId === 'FINAL_K1') { - metadata.isFinal = true; - } else if (region.fieldId === 'AMENDED_K1') { - metadata.isAmended = true; - } + // ========================================================================== + // T019-T020 (US4): Checkbox extraction + // Uses closest-center assignment to prevent adjacent checkbox regions + // (e.g., G_GENERAL/G_LIMITED, M_YES/M_NO) from stealing each other's marks. + // ========================================================================== + private extractCheckboxes( + dataItems: DataItem[], + fields: K1ExtractedField[], + metadata: K1ExtractionResult['metadata'] + ): void { + const checkboxRegions = K1_POSITION_REGIONS.filter( + (r) => r.valueType === 'checkbox' + ); + + // Restrict candidates to check marks before assignment. A stray text item + // closer to a region's center would otherwise claim the region (dropping + // the real mark) or push the mark onto a neighbouring checkbox. + const markItems = dataItems.filter((candidate) => + ['X', '✓', '✗'].includes(candidate.text.toUpperCase()) + ); + + const assignments = this.assignItemsToRegions(markItems, checkboxRegions); + + for (const [region, item] of assignments) { + const { confidence, confidenceLevel } = this.computeConfidence( + item.x, + item.y, + region + ); + + fields.push({ + boxNumber: region.boxNumber, + label: region.label, + customLabel: null, + rawValue: 'X', + numericValue: null, + confidence, + confidenceLevel, + isUserEdited: false, + isReviewed: false, + subtype: null, + fieldCategory: 'CHECKBOX', + isCheckbox: true + }); + + item.matched = true; + + // Set metadata flags for known checkboxes + if (region.fieldId === 'FINAL_K1') { + metadata.isFinal = true; + } else if (region.fieldId === 'AMENDED_K1') { + metadata.isAmended = true; } } } @@ -732,8 +762,13 @@ export class PdfParseExtractor implements K1Extractor { } // ========================================================================== - // T005: Position matching helper + // T005: Position matching helpers // ========================================================================== +
+ /** + * Find the single best (closest to center) unmatched item in a region. + * Used for isolated fields where only one region is being checked. + */ private findBestItemInRegion( dataItems: DataItem[], region: K1PositionRegion @@ -766,6 +801,62 @@ export class PdfParseExtractor implements K1Extractor { return bestItem; } + /** + * Closest-center assignment across a batch of regions. + * Builds all (item, region, distance) candidates, then greedily assigns + * by smallest distance first. Each region gets at most one item and each + * item is used at most once. This prevents adjacent/overlapping regions + * (e.g., G_GENERAL/G_LIMITED at boundary x=178, Section L rows 12pt apart) + * from stealing each other's data via tolerance-window overlap. + */ + private assignItemsToRegions( + dataItems: DataItem[], + regions: K1PositionRegion[] + ): Map<K1PositionRegion, DataItem> { + const candidates: { + item: DataItem; + region: K1PositionRegion; + distance: number; + }[] = []; + + for (const item of dataItems) { + if (item.matched) continue; + for (const region of regions) { + if ( + item.x >= region.xMin - POSITION_TOLERANCE && + item.x <= region.xMax + POSITION_TOLERANCE && + item.y >= region.yMin - POSITION_TOLERANCE && + item.y <= region.yMax + POSITION_TOLERANCE + ) { + const cx = (region.xMin + region.xMax) / 2; + const cy = (region.yMin + region.yMax) / 2; + const dx = Math.abs(item.x - cx); + const dy = Math.abs(item.y - cy); + candidates.push({ + item, + region, + distance: Math.sqrt(dx * dx + dy * dy) + }); + } + } + } + + // Sort by distance — closest matches first + candidates.sort((a, b) => a.distance - b.distance); + + // Greedy assignment: each region and item used at most once + const result = new Map<K1PositionRegion, DataItem>(); + const usedItems = new Set<DataItem>(); + + for (const { item, region } of candidates) { + if (usedItems.has(item) || result.has(region)) continue; + result.set(region, item); + usedItems.add(item); + } + + return result; + } + //
========================================================================== // Preserved: isDigitalK1 — used by isAvailable() and external callers // ========================================================================== diff --git a/tools/test-k1-parse.mjs b/tools/test-k1-parse.mjs new file mode 100644 index 000000000..6aac6e296 --- /dev/null +++ b/tools/test-k1-parse.mjs @@ -0,0 +1,411 @@ +/** + * Test script: runs the PdfParseExtractor logic directly on a K-1 PDF + * and prints all extracted fields, metadata, and unmapped items. + * + * Usage: node tools/test-k1-parse.mjs <path-to-k1.pdf> + */ +import { readFileSync } from 'fs'; +import { resolve } from 'path'; + +// ── pdfjs-dist setup ── +const { getDocument, GlobalWorkerOptions } = await import( + 'pdfjs-dist/legacy/build/pdf.mjs' +); + +const workerPath = + 'file:///' + + resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs').replace( + /\\/g, + '/' + ); +GlobalWorkerOptions.workerSrc = workerPath; + +// ── Load k1-position-regions (need TS compilation) ── +// The region table lives in a TypeScript source file that Node cannot +// import directly, so it is bundled to a temporary ESM file with esbuild +// and loaded via dynamic import further down. + +// The extraction steps themselves are re-implemented in this script as a +// plain-JS mirror of PdfParseExtractor, so the two can drift apart; keep +// them in sync when the extractor changes.
+ +const POSITION_TOLERANCE = 15; +const SUBTYPE_Y_TOLERANCE = 8; + +// ── Import the regions by dynamically compiling the TS ── +// Bundle the TS regions file to a temporary ESM module with esbuild, then import it + +import { execSync } from 'child_process'; +import { writeFileSync, unlinkSync, existsSync } from 'fs'; + +// Build a temp bundle of just the regions file +const regionsTsPath = resolve( + 'apps/api/src/app/k1-import/extractors/k1-position-regions.ts' +); +const regionsTmpPath = resolve('tools/_tmp_regions.mjs'); + +try { + execSync( + `npx esbuild "${regionsTsPath}" --bundle --format=esm --outfile="${regionsTmpPath}" --platform=node`, + { stdio: 'pipe' } + ); +} catch (e) { + console.error('Failed to compile regions file:', e.stderr?.toString()); + process.exit(1); +} + +const regionsModule = await import('file:///' + regionsTmpPath.replace(/\\/g, '/')); +const K1_POSITION_REGIONS = regionsModule.K1_POSITION_REGIONS; + +// Clean up +try { unlinkSync(regionsTmpPath); } catch {} + +// ── PDF parsing ── +const pdfPath = process.argv[2]; +if (!pdfPath) { + console.error('Usage: node tools/test-k1-parse.mjs <path-to-k1.pdf>'); + process.exit(1); +} + +const buffer = readFileSync(pdfPath); +const loadingTask = getDocument({ + data: new Uint8Array(buffer), + standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/', + cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/', + cMapPacked: true, + isEvalSupported: false, + disableFontFace: true +}); + +const pdfDoc = await loadingTask.promise; +const page = await pdfDoc.getPage(1); +const textContent = await page.getTextContent({ includeMarkedContent: false }); +const items = textContent.items; +const styles = textContent.styles; + +// Filter data items (non-serif) +const dataItems = []; +for (const item of items) { + const text = item.str.trim(); + if (!text) continue; + const style = styles[item.fontName]; + if (!style) continue; + const fontFamily = style.fontFamily.toLowerCase(); + if (fontFamily === 'serif')
continue; + dataItems.push({ + text, + x: item.transform[4], + y: item.transform[5], + fontName: item.fontName, + fontFamily, + matched: false + }); +} + +console.log(`Total data items: ${dataItems.length}\n`); + +// ── Parsing logic (mirrors PdfParseExtractor) ── +function parseNumericValue(raw) { + if (!raw) return null; + const trimmed = raw.trim(); + if (!trimmed) return null; + const upper = trimmed.toUpperCase(); + if (['SEE STMT', 'STMT', 'SEE STATEMENT', 'X', 'E-FILE', 'YES', 'NO'].includes(upper)) + return null; + let cleaned = trimmed; + const isParenNeg = /^\(.*\)$/.test(cleaned); + cleaned = cleaned.replace(/[$,()]/g, ''); + const isMinusNeg = cleaned.startsWith('-'); + if (isMinusNeg) cleaned = cleaned.substring(1); + const num = parseFloat(cleaned); + if (isNaN(num)) return null; + return isParenNeg || isMinusNeg ? -num : num; +} + +function findBestItemInRegion(items, region) { + let bestItem = null; + let bestDist = Infinity; + const cx = (region.xMin + region.xMax) / 2; + const cy = (region.yMin + region.yMax) / 2; + for (const item of items) { + if (item.matched) continue; + if ( + item.x >= region.xMin - POSITION_TOLERANCE && + item.x <= region.xMax + POSITION_TOLERANCE && + item.y >= region.yMin - POSITION_TOLERANCE && + item.y <= region.yMax + POSITION_TOLERANCE + ) { + const dx = Math.abs(item.x - cx); + const dy = Math.abs(item.y - cy); + const d = Math.sqrt(dx * dx + dy * dy); + if (d < bestDist) { + bestDist = d; + bestItem = item; + } + } + } + return bestItem; +} + +const fields = []; +const metadata = { + partnershipName: null, + partnershipEin: null, + partnerName: null, + partnerEin: null, + taxYear: null, + isAmended: false, + isFinal: false +}; + +// Closest-center assignment helper +function assignItemsToRegions(items, regions) { + const candidates = []; + for (const item of items) { + if (item.matched) continue; + for (const region of regions) { + if ( + item.x >= region.xMin - POSITION_TOLERANCE && + item.x <= region.xMax + 
POSITION_TOLERANCE && + item.y >= region.yMin - POSITION_TOLERANCE && + item.y <= region.yMax + POSITION_TOLERANCE + ) { + const cx = (region.xMin + region.xMax) / 2; + const cy = (region.yMin + region.yMax) / 2; + const dx = Math.abs(item.x - cx); + const dy = Math.abs(item.y - cy); + candidates.push({ item, region, distance: Math.sqrt(dx*dx + dy*dy) }); + } + } + } + candidates.sort((a, b) => a.distance - b.distance); + const result = new Map(); + const usedItems = new Set(); + for (const { item, region } of candidates) { + if (usedItems.has(item) || result.has(region)) continue; + result.set(region, item); + usedItems.add(item); + } + return result; +} + +// 1. Checkboxes (closest-center assignment) +const checkboxRegions = K1_POSITION_REGIONS.filter(r => r.valueType === 'checkbox'); +// Only check marks compete for checkbox regions; stray text must not claim one. +const markItems = dataItems.filter(i => ['X', '✓', '✗'].includes(i.text.toUpperCase())); +const cbAssignments = assignItemsToRegions(markItems, checkboxRegions); +for (const [region, item] of cbAssignments) { + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: 'X', + numericValue: null, + fieldCategory: 'CHECKBOX', + isCheckbox: true, + subtype: null + }); + item.matched = true; + if (region.fieldId === 'FINAL_K1') metadata.isFinal = true; + if (region.fieldId === 'AMENDED_K1') metadata.isAmended = true; +} + +// 2.
Part III — subtype regions first, then simple +const partIIIRegions = K1_POSITION_REGIONS.filter( + r => r.fieldCategory === 'PART_III' && r.valueType !== 'checkbox' +); +const subtypeRegions = partIIIRegions.filter(r => r.hasSubtype); +const simpleRegions = partIIIRegions.filter(r => !r.hasSubtype); + +function extractSubtypeField(region) { + const codes = []; + const values = []; + for (const item of dataItems) { + if (item.matched) continue; + const inY = item.y >= region.yMin - POSITION_TOLERANCE && + item.y <= region.yMax + POSITION_TOLERANCE; + if (!inY) continue; + if (region.subtypeXMin !== null && region.subtypeXMax !== null && + item.x >= region.subtypeXMin - POSITION_TOLERANCE && + item.x <= region.subtypeXMax + POSITION_TOLERANCE) { + codes.push(item); + } else if (item.x >= region.xMin - POSITION_TOLERANCE && + item.x <= region.xMax + POSITION_TOLERANCE) { + values.push(item); + } + } + if (codes.length > 0) { + for (const code of codes) { + const paired = values.find(v => !v.matched && Math.abs(v.y - code.y) <= SUBTYPE_Y_TOLERANCE); + const raw = paired ? 
paired.text : ''; + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: raw, + numericValue: parseNumericValue(raw), + fieldCategory: region.fieldCategory, + isCheckbox: false, + subtype: code.text.trim() + }); + code.matched = true; + if (paired) paired.matched = true; + } + } else if (values.length > 0) { + const item = values[0]; + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: item.text, + numericValue: parseNumericValue(item.text), + fieldCategory: region.fieldCategory, + isCheckbox: false, + subtype: null + }); + item.matched = true; + } +} + +for (const region of subtypeRegions) { + extractSubtypeField(region); +} +for (const region of simpleRegions) { + const item = findBestItemInRegion(dataItems, region); + if (!item) continue; + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: item.text, + numericValue: parseNumericValue(item.text), + fieldCategory: region.fieldCategory, + isCheckbox: false, + subtype: null + }); + item.matched = true; +} + +// 3. 
Metadata — tax year (lowered threshold from 745 to 710) +const taxYearItems = []; +for (const item of dataItems) { + if (item.matched) continue; + if (item.y > 710 && item.x > 200 && item.x < 350) { + if (/^\d{2,4}$/.test(item.text)) { + taxYearItems.push(item); + } + } +} +if (taxYearItems.length >= 2) { + taxYearItems.sort((a, b) => a.x - b.x); + const combined = taxYearItems.map(i => i.text).join(''); + const year = parseInt(combined, 10); + if (year >= 1900 && year <= 2100) { + metadata.taxYear = year; + for (const item of taxYearItems) item.matched = true; + } +} + +// Text metadata +function extractTextMetadata(regionFieldId, metadataKey) { + const region = K1_POSITION_REGIONS.find(r => r.fieldId === regionFieldId); + if (!region) return; + const matches = []; + for (const item of dataItems) { + if (item.matched) continue; + if ( + item.x >= region.xMin - POSITION_TOLERANCE && + item.x <= region.xMax + POSITION_TOLERANCE && + item.y >= region.yMin - POSITION_TOLERANCE && + item.y <= region.yMax + POSITION_TOLERANCE + ) { + matches.push(item); + } + } + if (matches.length === 0) return; + matches.sort((a, b) => b.y - a.y); + const combined = matches.map(m => m.text).join(' ').trim(); + if (metadataKey && combined) { + metadata[metadataKey] = combined; + } + for (const item of matches) item.matched = true; +} + +extractTextMetadata('A_EIN', 'partnershipEin'); +extractTextMetadata('B_NAME', 'partnershipName'); +extractTextMetadata('C_IRS_CENTER', null); +extractTextMetadata('E_TIN', 'partnerEin'); +extractTextMetadata('F_NAME_ADDR', 'partnerName'); + +// Remaining metadata regions +const metadataRegions = K1_POSITION_REGIONS.filter( + r => r.fieldCategory === 'METADATA' && r.valueType === 'text' +); +for (const region of metadataRegions) { + const item = findBestItemInRegion(dataItems, region); + if (!item) continue; + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: item.text, + numericValue: 
parseNumericValue(item.text), + fieldCategory: region.fieldCategory, + isCheckbox: false, + subtype: null + }); + item.matched = true; +} + +// 4. Sections J/K/L/M/N (closest-center assignment) +for (const cat of ['SECTION_J', 'SECTION_K', 'SECTION_L', 'SECTION_M', 'SECTION_N']) { + const regions = K1_POSITION_REGIONS.filter(r => r.fieldCategory === cat && r.valueType !== 'checkbox'); + const assignments = assignItemsToRegions(dataItems, regions); + for (const [region, item] of assignments) { + fields.push({ + fieldId: region.fieldId, + boxNumber: region.boxNumber, + label: region.label, + rawValue: item.text, + numericValue: parseNumericValue(item.text), + fieldCategory: region.fieldCategory, + isCheckbox: false, + subtype: null + }); + item.matched = true; + } +} + +// ── Print results ── +console.log('=== METADATA ==='); +console.log(JSON.stringify(metadata, null, 2)); + +console.log('\n=== EXTRACTED FIELDS ==='); +// Group by category +const byCategory = {}; +for (const f of fields) { + const cat = f.fieldCategory; + if (!byCategory[cat]) byCategory[cat] = []; + byCategory[cat].push(f); +} + +for (const [cat, catFields] of Object.entries(byCategory)) { + console.log(`\n--- ${cat} ---`); + for (const f of catFields) { + const sub = f.subtype ? ` [${f.subtype}]` : ''; + const num = f.numericValue !== null ? ` (=${f.numericValue})` : ''; + console.log(` ${f.fieldId || f.boxNumber}: "${f.rawValue}"${sub}${num}`); + } +} + +// Unmapped +const unmapped = dataItems.filter(i => !i.matched && (i.text.length > 1 || /\d/.test(i.text) || i.text === 'X')); +console.log(`\n=== UNMAPPED ITEMS (${unmapped.length}) ===`); +for (const u of unmapped) { + const x = Math.round(u.x * 10) / 10; + const y = Math.round(u.y * 10) / 10; + console.log(` "${u.text}" at (${x}, ${y}) font=${u.fontFamily}`); +} + +await pdfDoc.destroy(); +console.log('\nDone.');