Browse Source

fix(k1-parser): fix cross-column contamination, checkbox, Section L, and tax year extraction

- Reduce left column xMax from 445 to 435 to prevent right-column subtype
  codes (at x~455) from being captured via 15pt tolerance
- Process subtype regions before simple regions in extractPartIII to ensure
  right-column codes are consumed by their correct boxes first
- Add assignItemsToRegions() closest-center matching algorithm to resolve
  contention when adjacent/overlapping regions compete for the same item
- Refactor extractCheckboxes to use closest-center: fixes G_LIMITED being
  detected as G_GENERAL (x=180.3 at boundary x=178) and M_NO as M_YES
- Refactor extractSections to use closest-center: fixes Section L rows
  (12pt spacing < 15pt tolerance) being assigned to wrong fields
- Lower tax year y-threshold from 745 to 710 to capture year fragments
  at y=727.7, enabling proper 20+25 -> 2025 combination
- Add tools/test-k1-parse.mjs for standalone PDF parsing validation

Verified against actual K-1 PDF: all fields match expected values,
unmapped items reduced from many to 3 (noise only).
pull/6701/head
Robert Patch 2 months ago
parent
commit
cda15c95bb
  1. 41
      apps/api/src/app/k1-import/extractors/k1-position-regions.ts
  2. 137
      apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
  3. 411
      tools/test-k1-parse.mjs

41
apps/api/src/app/k1-import/extractors/k1-position-regions.ts

@ -756,7 +756,8 @@ const SECTION_N_REGIONS: K1PositionRegion[] = [
// Part III -- Left Column: Boxes 1-13 // Part III -- Left Column: Boxes 1-13
// //
// Row spacing: 24pt. Label y-positions measured from template text. // Row spacing: 24pt. Label y-positions measured from template text.
// Value column: x=370-445 (between label text and right-column boundary). // Value column: x=370-435 (right edge reduced from 445 to prevent
// cross-column contamination with right column subtypes at x~455).
// Subtype code column (boxes 11-13): x=305-370. // Subtype code column (boxes 11-13): x=305-370.
// //
// Verified: BOX_11 subtype 'ZZ*' at (314.2, 314.4), // Verified: BOX_11 subtype 'ZZ*' at (314.2, 314.4),
@ -770,7 +771,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 696, yMin: 696,
yMax: 720, yMax: 720,
hasSubtype: false, hasSubtype: false,
@ -784,7 +785,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 672, yMin: 672,
yMax: 696, yMax: 696,
hasSubtype: false, hasSubtype: false,
@ -798,7 +799,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 648, yMin: 648,
yMax: 672, yMax: 672,
hasSubtype: false, hasSubtype: false,
@ -812,7 +813,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 624, yMin: 624,
yMax: 648, yMax: 648,
hasSubtype: false, hasSubtype: false,
@ -826,7 +827,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 600, yMin: 600,
yMax: 624, yMax: 624,
hasSubtype: false, hasSubtype: false,
@ -840,7 +841,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 576, yMin: 576,
yMax: 600, yMax: 600,
hasSubtype: false, hasSubtype: false,
@ -854,7 +855,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 552, yMin: 552,
yMax: 576, yMax: 576,
hasSubtype: false, hasSubtype: false,
@ -868,7 +869,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 528, yMin: 528,
yMax: 552, yMax: 552,
hasSubtype: false, hasSubtype: false,
@ -882,7 +883,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 504, yMin: 504,
yMax: 528, yMax: 528,
hasSubtype: false, hasSubtype: false,
@ -896,7 +897,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 480, yMin: 480,
yMax: 504, yMax: 504,
hasSubtype: false, hasSubtype: false,
@ -910,7 +911,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 456, yMin: 456,
yMax: 480, yMax: 480,
hasSubtype: false, hasSubtype: false,
@ -924,7 +925,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 432, yMin: 432,
yMax: 456, yMax: 456,
hasSubtype: false, hasSubtype: false,
@ -938,7 +939,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 408, yMin: 408,
yMax: 432, yMax: 432,
hasSubtype: false, hasSubtype: false,
@ -952,7 +953,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 384, yMin: 384,
yMax: 408, yMax: 408,
hasSubtype: false, hasSubtype: false,
@ -966,7 +967,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 360, yMin: 360,
yMax: 384, yMax: 384,
hasSubtype: false, hasSubtype: false,
@ -980,7 +981,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 336, yMin: 336,
yMax: 360, yMax: 360,
hasSubtype: false, hasSubtype: false,
@ -994,7 +995,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 288, yMin: 288,
yMax: 336, yMax: 336,
hasSubtype: true, hasSubtype: true,
@ -1008,7 +1009,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 264, yMin: 264,
yMax: 288, yMax: 288,
hasSubtype: true, hasSubtype: true,
@ -1022,7 +1023,7 @@ const PART_III_LEFT_REGIONS: K1PositionRegion[] = [
fieldCategory: 'PART_III', fieldCategory: 'PART_III',
valueType: 'numeric', valueType: 'numeric',
xMin: 370, xMin: 370,
xMax: 445, xMax: 435,
yMin: 240, yMin: 240,
yMax: 264, yMax: 264,
hasSubtype: true, hasSubtype: true,

137
apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts

@ -339,14 +339,17 @@ export class PdfParseExtractor implements K1Extractor {
r.valueType !== 'checkbox' r.valueType !== 'checkbox'
); );
for (const region of partIIIRegions) { // CRITICAL: Process subtype regions FIRST (right column boxes 14-21
if (region.hasSubtype) { // and left column boxes 11-13). This prevents left-column simple
// T008-T009: Subtype pairing // regions from stealing right-column subtype codes at x~455.
const subtypeRegions = partIIIRegions.filter((r) => r.hasSubtype);
const simpleRegions = partIIIRegions.filter((r) => !r.hasSubtype);
for (const region of subtypeRegions) {
this.extractSubtypeField(dataItems, fields, region); this.extractSubtypeField(dataItems, fields, region);
} else {
// Simple value matching
this.extractSimpleField(dataItems, fields, region);
} }
for (const region of simpleRegions) {
this.extractSimpleField(dataItems, fields, region);
} }
} }
@ -508,8 +511,8 @@ export class PdfParseExtractor implements K1Extractor {
const taxYearItems: DataItem[] = []; const taxYearItems: DataItem[] = [];
for (const item of dataItems) { for (const item of dataItems) {
if (item.matched) continue; if (item.matched) continue;
// Tax year region: near top of page (y > 760), x around 245-310 // Tax year region: near top of page, x around 200-350
if (item.y > 745 && item.x > 200 && item.x < 350) { if (item.y > 710 && item.x > 200 && item.x < 350) {
// Look for 2-digit or 4-digit year fragments // Look for 2-digit or 4-digit year fragments
if (/^\d{2,4}$/.test(item.text)) { if (/^\d{2,4}$/.test(item.text)) {
taxYearItems.push(item); taxYearItems.push(item);
@ -617,6 +620,8 @@ export class PdfParseExtractor implements K1Extractor {
// ========================================================================== // ==========================================================================
// T015-T018 (US3): Section J/K/L/M/N extraction // T015-T018 (US3): Section J/K/L/M/N extraction
// Uses closest-center assignment so closely-spaced rows (Section L has
// 12pt row spacing, smaller than POSITION_TOLERANCE=15) get correct mapping.
// ========================================================================== // ==========================================================================
private extractSections( private extractSections(
dataItems: DataItem[], dataItems: DataItem[],
@ -632,20 +637,45 @@ export class PdfParseExtractor implements K1Extractor {
for (const category of sectionCategories) { for (const category of sectionCategories) {
const regions = K1_POSITION_REGIONS.filter( const regions = K1_POSITION_REGIONS.filter(
(r) => r.fieldCategory === category (r) =>
r.fieldCategory === category &&
r.valueType !== 'checkbox'
); );
for (const region of regions) {
if (region.valueType === 'checkbox') { const assignments = this.assignItemsToRegions(dataItems, regions);
// Handled in extractCheckboxes
continue; for (const [region, item] of assignments) {
} const numericValue = this.parseNumericValue(item.text);
this.extractSimpleField(dataItems, fields, region); const { confidence, confidenceLevel } = this.computeConfidence(
item.x,
item.y,
region
);
fields.push({
boxNumber: region.boxNumber,
label: region.label,
customLabel: null,
rawValue: item.text,
numericValue,
confidence,
confidenceLevel,
isUserEdited: false,
isReviewed: false,
subtype: null,
fieldCategory: region.fieldCategory,
isCheckbox: false
});
item.matched = true;
} }
} }
} }
// ========================================================================== // ==========================================================================
// T019-T020 (US4): Checkbox extraction // T019-T020 (US4): Checkbox extraction
// Uses closest-center assignment to prevent adjacent checkbox regions
// (e.g., G_GENERAL/G_LIMITED, M_YES/M_NO) from stealing each other's marks.
// ========================================================================== // ==========================================================================
private extractCheckboxes( private extractCheckboxes(
dataItems: DataItem[], dataItems: DataItem[],
@ -656,15 +686,16 @@ export class PdfParseExtractor implements K1Extractor {
(r) => r.valueType === 'checkbox' (r) => r.valueType === 'checkbox'
); );
for (const region of checkboxRegions) { const assignments = this.assignItemsToRegions(dataItems, checkboxRegions);
const item = this.findBestItemInRegion(dataItems, region);
for (const [region, item] of assignments) {
const isChecked = const isChecked =
item !== null && item.text.toUpperCase() === 'X' ||
(item.text.toUpperCase() === 'X' ||
item.text.toUpperCase() === '✓' || item.text.toUpperCase() === '✓' ||
item.text.toUpperCase() === '✗'); item.text.toUpperCase() === '✗';
if (!isChecked) continue;
if (isChecked && item) {
const { confidence, confidenceLevel } = this.computeConfidence( const { confidence, confidenceLevel } = this.computeConfidence(
item.x, item.x,
item.y, item.y,
@ -696,7 +727,6 @@ export class PdfParseExtractor implements K1Extractor {
} }
} }
} }
}
// ========================================================================== // ==========================================================================
// T021 (US5): Unmapped items collection // T021 (US5): Unmapped items collection
@ -732,8 +762,13 @@ export class PdfParseExtractor implements K1Extractor {
} }
// ========================================================================== // ==========================================================================
// T005: Position matching helper // T005: Position matching helpers
// ========================================================================== // ==========================================================================
/**
* Find the single best (closest to center) unmatched item in a region.
* Used for isolated fields where only one region is being checked.
*/
private findBestItemInRegion( private findBestItemInRegion(
dataItems: DataItem[], dataItems: DataItem[],
region: K1PositionRegion region: K1PositionRegion
@ -766,6 +801,62 @@ export class PdfParseExtractor implements K1Extractor {
return bestItem; return bestItem;
} }
/**
 * Resolve contention between regions by pairing each region with the
 * unmatched item nearest to its center.
 *
 * Every (item, region) pair where the item lies inside the region's bounds
 * widened by POSITION_TOLERANCE becomes a candidate. Candidates are then
 * consumed in order of increasing center distance; a pair is accepted only
 * if neither its region nor its item has been taken already, so a region
 * receives at most one item and an item is given to at most one region.
 *
 * Items are not flagged as matched here; callers do that.
 *
 * @param dataItems Items to consider; entries already flagged `matched` are ignored.
 * @param regions Regions competing for the items.
 * @returns Map from each region that received an item to that item.
 */
private assignItemsToRegions(
  dataItems: DataItem[],
  regions: K1PositionRegion[]
): Map<K1PositionRegion, DataItem> {
  const pairs: {
    item: DataItem;
    region: K1PositionRegion;
    distance: number;
  }[] = [];

  for (const item of dataItems) {
    if (item.matched) continue;

    for (const region of regions) {
      const withinX =
        item.x >= region.xMin - POSITION_TOLERANCE &&
        item.x <= region.xMax + POSITION_TOLERANCE;
      const withinY =
        item.y >= region.yMin - POSITION_TOLERANCE &&
        item.y <= region.yMax + POSITION_TOLERANCE;
      if (!withinX || !withinY) continue;

      // Euclidean distance from the item to the region's center point.
      const offsetX = item.x - (region.xMin + region.xMax) / 2;
      const offsetY = item.y - (region.yMin + region.yMax) / 2;
      pairs.push({
        item,
        region,
        distance: Math.sqrt(offsetX * offsetX + offsetY * offsetY)
      });
    }
  }

  // Nearest pairs get first pick.
  pairs.sort((a, b) => a.distance - b.distance);

  const assigned = new Map<K1PositionRegion, DataItem>();
  const takenItems = new Set<DataItem>();

  for (const pair of pairs) {
    if (assigned.has(pair.region) || takenItems.has(pair.item)) continue;
    assigned.set(pair.region, pair.item);
    takenItems.add(pair.item);
  }

  return assigned;
}
// ========================================================================== // ==========================================================================
// Preserved: isDigitalK1 — used by isAvailable() and external callers // Preserved: isDigitalK1 — used by isAvailable() and external callers
// ========================================================================== // ==========================================================================

411
tools/test-k1-parse.mjs

@ -0,0 +1,411 @@
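/**
 * Test script: re-implements (mirrors) the PdfParseExtractor matching logic
 * inline, runs it on page 1 of a K-1 PDF, and prints all extracted fields,
 * metadata, and unmapped items.
 *
 * Run it from the repository root: the paths to node_modules and to
 * k1-position-regions.ts are resolved against the current working directory,
 * and the regions file is bundled on the fly with `npx esbuild`.
 *
 * Usage: node tools/test-k1-parse.mjs <path-to-pdf>
 */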
import { readFileSync } from 'fs';
import { resolve } from 'path';
import { pathToFileURL } from 'url';
// ── pdfjs-dist setup ──
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);
// The worker location is handed to pdf.js as a file: URL. pathToFileURL
// produces a well-formed URL on every platform (no "file:////" on POSIX, no
// manual backslash replacement on Windows) and percent-encodes characters
// such as '#', '?' and '%' that would otherwise corrupt a hand-built URL.
const workerPath = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;
GlobalWorkerOptions.workerSrc = workerPath;
// ── Matching tolerances ──
// The region table is not inlined: it is bundled from k1-position-regions.ts
// with esbuild further down. Only the matching logic and the two constants
// below are replicated in this script.
// NOTE(review): these presumably have to equal the values used by
// pdf-parse-extractor.ts — confirm when either side changes.
//
// Slack added on every side of a region's x/y bounds when testing whether
// an item belongs to that region.
const POSITION_TOLERANCE = 15;
// Largest vertical distance allowed between a subtype code and the value
// that is paired with it.
const SUBTYPE_Y_TOLERANCE = 8;
// ── Import the regions by compiling the TS on the fly ──
// esbuild bundles the regions file into a temporary ESM file, which is then
// loaded with a dynamic import() and deleted again.
import { execSync } from 'child_process';
import { writeFileSync, unlinkSync, existsSync } from 'fs';
// Build a temp bundle of just the regions file
const regionsTsPath = resolve(
  'apps/api/src/app/k1-import/extractors/k1-position-regions.ts'
);
const regionsTmpPath = resolve('tools/_tmp_regions.mjs');
try {
  execSync(
    `npx esbuild "${regionsTsPath}" --bundle --format=esm --outfile="${regionsTmpPath}" --platform=node`,
    { stdio: 'pipe' }
  );
} catch (e) {
  // Errors raised by a failed child process carry its stderr; anything else
  // (e.g. npx could not be spawned) only has a message.
  console.error(
    'Failed to compile regions file:',
    e.stderr?.toString() ?? e.message
  );
  process.exit(1);
}
let regionsModule;
try {
  // pathToFileURL builds a valid file: URL on every platform and escapes
  // characters that are special inside URLs.
  regionsModule = await import(pathToFileURL(regionsTmpPath).href);
} finally {
  // Clean up the bundle even when the import throws; a failure to delete
  // it is deliberately ignored (best effort).
  try {
    unlinkSync(regionsTmpPath);
  } catch {}
}
const K1_POSITION_REGIONS = regionsModule.K1_POSITION_REGIONS;
if (!Array.isArray(K1_POSITION_REGIONS)) {
  // Fail here with a clear message instead of a TypeError on the first
  // .filter() call further down.
  console.error(
    'Regions bundle does not export a K1_POSITION_REGIONS array.'
  );
  process.exit(1);
}
// ── PDF parsing ──
// The PDF path is the first user-supplied command-line argument.
const [, , pdfPath] = process.argv;
if (!pdfPath) {
  console.error('Usage: node tools/test-k1-parse.mjs <path-to-pdf>');
  process.exit(1);
}
// The file contents are passed to pdf.js as a Uint8Array; font and cMap
// resources point at the pdfjs-dist copies under node_modules.
const pdfDoc = await getDocument({
  data: new Uint8Array(readFileSync(pdfPath)),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
}).promise;
// Only the first page is inspected.
const page = await pdfDoc.getPage(1);
const { items, styles } = await page.getTextContent({
  includeMarkedContent: false
});
// Collect candidate data items: every non-empty text run whose font style is
// known and whose font family is not 'serif'.
// NOTE(review): presumably the form's printed labels are the serif text and
// the filled-in values are everything else — confirm against a sample PDF.
const dataItems = items.flatMap((entry) => {
  const text = entry.str.trim();
  if (!text) return [];

  const fontStyle = styles[entry.fontName];
  if (!fontStyle) return [];

  const fontFamily = fontStyle.fontFamily.toLowerCase();
  if (fontFamily === 'serif') return [];

  return [
    {
      text,
      // transform[4] / transform[5] are used as the item's x / y position.
      x: entry.transform[4],
      y: entry.transform[5],
      fontName: entry.fontName,
      fontFamily,
      // Set to true once an extraction pass has consumed the item.
      matched: false
    }
  ];
});
console.log(`Total data items: ${dataItems.length}\n`);
// ── Parsing logic (mirrors PdfParseExtractor) ──
/**
 * Convert the raw text of a form value into a number.
 *
 * Returns null for empty/blank input, for a fixed list of non-numeric
 * markers (compared case-insensitively), and for text that parseFloat
 * cannot read. '$', ',' and parentheses are stripped before parsing. The
 * result is negated when the whole text is wrapped in parentheses or when
 * the stripped text starts with '-'.
 *
 * @param raw Raw text, possibly null/undefined.
 * @returns The parsed number, or null when the text is not numeric.
 */
function parseNumericValue(raw) {
  const text = raw ? raw.trim() : '';
  if (text === '') return null;

  const nonNumericMarkers = ['SEE STMT', 'STMT', 'SEE STATEMENT', 'X', 'E-FILE', 'YES', 'NO'];
  if (nonNumericMarkers.includes(text.toUpperCase())) {
    return null;
  }

  // Accounting style: "(123)" means -123.
  const wrappedInParens = /^\(.*\)$/.test(text);

  let digits = text.replace(/[$,()]/g, '');
  const leadingMinus = digits.startsWith('-');
  if (leadingMinus) {
    digits = digits.substring(1);
  }

  const magnitude = parseFloat(digits);
  if (isNaN(magnitude)) return null;

  if (wrappedInParens || leadingMinus) {
    return -magnitude;
  }
  return magnitude;
}
/**
 * Find the unmatched item closest to the center of a region.
 *
 * An item qualifies when it lies inside the region's bounds widened by
 * `tolerance` on every side. Among qualifying items the one with the
 * smallest Euclidean distance to the region's center wins; on an exact tie
 * the item that appears first in `items` is kept.
 *
 * @param items Items with `x`, `y` and `matched`; matched items are skipped.
 * @param region Object with `xMin`, `xMax`, `yMin`, `yMax`.
 * @param tolerance Slack around the region bounds. Defaults to
 *   POSITION_TOLERANCE, so existing two-argument calls behave as before;
 *   pass a value to experiment with tighter or looser matching.
 * @returns The best item, or null when no item qualifies.
 */
function findBestItemInRegion(items, region, tolerance = POSITION_TOLERANCE) {
  const cx = (region.xMin + region.xMax) / 2;
  const cy = (region.yMin + region.yMax) / 2;

  let bestItem = null;
  let bestDist = Infinity;

  for (const item of items) {
    if (item.matched) continue;

    const inside =
      item.x >= region.xMin - tolerance &&
      item.x <= region.xMax + tolerance &&
      item.y >= region.yMin - tolerance &&
      item.y <= region.yMax + tolerance;
    if (!inside) continue;

    const dx = item.x - cx;
    const dy = item.y - cy;
    const d = Math.sqrt(dx * dx + dy * dy);
    // Strict '<' keeps the earliest item on a tie.
    if (d < bestDist) {
      bestDist = d;
      bestItem = item;
    }
  }

  return bestItem;
}
// Accumulators filled in by the extraction passes further down.
// `fields` receives one entry per extracted value (checkboxes, Part III
// boxes, metadata regions, sections J–N).
const fields = [];
// Header-level values. The initial null/false values remain when the
// corresponding value is not found: the name/EIN entries are written by
// extractTextMetadata, taxYear by the tax-year pass, and isAmended/isFinal
// by the checkbox pass.
const metadata = {
  partnershipName: null,
  partnershipEin: null,
  partnerName: null,
  partnerEin: null,
  taxYear: null,
  isAmended: false,
  isFinal: false
};
/**
 * Closest-center assignment across a batch of regions.
 *
 * Every (item, region) pair in which the unmatched item lies inside the
 * region's bounds widened by `tolerance` becomes a candidate. Candidates are
 * processed by increasing distance between the item and the region's center;
 * a candidate is accepted only when neither its region nor its item has been
 * used yet. Each region therefore gets at most one item and each item is
 * assigned at most once. Items are not flagged as matched here.
 *
 * @param items Items with `x`, `y` and `matched`; matched items are skipped.
 * @param regions Objects with `xMin`, `xMax`, `yMin`, `yMax`.
 * @param tolerance Slack around the region bounds. Defaults to
 *   POSITION_TOLERANCE, so existing two-argument calls behave as before.
 * @returns Map from region to its assigned item (regions without an item
 *   are absent).
 */
function assignItemsToRegions(items, regions, tolerance = POSITION_TOLERANCE) {
  const candidates = [];

  for (const item of items) {
    if (item.matched) continue;

    for (const region of regions) {
      const inside =
        item.x >= region.xMin - tolerance &&
        item.x <= region.xMax + tolerance &&
        item.y >= region.yMin - tolerance &&
        item.y <= region.yMax + tolerance;
      if (!inside) continue;

      const dx = item.x - (region.xMin + region.xMax) / 2;
      const dy = item.y - (region.yMin + region.yMax) / 2;
      candidates.push({ item, region, distance: Math.sqrt(dx * dx + dy * dy) });
    }
  }

  // Closest pairs first, so the best match for a region is never displaced
  // by a neighbour that merely overlaps it through the tolerance window.
  candidates.sort((a, b) => a.distance - b.distance);

  const result = new Map();
  const usedItems = new Set();
  for (const { item, region } of candidates) {
    if (usedItems.has(item) || result.has(region)) continue;
    result.set(region, item);
    usedItems.add(item);
  }
  return result;
}
// 1. Checkboxes (closest-center assignment)
const CHECK_MARKS = ['X', '✓', '✗'];
const checkboxRegions = K1_POSITION_REGIONS.filter(r => r.valueType === 'checkbox');
// Only real check marks may compete for checkbox regions. A region accepts a
// single item, so without this filter any other text lying closer to a
// checkbox center would win the slot and the actual mark would be dropped.
// NOTE(review): this is stricter than PdfParseExtractor.extractCheckboxes,
// which passes every unmatched item to the assignment — consider applying
// the same pre-filter there.
const checkMarkItems = dataItems.filter(
  item => !item.matched && CHECK_MARKS.includes(item.text.toUpperCase())
);
const cbAssignments = assignItemsToRegions(checkMarkItems, checkboxRegions);
for (const [region, item] of cbAssignments) {
  fields.push({
    fieldId: region.fieldId,
    boxNumber: region.boxNumber,
    label: region.label,
    // Normalised: whichever mark glyph was found is reported as 'X'.
    rawValue: 'X',
    numericValue: null,
    fieldCategory: 'CHECKBOX',
    isCheckbox: true,
    subtype: null
  });
  // checkMarkItems holds the same objects as dataItems, so later passes see
  // the item as consumed.
  item.matched = true;
  if (region.fieldId === 'FINAL_K1') metadata.isFinal = true;
  if (region.fieldId === 'AMENDED_K1') metadata.isAmended = true;
}
// 2. Part III — subtype regions first, then simple
// Checkbox regions are excluded; the remaining Part III regions are split by
// whether they carry a subtype code, preserving their original order.
const isPartIIIValueRegion = (r) =>
  r.fieldCategory === 'PART_III' && r.valueType !== 'checkbox';
const partIIIRegions = K1_POSITION_REGIONS.filter(isPartIIIValueRegion);
const subtypeRegions = [];
const simpleRegions = [];
for (const r of partIIIRegions) {
  if (r.hasSubtype) {
    subtypeRegions.push(r);
  } else {
    simpleRegions.push(r);
  }
}
/**
 * Extract a box whose values may be labelled with subtype codes.
 *
 * Unmatched items inside the region's y-range (widened by
 * POSITION_TOLERANCE) are sorted into codes (inside the subtype x-range) and
 * values (inside the value x-range); the code column is tested first, so an
 * item inside both widened ranges counts as a code.
 *
 * - With at least one code: one field is pushed per code, paired with the
 *   first still-unmatched value whose y differs from the code's y by at most
 *   SUBTYPE_Y_TOLERANCE. A code without a partner yields an empty rawValue.
 * - With no codes but at least one value: a single field without subtype is
 *   pushed for the first value found.
 *
 * Consumed items are flagged `matched`; results are appended to `fields`.
 *
 * @param region Region with value bounds and subtypeXMin/subtypeXMax.
 */
function extractSubtypeField(region) {
  const inRange = (coord, lo, hi) =>
    coord >= lo - POSITION_TOLERANCE && coord <= hi + POSITION_TOLERANCE;
  const hasCodeColumn =
    region.subtypeXMin !== null && region.subtypeXMax !== null;

  // Shared shape for both kinds of output field.
  const emit = (rawValue, subtype) => {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue,
      numericValue: parseNumericValue(rawValue),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype
    });
  };

  const codeItems = [];
  const valueItems = [];
  for (const candidate of dataItems) {
    if (candidate.matched) continue;
    if (!inRange(candidate.y, region.yMin, region.yMax)) continue;

    if (
      hasCodeColumn &&
      inRange(candidate.x, region.subtypeXMin, region.subtypeXMax)
    ) {
      codeItems.push(candidate);
    } else if (inRange(candidate.x, region.xMin, region.xMax)) {
      valueItems.push(candidate);
    }
  }

  if (codeItems.length === 0) {
    // No codes: fall back to a plain value, if there is one.
    if (valueItems.length > 0) {
      const [firstValue] = valueItems;
      emit(firstValue.text, null);
      firstValue.matched = true;
    }
    return;
  }

  for (const codeItem of codeItems) {
    const partner = valueItems.find(
      (v) => !v.matched && Math.abs(v.y - codeItem.y) <= SUBTYPE_Y_TOLERANCE
    );
    emit(partner ? partner.text : '', codeItem.text.trim());
    codeItem.matched = true;
    if (partner) partner.matched = true;
  }
}
// Subtype regions go first so their codes and values are consumed before
// the simple regions get a chance to claim them.
subtypeRegions.forEach((region) => {
  extractSubtypeField(region);
});
// Simple regions: one value each, the unmatched item nearest the center.
simpleRegions.forEach((region) => {
  const hit = findBestItemInRegion(dataItems, region);
  if (hit) {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue: hit.text,
      numericValue: parseNumericValue(hit.text),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    hit.matched = true;
  }
});
// 3. Metadata — tax year (lowered threshold from 745 to 710)
// Candidates are unmatched 2–4 digit fragments in the band y > 710,
// 200 < x < 350.
const taxYearItems = dataItems.filter(
  item =>
    !item.matched &&
    item.y > 710 &&
    item.x > 200 &&
    item.x < 350 &&
    /^\d{2,4}$/.test(item.text)
);
// A single fragment is accepted too: the year may arrive as one 4-digit item
// ("2025") rather than split ("20" + "25"). A lone 2- or 3-digit fragment
// still fails the range check below and is left unmatched.
if (taxYearItems.length > 0) {
  // Left-to-right order so the fragments concatenate into the printed year.
  taxYearItems.sort((a, b) => a.x - b.x);
  const combined = taxYearItems.map(i => i.text).join('');
  const year = parseInt(combined, 10);
  if (year >= 1900 && year <= 2100) {
    metadata.taxYear = year;
    for (const item of taxYearItems) item.matched = true;
  }
}
// Text metadata
/**
 * Gather all unmatched items inside one named region into a single string.
 *
 * The region is looked up by fieldId; nothing happens when it does not exist
 * or no unmatched item lies inside its bounds widened by POSITION_TOLERANCE.
 * Otherwise the items are ordered by descending y, their texts are joined
 * with single spaces, and every one of them is flagged `matched`. The joined
 * text is stored on `metadata` only when a key is given and the text is
 * non-empty.
 *
 * @param regionFieldId fieldId of the region to read.
 * @param metadataKey Property of `metadata` to set, or null to only consume
 *   the items.
 */
function extractTextMetadata(regionFieldId, metadataKey) {
  const region = K1_POSITION_REGIONS.find((r) => r.fieldId === regionFieldId);
  if (!region) return;

  const xLow = region.xMin - POSITION_TOLERANCE;
  const xHigh = region.xMax + POSITION_TOLERANCE;
  const yLow = region.yMin - POSITION_TOLERANCE;
  const yHigh = region.yMax + POSITION_TOLERANCE;

  const hits = dataItems.filter(
    (entry) =>
      !entry.matched &&
      entry.x >= xLow &&
      entry.x <= xHigh &&
      entry.y >= yLow &&
      entry.y <= yHigh
  );
  if (hits.length === 0) return;

  // Larger y first.
  hits.sort((a, b) => b.y - a.y);

  const joined = hits
    .map((entry) => entry.text)
    .join(' ')
    .trim();
  if (metadataKey && joined) {
    metadata[metadataKey] = joined;
  }

  hits.forEach((entry) => {
    entry.matched = true;
  });
}
// Named header regions, in extraction order. A null key means the region's
// items are consumed without being stored on `metadata`.
const textMetadataTargets = [
  ['A_EIN', 'partnershipEin'],
  ['B_NAME', 'partnershipName'],
  ['C_IRS_CENTER', null],
  ['E_TIN', 'partnerEin'],
  ['F_NAME_ADDR', 'partnerName']
];
for (const [regionFieldId, metadataKey] of textMetadataTargets) {
  extractTextMetadata(regionFieldId, metadataKey);
}
// Remaining metadata regions
// Each text-valued METADATA region takes the nearest item that is still
// unmatched, if any.
const metadataRegions = K1_POSITION_REGIONS.filter(
  (r) => r.fieldCategory === 'METADATA' && r.valueType === 'text'
);
metadataRegions.forEach((region) => {
  const hit = findBestItemInRegion(dataItems, region);
  if (hit) {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue: hit.text,
      numericValue: parseNumericValue(hit.text),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    hit.matched = true;
  }
});
// 4. Sections J/K/L/M/N (closest-center assignment)
// Each section is resolved as one batch so that its regions compete for
// items together; checkbox regions were handled in step 1.
const sectionCategories = ['SECTION_J', 'SECTION_K', 'SECTION_L', 'SECTION_M', 'SECTION_N'];
sectionCategories.forEach((category) => {
  const sectionRegions = K1_POSITION_REGIONS.filter(
    (r) => r.fieldCategory === category && r.valueType !== 'checkbox'
  );
  // Map#forEach passes (value, key): the item first, then its region.
  assignItemsToRegions(dataItems, sectionRegions).forEach((hit, region) => {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue: hit.text,
      numericValue: parseNumericValue(hit.text),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    hit.matched = true;
  });
});
// ── Print results ──
console.log('=== METADATA ===');
console.log(JSON.stringify(metadata, null, 2));
console.log('\n=== EXTRACTED FIELDS ===');
// Group by category, keeping categories in order of first appearance.
const byCategory = {};
fields.forEach((field) => {
  const key = field.fieldCategory;
  byCategory[key] = byCategory[key] || [];
  byCategory[key].push(field);
});
Object.entries(byCategory).forEach(([category, members]) => {
  console.log(`\n--- ${category} ---`);
  members.forEach((field) => {
    const subtypeSuffix = field.subtype ? ` [${field.subtype}]` : '';
    const numericSuffix =
      field.numericValue !== null ? ` (=${field.numericValue})` : '';
    console.log(` ${field.fieldId || field.boxNumber}: "${field.rawValue}"${subtypeSuffix}${numericSuffix}`);
  });
});
// Unmapped
// Leftover items are reported unless they are a single character that is
// neither a digit nor 'X'.
const isReportable = (entry) =>
  entry.text.length > 1 || /\d/.test(entry.text) || entry.text === 'X';
const unmapped = dataItems.filter((entry) => !entry.matched && isReportable(entry));
console.log(`\n=== UNMAPPED ITEMS (${unmapped.length}) ===`);
// Coordinates are rounded to one decimal place for display.
const roundToTenth = (value) => Math.round(value * 10) / 10;
unmapped.forEach((entry) => {
  const x = roundToTenth(entry.x);
  const y = roundToTenth(entry.y);
  console.log(` "${entry.text}" at (${x}, ${y}) font=${entry.fontFamily}`);
});
await pdfDoc.destroy();
console.log('\nDone.');
Loading…
Cancel
Save