mirror of https://github.com/ghostfolio/ghostfolio
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
427 lines
13 KiB
427 lines
13 KiB
/**
|
|
* Test script: runs the PdfParseExtractor logic directly on a K-1 PDF
|
|
* and prints all extracted fields, metadata, and unmapped items.
|
|
*
|
|
* Usage: node tools/test-k1-parse.mjs <path-to-pdf>
|
|
*/
|
|
import { readFileSync } from 'fs';
import { resolve } from 'path';
import { pathToFileURL } from 'url';
|
|
|
|
// ── pdfjs-dist setup ──
|
|
// Load the legacy pdf.js build through a dynamic import.
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);

// Tell pdf.js where its worker script lives. pathToFileURL() yields a
// correctly percent-encoded file: URL on every platform, so install paths
// containing '#', '?' or '%' (and POSIX absolute paths) are handled properly,
// which manual 'file:///' string concatenation does not guarantee.
const workerPath = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;
GlobalWorkerOptions.workerSrc = workerPath;
|
|
|
|
// ── Load k1-position-regions (needs TS compilation) ──
// The region definitions live in a TypeScript file, so this plain .mjs script
// cannot import them directly. Options that were considered: inlining the raw
// coordinates, loading the TS file via tsx or esbuild-register, or reading
// compiled JS from dist. The approach used is: bundle the regions file with
// esbuild into a temporary module (see below) and inline only the critical
// extraction logic in this script.
|
|
|
|
/**
 * Slack added on every side of a region's bounding box when testing whether an
 * item falls inside it (same units as the items' x / y coordinates).
 */
const POSITION_TOLERANCE = 15;

/**
 * Largest vertical distance allowed between a subtype code and the value it is
 * paired with (same units as the items' y coordinate).
 */
const SUBTYPE_Y_TOLERANCE = 8;
|
|
|
|
// ── Import the regions by dynamically compiling the TS ──
|
|
// We'll use a quick inline approach: load the raw TS and eval via esbuild
|
|
|
|
import { execSync } from 'child_process';
|
|
import { writeFileSync, unlinkSync, existsSync } from 'fs';
|
|
|
|
// Build a temp bundle of just the regions file
|
|
// Bundle just the regions file into a temporary ESM module so that its
// TypeScript definitions can be imported from this plain .mjs script.
const regionsTsPath = resolve(
  'apps/api/src/app/k1-import/extractors/k1-position-regions.ts'
);
const regionsTmpPath = resolve('tools/_tmp_regions.mjs');

try {
  execSync(
    `npx esbuild "${regionsTsPath}" --bundle --format=esm --outfile="${regionsTmpPath}" --platform=node`,
    { stdio: 'pipe' }
  );
} catch (e) {
  // stderr is captured because stdio is 'pipe'; fall back to the error message
  // when no stderr output is available.
  console.error(
    'Failed to compile regions file:',
    e.stderr?.toString() || e.message
  );
  process.exit(1);
}

let regionsModule;
try {
  // pathToFileURL() produces a properly encoded file: URL on every platform.
  regionsModule = await import(pathToFileURL(regionsTmpPath).href);
} finally {
  // Remove the temporary bundle even when the import throws. Removal itself is
  // best-effort: a leftover temp file must not mask the real outcome.
  try {
    unlinkSync(regionsTmpPath);
  } catch {
    // Nothing useful can be done if the temp file cannot be deleted.
  }
}

const K1_POSITION_REGIONS = regionsModule.K1_POSITION_REGIONS;

// Fail early with a clear message instead of a later TypeError on .filter().
if (!Array.isArray(K1_POSITION_REGIONS)) {
  console.error(
    `Regions file does not export a K1_POSITION_REGIONS array: ${regionsTsPath}`
  );
  process.exit(1);
}
|
|
|
|
// ── PDF parsing ──
|
|
// The PDF to inspect is the first command-line argument after the script name.
const [, , pdfPath] = process.argv;
if (!pdfPath) {
  console.error('Usage: node tools/test-k1-parse.mjs <path-to-pdf>');
  process.exit(1);
}

// Read the file synchronously and hand pdf.js a copy of the raw bytes.
const buffer = readFileSync(pdfPath);

// Absolute directory (with trailing slash) of a pdfjs-dist asset folder.
const pdfjsAssetDir = (name) => resolve(`node_modules/pdfjs-dist/${name}`) + '/';

const loadingTask = getDocument({
  data: new Uint8Array(buffer),
  standardFontDataUrl: pdfjsAssetDir('standard_fonts'),
  cMapUrl: pdfjsAssetDir('cmaps'),
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
});

// Only the first page is inspected; marked-content items are excluded.
const pdfDoc = await loadingTask.promise;
const page = await pdfDoc.getPage(1);
const textContent = await page.getTextContent({ includeMarkedContent: false });
const { items, styles } = textContent;
|
|
|
|
// Filter data items (non-serif)
|
|
// Collect every non-blank text item whose font style is known and whose font
// family is not 'serif'. Each record keeps the trimmed text, the position taken
// from transform[4] / transform[5], the font information, and a `matched` flag
// that the extraction steps set once an item has been consumed.
const dataItems = [];
for (const { str, fontName, transform } of items) {
  const text = str.trim();
  // Blank items never need a style lookup; unknown fonts are skipped as well.
  const style = text ? styles[fontName] : null;
  if (!style) continue;

  const fontFamily = style.fontFamily.toLowerCase();
  if (fontFamily !== 'serif') {
    const [, , , , x, y] = transform;
    dataItems.push({ text, x, y, fontName, fontFamily, matched: false });
  }
}

console.log(`Total data items: ${dataItems.length}\n`);
|
|
|
|
// ── Parsing logic (mirrors PdfParseExtractor) ──
|
|
/**
 * Converts the raw text of a form cell into a number.
 *
 * Accepted decorations: a `$` sign, thousands separators (`,`), accounting
 * parentheses or a leading `-` for negative amounts, and a trailing `%`.
 * Placeholder texts such as "SEE STMT", "X", "YES" or "NO" yield `null`.
 *
 * The whole remaining string must be a plain decimal number. Text that merely
 * starts with digits (an EIN like "12-3456789", a date like "01/15/2024") is
 * rejected instead of being truncated to its numeric prefix, as a bare
 * `parseFloat` would do. NOTE(review): this is intentionally stricter than a
 * prefix-tolerant parse; keep it in sync with the production extractor.
 *
 * @param {string | null | undefined} raw - Raw cell text.
 * @returns {number | null} The parsed value, or `null` when `raw` is empty,
 *   a placeholder, or not a clean number.
 */
function parseNumericValue(raw) {
  if (!raw) return null;
  const trimmed = raw.trim();
  if (!trimmed) return null;

  const placeholders = ['SEE STMT', 'STMT', 'SEE STATEMENT', 'X', 'E-FILE', 'YES', 'NO'];
  if (placeholders.includes(trimmed.toUpperCase())) return null;

  // "(1,234)" is accounting notation for a negative amount.
  const isParenNeg = /^\(.*\)$/.test(trimmed);
  let cleaned = trimmed.replace(/[$,()]/g, '');

  const isMinusNeg = cleaned.startsWith('-');
  if (isMinusNeg) cleaned = cleaned.substring(1);

  // Tolerate padding left behind by the stripped symbols and a percent suffix.
  cleaned = cleaned.trim().replace(/\s*%$/, '');

  // Require the entire string to be a decimal literal; this also rejects
  // "Infinity" and anything with trailing garbage.
  if (!/^\+?(?:\d+\.?\d*|\.\d+)$/.test(cleaned)) return null;

  const num = Number(cleaned);
  return isParenNeg || isMinusNeg ? -num : num;
}
|
|
|
|
/**
 * Returns the unmatched item that lies inside `region` (bounding box widened
 * by POSITION_TOLERANCE on every side) and is closest to the region's centre.
 *
 * @param {Iterable<{x: number, y: number, matched: boolean}>} items - Candidates.
 * @param {{xMin: number, xMax: number, yMin: number, yMax: number}} region - Target box.
 * @returns {object | null} The nearest eligible item, or `null` when none
 *   qualifies. On equal distance the earliest item wins.
 */
function findBestItemInRegion(items, region) {
  const centerX = (region.xMin + region.xMax) / 2;
  const centerY = (region.yMin + region.yMax) / 2;

  let winner = null;
  let winnerDistance = Infinity;

  for (const candidate of items) {
    if (candidate.matched) continue;

    const inside =
      candidate.x >= region.xMin - POSITION_TOLERANCE &&
      candidate.x <= region.xMax + POSITION_TOLERANCE &&
      candidate.y >= region.yMin - POSITION_TOLERANCE &&
      candidate.y <= region.yMax + POSITION_TOLERANCE;
    if (!inside) continue;

    // Euclidean distance to the centre of the region.
    const offsetX = candidate.x - centerX;
    const offsetY = candidate.y - centerY;
    const distance = Math.sqrt(offsetX * offsetX + offsetY * offsetY);

    if (distance < winnerDistance) {
      winnerDistance = distance;
      winner = candidate;
    }
  }

  return winner;
}
|
|
|
|
// Every extracted field record is appended here, in extraction order.
const fields = [];

// Form-level metadata. Each entry keeps the default below until one of the
// extraction steps overwrites it.
const metadata = {
  // Text entries, filled from the text-metadata regions
  partnershipName: null,
  partnershipEin: null,
  partnerName: null,
  partnerEin: null,
  // Numeric year assembled from the header digits
  taxYear: null,
  // Flags driven by the AMENDED_K1 / FINAL_K1 checkboxes
  isAmended: false,
  isFinal: false
};
|
|
|
|
/**
 * Closest-center assignment helper.
 *
 * Builds every (item, region) pair in which an unmatched item lies inside the
 * region's bounding box widened by POSITION_TOLERANCE, orders the pairs by
 * distance from the item to the region centre, and then assigns greedily so
 * that each item and each region is used at most once.
 *
 * The function does not modify the items' `matched` flag; callers do that.
 *
 * @param {Iterable<{x: number, y: number, matched: boolean}>} items - Candidates.
 * @param {Iterable<{xMin: number, xMax: number, yMin: number, yMax: number}>} regions
 * @returns {Map<object, object>} Map from region to the item assigned to it.
 */
function assignItemsToRegions(items, regions) {
  const pairs = [];

  for (const item of items) {
    if (item.matched) continue;

    for (const region of regions) {
      const inside =
        item.x >= region.xMin - POSITION_TOLERANCE &&
        item.x <= region.xMax + POSITION_TOLERANCE &&
        item.y >= region.yMin - POSITION_TOLERANCE &&
        item.y <= region.yMax + POSITION_TOLERANCE;
      if (!inside) continue;

      const offsetX = item.x - (region.xMin + region.xMax) / 2;
      const offsetY = item.y - (region.yMin + region.yMax) / 2;
      pairs.push({
        item,
        region,
        distance: Math.sqrt(offsetX * offsetX + offsetY * offsetY)
      });
    }
  }

  // Nearest pairs get first pick.
  pairs.sort((a, b) => a.distance - b.distance);

  const assignment = new Map();
  const takenItems = new Set();
  for (const { item, region } of pairs) {
    if (!takenItems.has(item) && !assignment.has(region)) {
      assignment.set(region, item);
      takenItems.add(item);
    }
  }

  return assignment;
}
|
|
|
|
// 1. Checkboxes (closest-center assignment)
const checkboxRegions = K1_POSITION_REGIONS.filter(
  ({ valueType }) => valueType === 'checkbox'
);
const cbAssignments = assignItemsToRegions(dataItems, checkboxRegions);
const checkedRegionIds = new Set();

// Glyphs that count as a ticked box (compared after upper-casing the text).
const CHECK_MARKS = ['X', '✓', '✗'];

// Builds the field record for a checkbox region in the given state.
const checkboxField = (region, isChecked) => ({
  fieldId: region.fieldId,
  boxNumber: region.boxNumber,
  label: region.label,
  rawValue: isChecked ? 'true' : 'false',
  numericValue: null,
  fieldCategory: 'CHECKBOX',
  isCheckbox: true,
  subtype: null
});

// Map#forEach passes (value, key), i.e. (item, region).
cbAssignments.forEach((item, region) => {
  // An assigned item that is not a check mark is left unmatched.
  if (!CHECK_MARKS.includes(item.text.toUpperCase())) return;

  checkedRegionIds.add(region.fieldId);
  fields.push(checkboxField(region, true));
  item.matched = true;

  if (region.fieldId === 'FINAL_K1') metadata.isFinal = true;
  if (region.fieldId === 'AMENDED_K1') metadata.isAmended = true;
});

// Emit false for unchecked checkbox regions
checkboxRegions
  .filter((region) => !checkedRegionIds.has(region.fieldId))
  .forEach((region) => {
    fields.push(checkboxField(region, false));
  });

// 2. Part III — subtype regions first, then simple
const partIIIRegions = K1_POSITION_REGIONS.filter(
  ({ fieldCategory, valueType }) =>
    fieldCategory === 'PART_III' && valueType !== 'checkbox'
);

// Split in one pass: regions that carry a subtype code versus plain ones.
const subtypeRegions = [];
const simpleRegions = [];
for (const region of partIIIRegions) {
  (region.hasSubtype ? subtypeRegions : simpleRegions).push(region);
}
|
|
|
|
/**
 * Extracts the entries of a Part III region that may carry subtype codes and
 * appends the resulting records to `fields`.
 *
 * Unmatched items inside the region's vertical band (widened by
 * POSITION_TOLERANCE) are split into codes (inside the subtype column, when the
 * region defines one) and values (inside the region's own x-range).
 *
 * - With at least one code: one record per code, paired with the first still
 *   unmatched value whose y differs by at most SUBTYPE_Y_TOLERANCE; a code
 *   without a partner gets an empty raw value.
 * - With values only: a single record for the first value, without subtype.
 *
 * Every item that is consumed gets `matched = true`.
 *
 * @param {object} region - Region definition (reads fieldId, boxNumber, label,
 *   fieldCategory, xMin/xMax, yMin/yMax, subtypeXMin/subtypeXMax).
 * @returns {void}
 */
function extractSubtypeField(region) {
  // Range check with the shared tolerance applied on both ends.
  const within = (value, low, high) =>
    value >= low - POSITION_TOLERANCE && value <= high + POSITION_TOLERANCE;

  const hasCodeColumn =
    region.subtypeXMin !== null && region.subtypeXMax !== null;

  const codes = [];
  const values = [];
  for (const item of dataItems) {
    if (item.matched) continue;
    if (!within(item.y, region.yMin, region.yMax)) continue;

    // The code column takes precedence over the value column.
    if (hasCodeColumn && within(item.x, region.subtypeXMin, region.subtypeXMax)) {
      codes.push(item);
    } else if (within(item.x, region.xMin, region.xMax)) {
      values.push(item);
    }
  }

  // Appends one record for this region.
  const emit = (rawValue, subtype) => {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue,
      numericValue: parseNumericValue(rawValue),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype
    });
  };

  if (codes.length === 0) {
    // No codes: fall back to the first value, if any.
    if (values.length > 0) {
      const [first] = values;
      emit(first.text, null);
      first.matched = true;
    }
    return;
  }

  for (const code of codes) {
    const partner = values.find(
      (value) =>
        !value.matched && Math.abs(value.y - code.y) <= SUBTYPE_Y_TOLERANCE
    );
    emit(partner ? partner.text : '', code.text.trim());
    code.matched = true;
    if (partner) partner.matched = true;
  }
}
|
|
|
|
// Subtype regions are processed before the simple ones so that their code /
// value items are already marked as matched.
subtypeRegions.forEach((region) => {
  extractSubtypeField(region);
});

// Simple regions: take the unmatched item closest to the region centre.
for (const region of simpleRegions) {
  const hit = findBestItemInRegion(dataItems, region);
  if (hit) {
    const { fieldId, boxNumber, label, fieldCategory } = region;
    fields.push({
      fieldId,
      boxNumber,
      label,
      rawValue: hit.text,
      numericValue: parseNumericValue(hit.text),
      fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    hit.matched = true;
  }
}

// 3. Metadata — tax year (lowered threshold from 745 to 710)
// Candidates are unmatched 2–4 digit items in the band y > 710, 200 < x < 350.
const taxYearItems = dataItems.filter(
  (item) =>
    !item.matched &&
    item.y > 710 &&
    item.x > 200 &&
    item.x < 350 &&
    /^\d{2,4}$/.test(item.text)
);

// The year is only assembled when it is split over at least two items; the
// pieces are concatenated from left to right.
if (taxYearItems.length >= 2) {
  taxYearItems.sort((a, b) => a.x - b.x);
  const year = parseInt(taxYearItems.map(({ text }) => text).join(''), 10);
  if (year >= 1900 && year <= 2100) {
    metadata.taxYear = year;
    taxYearItems.forEach((item) => {
      item.matched = true;
    });
  }
}
|
|
|
|
/**
 * Text metadata: joins every unmatched item inside the named region into one
 * string and optionally stores it in `metadata`.
 *
 * Items are taken from the region's bounding box widened by
 * POSITION_TOLERANCE and joined with single spaces in reading order:
 * descending y first and, for fragments sharing the same y, ascending x.
 * The x tie-break keeps multi-fragment lines left-to-right regardless of the
 * order in which the items were collected.
 *
 * All collected items are marked as matched, even when `metadataKey` is null,
 * so that they are not picked up by later steps.
 *
 * @param {string} regionFieldId - `fieldId` of the region to read.
 * @param {string | null} metadataKey - Key of `metadata` to fill, or `null`
 *   to only consume the items.
 * @returns {void}
 */
function extractTextMetadata(regionFieldId, metadataKey) {
  const region = K1_POSITION_REGIONS.find((r) => r.fieldId === regionFieldId);
  if (!region) return;

  const matches = dataItems.filter(
    (item) =>
      !item.matched &&
      item.x >= region.xMin - POSITION_TOLERANCE &&
      item.x <= region.xMax + POSITION_TOLERANCE &&
      item.y >= region.yMin - POSITION_TOLERANCE &&
      item.y <= region.yMax + POSITION_TOLERANCE
  );
  if (matches.length === 0) return;

  // Descending y, then left to right within a line.
  matches.sort((a, b) => b.y - a.y || a.x - b.x);

  const combined = matches
    .map((m) => m.text)
    .join(' ')
    .trim();
  if (metadataKey && combined) {
    metadata[metadataKey] = combined;
  }

  for (const item of matches) item.matched = true;
}
|
|
|
|
// Named text regions and the metadata key each one fills. A null key means the
// region's items are consumed without being stored in `metadata`.
const textMetadataTargets = [
  ['A_EIN', 'partnershipEin'],
  ['B_NAME', 'partnershipName'],
  ['C_IRS_CENTER', null],
  ['E_TIN', 'partnerEin'],
  ['F_NAME_ADDR', 'partnerName']
];
for (const [regionFieldId, metadataKey] of textMetadataTargets) {
  extractTextMetadata(regionFieldId, metadataKey);
}

// Remaining metadata regions
const metadataRegions = K1_POSITION_REGIONS.filter(
  ({ fieldCategory, valueType }) =>
    fieldCategory === 'METADATA' && valueType === 'text'
);

// Each text metadata region receives the closest still-unmatched item, if any.
metadataRegions.forEach((region) => {
  const item = findBestItemInRegion(dataItems, region);
  if (!item) return;

  const { fieldId, boxNumber, label, fieldCategory } = region;
  fields.push({
    fieldId,
    boxNumber,
    label,
    rawValue: item.text,
    numericValue: parseNumericValue(item.text),
    fieldCategory,
    isCheckbox: false,
    subtype: null
  });
  item.matched = true;
});
|
|
|
|
// 4. Sections J/K/L/M/N (closest-center assignment)
// Each section is resolved on its own, so items are only contested by regions
// of the same section; checkbox regions were handled in step 1.
const sectionCategories = [
  'SECTION_J',
  'SECTION_K',
  'SECTION_L',
  'SECTION_M',
  'SECTION_N'
];

for (const cat of sectionCategories) {
  const regions = K1_POSITION_REGIONS.filter(
    ({ fieldCategory, valueType }) =>
      fieldCategory === cat && valueType !== 'checkbox'
  );

  // Map#forEach passes (value, key), i.e. (item, region).
  assignItemsToRegions(dataItems, regions).forEach((item, region) => {
    fields.push({
      fieldId: region.fieldId,
      boxNumber: region.boxNumber,
      label: region.label,
      rawValue: item.text,
      numericValue: parseNumericValue(item.text),
      fieldCategory: region.fieldCategory,
      isCheckbox: false,
      subtype: null
    });
    item.matched = true;
  });
}
|
|
|
|
// ── Print results ──
console.log('=== METADATA ===');
console.log(JSON.stringify(metadata, null, 2));

console.log('\n=== EXTRACTED FIELDS ===');

// Group by category, keeping first-seen category order and field order.
const byCategory = {};
fields.forEach((field) => {
  const key = field.fieldCategory;
  if (!byCategory[key]) byCategory[key] = [];
  byCategory[key].push(field);
});

Object.entries(byCategory).forEach(([cat, catFields]) => {
  console.log(`\n--- ${cat} ---`);
  catFields.forEach(({ fieldId, boxNumber, rawValue, subtype, numericValue }) => {
    // Optional suffixes: the subtype code and the parsed numeric value.
    const sub = subtype ? ` [${subtype}]` : '';
    const num = numericValue !== null ? ` (=${numericValue})` : '';
    console.log(` ${fieldId || boxNumber}: "${rawValue}"${sub}${num}`);
  });
});

// Unmapped: unmatched items that look meaningful, i.e. longer than one
// character, containing a digit, or a lone 'X'.
const unmapped = dataItems.filter(
  (i) => !i.matched && (i.text.length > 1 || /\d/.test(i.text) || i.text === 'X')
);
console.log(`\n=== UNMAPPED ITEMS (${unmapped.length}) ===`);

// Coordinates are shown rounded to one decimal place.
const roundToTenth = (value) => Math.round(value * 10) / 10;
unmapped.forEach((u) => {
  const x = roundToTenth(u.x);
  const y = roundToTenth(u.y);
  console.log(` "${u.text}" at (${x}, ${y}) font=${u.fontFamily}`);
});

await pdfDoc.destroy();
console.log('\nDone.');
|
|
|