Browse Source

fix: recalibrate K-1 position regions from actual PDF coordinates

- Rewrote k1-position-regions.ts with all coordinates measured from
  actual e-filed K-1 PDF via pdfjs-dist text position extraction
- Fixed all y-coordinate offsets (header, Part I/II, sections J-N, Part III)
- Added missing regions: J_EXCHANGE, BOX_22, BOX_23, H2_DE_TIN
- Set clean column boundaries (left values x=370-445, right code x=445-510,
  right values x=510-600) to prevent cross-column matching
- Reordered extraction: checkboxes now run before Part III to prevent
  BOX_16_K3 checkbox X from being grabbed as a BOX_16 value
- Fixed stale fieldId refs: D_PARTNER_EIN->E_TIN, E_NAME->F_NAME_ADDR
- Removed SSL from client serve config for local HTTP development
- Changed API rootUrl from https to http for dev environment
- Added tools/extract-k1-positions.mjs utility for PDF coordinate dumps
pull/6701/head
Robert Patch 2 months ago
parent
commit
41659a9c5b
  1. 814
      apps/api/src/app/k1-import/extractors/k1-position-regions.ts
  2. 12
      apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
  3. 2
      apps/api/src/environments/environment.ts
  4. 5
      apps/client/project.json
  5. 74
      tools/extract-k1-positions.mjs

814
apps/api/src/app/k1-import/extractors/k1-position-regions.ts

File diff suppressed because it is too large

12
apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts

@ -127,6 +127,10 @@ export class PdfParseExtractor implements K1Extractor {
const fields: K1ExtractedField[] = []; const fields: K1ExtractedField[] = [];
const metadata = this.initMetadata(); const metadata = this.initMetadata();
// Checkboxes first — consume "X" marks before Part III so the
// BOX_16_K3 checkbox doesn't get grabbed as a BOX_16 value.
this.extractCheckboxes(dataItems, fields, metadata);
// T007-T010 (US1): Part III extraction // T007-T010 (US1): Part III extraction
this.extractPartIII(dataItems, fields); this.extractPartIII(dataItems, fields);
@ -136,9 +140,6 @@ export class PdfParseExtractor implements K1Extractor {
// T015-T018 (US3): Sections J/K/L/M/N // T015-T018 (US3): Sections J/K/L/M/N
this.extractSections(dataItems, fields); this.extractSections(dataItems, fields);
// T019-T020 (US4): Checkbox detection
this.extractCheckboxes(dataItems, fields, metadata);
// T021 (US5): Unmapped items // T021 (US5): Unmapped items
const unmappedItems = this.collectUnmappedItems(dataItems); const unmappedItems = this.collectUnmappedItems(dataItems);
@ -543,9 +544,8 @@ export class PdfParseExtractor implements K1Extractor {
this.extractTextMetadata(dataItems, 'C_IRS_CENTER', metadata, null); this.extractTextMetadata(dataItems, 'C_IRS_CENTER', metadata, null);
// Part II: Partner info // Part II: Partner info
this.extractTextMetadata(dataItems, 'D_PARTNER_EIN', metadata, 'partnerEin'); this.extractTextMetadata(dataItems, 'E_TIN', metadata, 'partnerEin');
this.extractTextMetadata(dataItems, 'E_NAME', metadata, 'partnerName'); this.extractTextMetadata(dataItems, 'F_NAME_ADDR', metadata, 'partnerName');
this.extractTextMetadata(dataItems, 'F_ADDR', metadata, null);
// Extract remaining metadata text fields into the fields array // Extract remaining metadata text fields into the fields array
const metadataRegions = K1_POSITION_REGIONS.filter( const metadataRegions = K1_POSITION_REGIONS.filter(

2
apps/api/src/environments/environment.ts

@ -2,6 +2,6 @@ import { DEFAULT_HOST } from '@ghostfolio/common/config';
export const environment = { export const environment = {
production: false, production: false,
rootUrl: `https://${DEFAULT_HOST}:4200`, rootUrl: `http://${DEFAULT_HOST}:4200`,
version: 'dev' version: 'dev'
}; };

5
apps/client/project.json

@ -215,10 +215,7 @@
"executor": "@nx/angular:dev-server", "executor": "@nx/angular:dev-server",
"options": { "options": {
"buildTarget": "client:build", "buildTarget": "client:build",
"proxyConfig": "apps/client/proxy.conf.json", "proxyConfig": "apps/client/proxy.conf.json"
"ssl": true,
"sslCert": "apps/client/localhost.cert",
"sslKey": "apps/client/localhost.pem"
}, },
"configurations": { "configurations": {
"development-ca": { "development-ca": {

74
tools/extract-k1-positions.mjs

@ -0,0 +1,74 @@
/**
* Utility to extract all text items with their (x, y) positions from a K-1 PDF.
* This dumps every text item with coordinates so we can calibrate position regions.
*
* Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
*/
import { readFileSync } from 'fs';
import { resolve } from 'path';
import { pathToFileURL } from 'url';
// Validate the CLI argument first so a missing path fails fast with the usage
// message, before the PDF library is loaded.
const pdfPath = process.argv[2];
if (!pdfPath) {
  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
  process.exit(1);
}

// Dynamic import of pdfjs-dist legacy build
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);

// Build the worker URL with pathToFileURL rather than string concatenation:
// it produces a well-formed file:// URL on both Windows and POSIX (no doubled
// leading slash) and percent-encodes characters such as spaces or '#'.
// The path is resolved against the current working directory, so the script
// is expected to be run from the repository root.
GlobalWorkerOptions.workerSrc = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;

const buffer = readFileSync(pdfPath);
const loadingTask = getDocument({
  data: new Uint8Array(buffer),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
});
const pdfDoc = await loadingTask.promise;

try {
  console.log(`Pages: ${pdfDoc.numPages}`);

  // Only the first two pages are dumped.
  const lastPage = Math.min(pdfDoc.numPages, 2);
  for (let pageNum = 1; pageNum <= lastPage; pageNum++) {
    console.log(`\n=== PAGE ${pageNum} ===\n`);

    const page = await pdfDoc.getPage(pageNum);
    const textContent = await page.getTextContent({
      includeMarkedContent: false
    });
    const items = textContent.items;
    const styles = textContent.styles;

    // Sort by y descending (top of page first), then x ascending.
    // Items whose y differs by at most 2 units are treated as the same row.
    const sorted = [...items].sort((a, b) => {
      const dy = b.transform[5] - a.transform[5];
      if (Math.abs(dy) > 2) return dy;
      return a.transform[4] - b.transform[4];
    });

    for (const item of sorted) {
      const text = item.str.trim();
      if (!text) continue;

      // transform[4] / transform[5] are the item's x / y translation;
      // round to one decimal place for readable output.
      const x = Math.round(item.transform[4] * 10) / 10;
      const y = Math.round(item.transform[5] * 10) / 10;

      const style = styles[item.fontName] || {};
      const fontFamily = style.fontFamily || 'unknown';
      // Heuristic: anything not rendered in a 'serif' family is labelled DATA,
      // everything else TMPL (presumably the printed form template — verify
      // against the PDFs being calibrated).
      const isData = fontFamily.toLowerCase() !== 'serif';

      console.log(
        `${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
      );
    }
  }
} finally {
  // Release the document even when page extraction throws.
  await pdfDoc.destroy();
}

console.log('\nDone.');
Loading…
Cancel
Save