fix: recalibrate K-1 position regions from actual PDF coordinates

- Rewrote k1-position-regions.ts with all coordinates measured from an actual e-filed K-1 PDF via pdfjs-dist text position extraction
- Fixed all y-coordinate offsets (header, Part I/II, sections J-N, Part III)
- Added missing regions: J_EXCHANGE, BOX_22, BOX_23, H2_DE_TIN
- Set clean column boundaries (left values x=370-445, right code x=445-510, right values x=510-600) to prevent cross-column matching
- Reordered extraction: checkboxes now run before Part III to prevent the BOX_16_K3 checkbox X from being grabbed as a BOX_16 value
- Fixed stale fieldId refs: D_PARTNER_EIN->E_TIN, E_NAME->F_NAME_ADDR
- Removed SSL from client serve config for local HTTP development
- Changed API rootUrl from https to http for dev environment
- Added tools/extract-k1-positions.mjs utility for PDF coordinate dumps
4 months ago · 41659a9c5b
5 changed files with 512 additions and 395 deletions
--- a/apps/api/src/app/k1-import/extractors/k1-position-regions.ts
+++ b/apps/api/src/app/k1-import/extractors/k1-position-regions.ts
--- a/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
+++ b/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
@ -127,6 +127,10 @@ export class PdfParseExtractor implements K1Extractor {
      const fields: K1ExtractedField[] = [];
      const metadata = this.initMetadata();
+      // Checkboxes first — consume "X" marks before Part III so the
+      // BOX_16_K3 checkbox doesn't get grabbed as a BOX_16 value.
+      this.extractCheckboxes(dataItems, fields, metadata);
      // T007-T010 (US1): Part III extraction
      this.extractPartIII(dataItems, fields);
@ -136,9 +140,6 @@ export class PdfParseExtractor implements K1Extractor {
      // T015-T018 (US3): Sections J/K/L/M/N
      this.extractSections(dataItems, fields);
-      // T019-T020 (US4): Checkbox detection
-      this.extractCheckboxes(dataItems, fields, metadata);
      // T021 (US5): Unmapped items
      const unmappedItems = this.collectUnmappedItems(dataItems);
@ -543,9 +544,8 @@ export class PdfParseExtractor implements K1Extractor {
    this.extractTextMetadata(dataItems, 'C_IRS_CENTER', metadata, null);
    // Part II: Partner info
-    this.extractTextMetadata(dataItems, 'D_PARTNER_EIN', metadata, 'partnerEin');
+    this.extractTextMetadata(dataItems, 'E_TIN', metadata, 'partnerEin');
-    this.extractTextMetadata(dataItems, 'E_NAME', metadata, 'partnerName');
+    this.extractTextMetadata(dataItems, 'F_NAME_ADDR', metadata, 'partnerName');
    this.extractTextMetadata(dataItems, 'F_ADDR', metadata, null);
    // Extract remaining metadata text fields into the fields array
    const metadataRegions = K1_POSITION_REGIONS.filter(
--- a/apps/api/src/environments/environment.ts
+++ b/apps/api/src/environments/environment.ts
@ -2,6 +2,6 @@ import { DEFAULT_HOST } from '@ghostfolio/common/config';
 export const environment = {
  production: false,
-  rootUrl: `https://${DEFAULT_HOST}:4200`,
+  rootUrl: `http://${DEFAULT_HOST}:4200`,
  version: 'dev'
 };
--- a/apps/client/project.json
+++ b/apps/client/project.json
@ -215,10 +215,7 @@
      "executor": "@nx/angular:dev-server",
      "options": {
        "buildTarget": "client:build",
-        "proxyConfig": "apps/client/proxy.conf.json",
+        "proxyConfig": "apps/client/proxy.conf.json"
-        "ssl": true,
-        "sslCert": "apps/client/localhost.cert",
-        "sslKey": "apps/client/localhost.pem"
      },
      "configurations": {
        "development-ca": {
--- a/tools/extract-k1-positions.mjs
+++ b/tools/extract-k1-positions.mjs
@ -0,0 +1,74 @@
 /**
 * Utility to extract all text items with their (x, y) positions from a K-1 PDF.
 * This dumps every text item with coordinates so we can calibrate position regions.
 *
 * Output: one line per non-empty text item, tagged DATA or TMPL by font
 * family, for at most the first MAX_PAGES pages of the document.
 *
 * NOTE: the pdfjs-dist worker, fonts and cmaps are resolved relative to the
 * current working directory, so run this from the directory that contains
 * node_modules.
 *
 * Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
 */
 import { readFileSync } from 'fs';
 import { resolve } from 'path';
 import { pathToFileURL } from 'url';

 // Number of leading pages to dump.
 const MAX_PAGES = 2;

 // Validate the CLI argument before loading pdfjs, so a missing argument
 // always produces the usage message.
 const pdfPath = process.argv[2];
 if (!pdfPath) {
  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
  process.exit(1);
 }

 // Dynamic import of pdfjs-dist legacy build
 const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
 );

 // pathToFileURL yields a well-formed file: URL on every platform: it handles
 // Windows drive letters/backslashes and percent-encodes characters such as
 // spaces, '#' and '?', which manual 'file:///' concatenation does not.
 GlobalWorkerOptions.workerSrc = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
 ).href;

 const buffer = readFileSync(pdfPath);
 const loadingTask = getDocument({
  data: new Uint8Array(buffer),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
 });
 const pdfDoc = await loadingTask.promise;

 try {
  console.log(`Pages: ${pdfDoc.numPages}`);

  const lastPage = Math.min(pdfDoc.numPages, MAX_PAGES);
  for (let pageNum = 1; pageNum <= lastPage; pageNum++) {
    console.log(`\n=== PAGE ${pageNum} ===\n`);
    const page = await pdfDoc.getPage(pageNum);
    const textContent = await page.getTextContent({
      includeMarkedContent: false
    });
    const items = textContent.items;
    const styles = textContent.styles;

    // Sort by y descending (top of page first), then x ascending.
    // Items whose y values differ by 2 units or less are treated as being on
    // the same row and ordered left to right.
    const sorted = [...items].sort((a, b) => {
      const dy = b.transform[5] - a.transform[5];
      if (Math.abs(dy) > 2) return dy;
      return a.transform[4] - b.transform[4];
    });

    for (const item of sorted) {
      const text = item.str.trim();
      if (!text) continue; // skip whitespace-only items

      // Round coordinates to one decimal place for readable output.
      const x = Math.round(item.transform[4] * 10) / 10;
      const y = Math.round(item.transform[5] * 10) / 10;
      const style = styles[item.fontName] || {};
      const fontFamily = style.fontFamily || 'unknown';
      // Anything not reported as 'serif' is labelled DATA, the rest TMPL.
      // NOTE(review): presumably the form template is set in a serif font and
      // filled-in values are not — confirm against a real dump.
      const isData = fontFamily.toLowerCase() !== 'serif';
      console.log(
        `${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
      );
    }
  }
 } finally {
  // Release the document even if reading a page throws.
  await pdfDoc.destroy();
 }
 console.log('\nDone.');