fix: recalibrate K-1 position regions from actual PDF coordinates

- Rewrote k1-position-regions.ts with all coordinates measured from actual e-filed K-1 PDF via pdfjs-dist text position extraction
- Fixed all y-coordinate offsets (header, Part I/II, sections J-N, Part III)
- Added missing regions: J_EXCHANGE, BOX_22, BOX_23, H2_DE_TIN
- Set clean column boundaries (left values x=370-445, right code x=445-510, right values x=510-600) to prevent cross-column matching
- Reordered extraction: checkboxes now run before Part III to prevent BOX_16_K3 checkbox X from being grabbed as a BOX_16 value
- Fixed stale fieldId refs: D_PARTNER_EIN->E_TIN, E_NAME->F_NAME_ADDR
- Removed SSL from client serve config for local HTTP development
- Changed API rootUrl from https to http for dev environment
- Added tools/extract-k1-positions.mjs utility for PDF coordinate dumps
4 months ago · 41659a9c5b
5 changed files with 512 additions and 395 deletions
--- a/apps/api/src/app/k1-import/extractors/k1-position-regions.ts
+++ b/apps/api/src/app/k1-import/extractors/k1-position-regions.ts
--- a/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
+++ b/apps/api/src/app/k1-import/extractors/pdf-parse-extractor.ts
@ -127,6 +127,10 @@ export class PdfParseExtractor implements K1Extractor {
      const fields: K1ExtractedField[] = [];
      const metadata = this.initMetadata();

+      // Checkboxes first — consume "X" marks before Part III so the
+      // BOX_16_K3 checkbox doesn't get grabbed as a BOX_16 value.
+      this.extractCheckboxes(dataItems, fields, metadata);
+
      // T007-T010 (US1): Part III extraction
      this.extractPartIII(dataItems, fields);

@ -136,9 +140,6 @@ export class PdfParseExtractor implements K1Extractor {
      // T015-T018 (US3): Sections J/K/L/M/N
      this.extractSections(dataItems, fields);

-      // T019-T020 (US4): Checkbox detection
-      this.extractCheckboxes(dataItems, fields, metadata);
-
      // T021 (US5): Unmapped items
      const unmappedItems = this.collectUnmappedItems(dataItems);

@ -543,9 +544,8 @@ export class PdfParseExtractor implements K1Extractor {
    this.extractTextMetadata(dataItems, 'C_IRS_CENTER', metadata, null);

    // Part II: Partner info
-    this.extractTextMetadata(dataItems, 'D_PARTNER_EIN', metadata, 'partnerEin');
-    this.extractTextMetadata(dataItems, 'E_NAME', metadata, 'partnerName');
-    this.extractTextMetadata(dataItems, 'F_ADDR', metadata, null);
+    this.extractTextMetadata(dataItems, 'E_TIN', metadata, 'partnerEin');
+    this.extractTextMetadata(dataItems, 'F_NAME_ADDR', metadata, 'partnerName');

    // Extract remaining metadata text fields into the fields array
    const metadataRegions = K1_POSITION_REGIONS.filter(
--- a/apps/api/src/environments/environment.ts
+++ b/apps/api/src/environments/environment.ts
@ -2,6 +2,6 @@ import { DEFAULT_HOST } from '@ghostfolio/common/config';

 export const environment = {
  production: false,
-  rootUrl: `https://${DEFAULT_HOST}:4200`,
+  rootUrl: `http://${DEFAULT_HOST}:4200`,
  version: 'dev'
 };
--- a/apps/client/project.json
+++ b/apps/client/project.json
@ -215,10 +215,7 @@
      "executor": "@nx/angular:dev-server",
      "options": {
        "buildTarget": "client:build",
-        "proxyConfig": "apps/client/proxy.conf.json",
-        "ssl": true,
-        "sslCert": "apps/client/localhost.cert",
-        "sslKey": "apps/client/localhost.pem"
+        "proxyConfig": "apps/client/proxy.conf.json"
      },
      "configurations": {
        "development-ca": {
--- a/tools/extract-k1-positions.mjs
+++ b/tools/extract-k1-positions.mjs
@ -0,0 +1,74 @@
+/**
+ * Dumps the text items of the first two pages of a K-1 PDF with their (x, y) positions so position regions can be calibrated.
+ * Prints one line per non-empty item: DATA/TMPL tag (TMPL when the font family is "serif"), x and y rounded to one decimal, font family, trimmed text.
+ * The pdfjs-dist worker, font and cMap paths are resolved from the current working directory, so run this where node_modules/ lives.
+ * Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
+ */
+import { readFileSync } from 'fs';
+import { resolve } from 'path';
+
+// Load the pdfjs-dist legacy build through a dynamic import()
+const { getDocument, GlobalWorkerOptions } = await import(
+  'pdfjs-dist/legacy/build/pdf.mjs'
+);
+
+const workerPath =
+  'file:///' +
+  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs').replace(
+    /\\/g,
+    '/'
+  );
+GlobalWorkerOptions.workerSrc = workerPath; // file:/// URL of the worker script, with backslashes replaced by forward slashes
+
+const pdfPath = process.argv[2]; // first command-line argument after the script path
+if (!pdfPath) {
+  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
+  process.exit(1); // no PDF path was supplied
+}
+
+const buffer = readFileSync(pdfPath); // the whole PDF is read into memory synchronously
+const loadingTask = getDocument({
+  data: new Uint8Array(buffer),
+  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
+  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
+  cMapPacked: true,
+  isEvalSupported: false,
+  disableFontFace: true
+});
+
+const pdfDoc = await loadingTask.promise;
+console.log(`Pages: ${pdfDoc.numPages}`);
+
+for (let pageNum = 1; pageNum <= Math.min(pdfDoc.numPages, 2); pageNum++) { // at most the first two pages are dumped
+  console.log(`\n=== PAGE ${pageNum} ===\n`);
+  const page = await pdfDoc.getPage(pageNum);
+  const textContent = await page.getTextContent({ includeMarkedContent: false });
+
+  const items = textContent.items;
+  const styles = textContent.styles;
+
+  // Sort by y descending (top of page first), then x ascending; items whose y differs by 2 or less count as the same row
+  const sorted = [...items].sort((a, b) => { // sorts a copy, so textContent.items keeps its original order
+    const dy = b.transform[5] - a.transform[5]; // transform[5] is used as the y position, transform[4] as the x position
+    if (Math.abs(dy) > 2) return dy;
+    return a.transform[4] - b.transform[4];
+  });
+
+  for (const item of sorted) {
+    const text = item.str.trim();
+    if (!text) continue; // skip items that are empty after trimming
+
+    const x = Math.round(item.transform[4] * 10) / 10; // rounded to one decimal place
+    const y = Math.round(item.transform[5] * 10) / 10;
+    const style = styles[item.fontName] || {};
+    const fontFamily = style.fontFamily || 'unknown';
+    const isData = fontFamily.toLowerCase() !== 'serif'; // NOTE(review): presumably form template text reports "serif" and filled-in data does not — confirm
+
+    console.log(
+      `${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
+    );
+  }
+}
+
+await pdfDoc.destroy();
+console.log('\nDone.');