mirror of https://github.com/ghostfolio/ghostfolio
Browse Source
- Rewrote k1-position-regions.ts with all coordinates measured from an actual e-filed K-1 PDF via pdfjs-dist text position extraction
- Fixed all y-coordinate offsets (header, Part I/II, sections J-N, Part III)
- Added missing regions: J_EXCHANGE, BOX_22, BOX_23, H2_DE_TIN
- Set clean column boundaries (left values x=370-445, right code x=445-510, right values x=510-600) to prevent cross-column matching
- Reordered extraction: checkboxes now run before Part III to prevent the BOX_16_K3 checkbox X from being grabbed as a BOX_16 value
- Fixed stale fieldId refs: D_PARTNER_EIN->E_TIN, E_NAME->F_NAME_ADDR
- Removed SSL from client serve config for local HTTP development
- Changed API rootUrl from https to http for dev environment
- Added tools/extract-k1-positions.mjs utility for PDF coordinate dumps
pull/6701/head
5 changed files with 512 additions and 395 deletions
File diff suppressed because it is too large
@ -0,0 +1,74 @@ |
|||
/** |
|||
* Utility to extract all text items with their (x, y) positions from a K-1 PDF. |
|||
* This dumps every text item with coordinates so we can calibrate position regions. |
|||
* |
|||
* Usage: node tools/extract-k1-positions.mjs <path-to-pdf> |
|||
*/ |
|||
import { readFileSync } from 'fs'; |
|||
import { resolve } from 'path'; |
|||
|
|||
// Dynamic import of the pdfjs-dist legacy build.
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);

// Loaded dynamically (like pdfjs above) so this block brings its own helper
// into scope without touching the static imports at the top of the file.
const { pathToFileURL } = await import('url');

// Absolute file: URL of the pdfjs worker module, resolved against the current
// working directory. pathToFileURL() is used instead of hand-concatenating
// 'file:///' with the path: the manual form produced 'file:////...' on POSIX
// (the resolved path already starts with '/') and left characters such as
// '#', '?', '%' and spaces unescaped. On Windows the result is still of the
// form 'file:///C:/...'.
const workerPath = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;

GlobalWorkerOptions.workerSrc = workerPath;
|||
|
|||
// The PDF to inspect is the first command-line argument after the script
// name; without it, print the usage line and exit with status 1.
const [, , pdfPath] = process.argv;

if (!pdfPath) {
  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
  process.exit(1);
}
|||
|
|||
// Options handed to pdfjs getDocument(): the raw file bytes plus the
// standard-font and CMap directories, resolved against the current working
// directory with a trailing slash appended.
// NOTE(review): isEvalSupported / disableFontFace are passed through as-is;
// see the pdfjs getDocument() documentation for their exact effect.
const documentOptions = {
  data: new Uint8Array(readFileSync(pdfPath)),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
};

// Wait for the document to finish loading, then report its page count.
const pdfDoc = await getDocument(documentOptions).promise;

console.log(`Pages: ${pdfDoc.numPages}`);
|||
|
|||
// At most the first two pages of the document are dumped.
const lastPageNum = Math.min(pdfDoc.numPages, 2);

// Rounds a coordinate to one decimal place.
const roundToTenth = (value) => Math.round(value * 10) / 10;

for (let pageNum = 1; pageNum <= lastPageNum; pageNum++) {
  console.log(`\n=== PAGE ${pageNum} ===\n`);

  const page = await pdfDoc.getPage(pageNum);
  const { items, styles } = await page.getTextContent({
    includeMarkedContent: false
  });

  // Order a copy of the items by y descending (top of page first); items
  // whose y values differ by no more than 2 are ordered by x ascending.
  // NOTE(review): because of the 2-unit tolerance this comparator is not
  // transitive (y=0 ~ y=1.5 ~ y=3, yet y=0 and y=3 compare as different
  // rows), so ordering across closely stacked rows is engine-dependent.
  const ordered = Array.from(items).sort((a, b) => {
    const yGap = b.transform[5] - a.transform[5];
    return Math.abs(yGap) > 2 ? yGap : a.transform[4] - b.transform[4];
  });

  for (const { str, transform, fontName } of ordered) {
    const text = str.trim();

    // Whitespace-only items produce no output line.
    if (text) {
      const x = roundToTenth(transform[4]);
      const y = roundToTenth(transform[5]);
      const fontFamily = (styles[fontName] || {}).fontFamily || 'unknown';

      // Items in the 'serif' family are tagged TMPL, everything else DATA
      // (presumably the printed form template is the serif text — confirm).
      const tag = fontFamily.toLowerCase() === 'serif' ? 'TMPL' : 'DATA';

      console.log(
        `${tag} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
      );
    }
  }
}
|||
|
|||
// Tear down the loaded document, then signal that the dump is complete.
await pdfDoc.destroy();

console.log('\nDone.');
|||
Loading…
Reference in new issue