mirror of https://github.com/ghostfolio/ghostfolio
Browse Source
- Rewrote k1-position-regions.ts with all coordinates measured from actual e-filed K-1 PDF via pdfjs-dist text position extraction
- Fixed all y-coordinate offsets (header, Part I/II, sections J-N, Part III)
- Added missing regions: J_EXCHANGE, BOX_22, BOX_23, H2_DE_TIN
- Set clean column boundaries (left values x=370-445, right code x=445-510, right values x=510-600) to prevent cross-column matching
- Reordered extraction: checkboxes now run before Part III to prevent BOX_16_K3 checkbox X from being grabbed as a BOX_16 value
- Fixed stale fieldId refs: D_PARTNER_EIN->E_TIN, E_NAME->F_NAME_ADDR
- Removed SSL from client serve config for local HTTP development
- Changed API rootUrl from https to http for dev environment
- Added tools/extract-k1-positions.mjs utility for PDF coordinate dumps
pull/6701/head
5 changed files with 512 additions and 395 deletions
File diff suppressed because it is too large
@ -0,0 +1,74 @@ |
|||||
|
/**
 * Utility to extract all text items with their (x, y) positions from a K-1 PDF.
 * This dumps every text item with coordinates so we can calibrate position regions.
 *
 * Output: one line per non-empty text item on the first two pages, ordered
 * top-to-bottom and then left-to-right. Each line carries a DATA/TMPL tag,
 * the x and y coordinates (rounded to one decimal), the font family reported
 * by pdf.js and the trimmed text.
 *
 * Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
 */
import { readFileSync } from 'fs';
import { resolve } from 'path';
import { pathToFileURL } from 'url';

// Dynamic import of pdfjs-dist legacy build
const { getDocument, GlobalWorkerOptions } = await import(
  'pdfjs-dist/legacy/build/pdf.mjs'
);

// Build the worker location with pathToFileURL rather than by concatenating
// 'file:///' with the path: it yields a well-formed file URL on both POSIX
// and Windows and percent-encodes characters such as spaces, '#' and '%'.
GlobalWorkerOptions.workerSrc = pathToFileURL(
  resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs')
).href;

const pdfPath = process.argv[2];
if (!pdfPath) {
  console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
  process.exit(1);
}

const buffer = readFileSync(pdfPath);
const loadingTask = getDocument({
  data: new Uint8Array(buffer),
  standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
  cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
  cMapPacked: true,
  isEvalSupported: false,
  disableFontFace: true
});

const pdfDoc = await loadingTask.promise;

try {
  console.log(`Pages: ${pdfDoc.numPages}`);

  // Only the first two pages are dumped.
  const lastPage = Math.min(pdfDoc.numPages, 2);

  for (let pageNum = 1; pageNum <= lastPage; pageNum++) {
    console.log(`\n=== PAGE ${pageNum} ===\n`);

    const page = await pdfDoc.getPage(pageNum);
    const textContent = await page.getTextContent({
      includeMarkedContent: false
    });

    const items = textContent.items;
    const styles = textContent.styles;

    // Sort by y descending (top of page first), then x ascending.
    // Items whose y differs by at most 2 units are treated as the same row.
    const sorted = [...items].sort((a, b) => {
      const dy = b.transform[5] - a.transform[5];
      if (Math.abs(dy) > 2) return dy;
      return a.transform[4] - b.transform[4];
    });

    for (const item of sorted) {
      const text = item.str.trim();
      if (!text) continue;

      // transform[4] / transform[5] are used as the item's x / y position.
      const x = Math.round(item.transform[4] * 10) / 10;
      const y = Math.round(item.transform[5] * 10) / 10;
      const style = styles[item.fontName] || {};
      const fontFamily = style.fontFamily || 'unknown';

      // Anything not reported as 'serif' is tagged DATA, the rest TMPL.
      // NOTE(review): presumably the printed form template is the serif text
      // and filled-in values use another family — confirm against a sample.
      const isData = fontFamily.toLowerCase() !== 'serif';

      console.log(
        `${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
      );
    }
  }
} finally {
  // Release the document even when extraction fails part-way through.
  await pdfDoc.destroy();
}

console.log('\nDone.');
||||
Loading…
Reference in new issue