You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

74 lines
2.3 KiB

/**
* Utility to extract all text items with their (x, y) positions from a K-1 PDF.
* This dumps every text item with coordinates so we can calibrate position regions.
*
* Usage: node tools/extract-k1-positions.mjs <path-to-pdf>
*/
import { readFileSync } from 'fs';
import { resolve } from 'path';
// Dynamic import of pdfjs-dist legacy build
const { getDocument, GlobalWorkerOptions } = await import(
'pdfjs-dist/legacy/build/pdf.mjs'
);
const workerPath =
'file:///' +
resolve('node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs').replace(
/\\/g,
'/'
);
GlobalWorkerOptions.workerSrc = workerPath;
const pdfPath = process.argv[2];
if (!pdfPath) {
console.error('Usage: node tools/extract-k1-positions.mjs <path-to-pdf>');
process.exit(1);
}
const buffer = readFileSync(pdfPath);
const loadingTask = getDocument({
data: new Uint8Array(buffer),
standardFontDataUrl: resolve('node_modules/pdfjs-dist/standard_fonts') + '/',
cMapUrl: resolve('node_modules/pdfjs-dist/cmaps') + '/',
cMapPacked: true,
isEvalSupported: false,
disableFontFace: true
});
const pdfDoc = await loadingTask.promise;
console.log(`Pages: ${pdfDoc.numPages}`);
for (let pageNum = 1; pageNum <= Math.min(pdfDoc.numPages, 2); pageNum++) {
console.log(`\n=== PAGE ${pageNum} ===\n`);
const page = await pdfDoc.getPage(pageNum);
const textContent = await page.getTextContent({ includeMarkedContent: false });
const items = textContent.items;
const styles = textContent.styles;
// Sort by y descending (top of page first), then x ascending
const sorted = [...items].sort((a, b) => {
const dy = b.transform[5] - a.transform[5];
if (Math.abs(dy) > 2) return dy;
return a.transform[4] - b.transform[4];
});
for (const item of sorted) {
const text = item.str.trim();
if (!text) continue;
const x = Math.round(item.transform[4] * 10) / 10;
const y = Math.round(item.transform[5] * 10) / 10;
const style = styles[item.fontName] || {};
const fontFamily = style.fontFamily || 'unknown';
const isData = fontFamily.toLowerCase() !== 'serif';
console.log(
`${isData ? 'DATA' : 'TMPL'} | x=${String(x).padStart(7)} | y=${String(y).padStart(7)} | font=${fontFamily.padEnd(15)} | "${text}"`
);
}
}
await pdfDoc.destroy();
console.log('\nDone.');