-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparsepdf.js
More file actions
executable file
·123 lines (104 loc) · 3.4 KB
/
parsepdf.js
File metadata and controls
executable file
·123 lines (104 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env node
/**
* PDF to Text parser - mimics ATS (Applicant Tracking System) text extraction.
* Uses three different PDF parsing libraries for comparison.
*/
import { readFile, writeFile } from 'fs/promises';
import { existsSync } from 'fs';
import { basename, dirname, join } from 'path';
import { createRequire } from 'module';
// pdf-parse
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
// pdfjs-dist
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
// pdf2json
const require = createRequire(import.meta.url);
const PDFParser = require('pdf2json');
/**
 * Build the path of the text file that a given parser writes its output to.
 *
 * The output file is placed in the same directory as the input and is named
 * `<input name without its .pdf extension>.<library>.txt`.
 *
 * @param {string} inputPath - Path to the source PDF file.
 * @param {string} library - Short parser label used as the middle suffix.
 * @returns {string} Path of the output text file.
 */
function getOutputPath(inputPath, library) {
  const dir = dirname(inputPath);
  const fileName = basename(inputPath);
  // Strip the extension case-insensitively so `Resume.PDF` becomes
  // `Resume.<library>.txt` instead of `Resume.PDF.<library>.txt`.
  // If stripping would leave an empty name, keep the full file name.
  const stem = fileName.replace(/\.pdf$/i, '') || fileName;
  return join(dir, `${stem}.${library}.txt`);
}
/**
 * Extract text from a PDF file with the pdf-parse library.
 *
 * Reads the whole file into memory, hands the buffer to `pdfParse`, and
 * returns the `text` property of the result.
 *
 * @param {string} pdfPath - Path to the PDF file to read.
 * @returns {Promise<string>} The text reported by pdf-parse.
 */
async function parseWithPdfParse(pdfPath) {
  // NOTE(review): pdf-parse documents `pagerender`, `max` and `version` as
  // its options; confirm that these two flags are honoured when passed here.
  const parseOptions = {
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  };
  const pdfBytes = await readFile(pdfPath);
  const { text } = await pdfParse(pdfBytes, parseOptions);
  return text;
}
/**
 * Extract text from a PDF file with pdfjs-dist.
 *
 * Text items on a page are joined with a single space, and pages are joined
 * with a newline. The pdf.js loading task is always destroyed afterwards,
 * whether extraction succeeded or failed, so its resources are released.
 *
 * @param {string} pdfPath - Path to the PDF file to read.
 * @returns {Promise<string>} The extracted text, one line per page.
 */
async function parseWithPdfjs(pdfPath) {
  const fileBytes = await readFile(pdfPath);
  // getDocument is given a Uint8Array copy of the file contents.
  const loadingTask = getDocument({ data: new Uint8Array(fileBytes) });
  try {
    const doc = await loadingTask.promise;
    const pageTexts = [];
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const { items } = await page.getTextContent();
      pageTexts.push(items.map((item) => item.str).join(' '));
    }
    return pageTexts.join('\n');
  } finally {
    // Release the document even when a page fails to load or render.
    await loadingTask.destroy();
  }
}
/**
 * Decode one URI-encoded text run, falling back to the raw value when the
 * run is not valid URI encoding (for example a literal `%`).
 *
 * @param {string} encoded - Text run as found in the parser output.
 * @returns {string} Decoded text, or the input unchanged if it cannot be decoded.
 */
function decodePdf2jsonText(encoded) {
  try {
    return decodeURIComponent(encoded);
  } catch (err) {
    if (err instanceof URIError) {
      return encoded;
    }
    throw err;
  }
}

/**
 * Extract text from a PDF file with pdf2json.
 *
 * Every text run of every page is decoded and collected, a `'\n'` entry is
 * appended after each page, and all entries are joined with a single space.
 *
 * @param {string} pdfPath - Path to the PDF file to load.
 * @returns {Promise<string>} The extracted text.
 * @throws {Error} Rejects with an Error when the parser reports a failure or
 *   when the parsed data cannot be processed.
 */
async function parseWithPdf2json(pdfPath) {
  return new Promise((resolve, reject) => {
    // NOTE(review): the second constructor argument looks like pdf2json's
    // raw-text flag rather than a "combine text items" switch — confirm
    // against the installed pdf2json version.
    const pdfParser = new PDFParser(null, 1);

    pdfParser.on('pdfParser_dataError', (err) => {
      // The payload may wrap the failure in `parserError` or be the error
      // itself; always reject with an Error so callers can read `.message`.
      const cause = err?.parserError ?? err;
      reject(
        cause instanceof Error
          ? cause
          : new Error(String(cause ?? 'Unknown pdf2json parser error')),
      );
    });

    pdfParser.on('pdfParser_dataReady', (pdfData) => {
      // An exception thrown inside an event handler would bypass the promise,
      // so convert it into a rejection explicitly.
      try {
        const textParts = [];
        for (const page of pdfData?.Pages ?? []) {
          for (const textItem of page.Texts ?? []) {
            for (const run of textItem.R ?? []) {
              if (run.T) {
                textParts.push(decodePdf2jsonText(run.T));
              }
            }
          }
          // Page separator entry.
          textParts.push('\n');
        }
        resolve(textParts.join(' '));
      } catch (err) {
        reject(err);
      }
    });

    pdfParser.loadPDF(pdfPath);
  });
}
/**
 * CLI entry point: run each PDF parser on the file named by the first
 * command-line argument and write each result to its own text file.
 *
 * With no arguments, or with `-h` / `--help`, the usage text is printed and
 * the process exits (status 1 when no arguments were given, 0 otherwise).
 * A missing input file exits with status 1. A failing parser is reported on
 * stderr and the remaining parsers still run; the process exit code is then
 * set to 1 so callers can detect the partial failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const wantsHelp = args.includes('-h') || args.includes('--help');

  if (args.length === 0 || wantsHelp) {
    console.log('Usage: node parsepdf.js <pdf_file>');
    console.log('Output: Creates three files:');
    console.log(' <filename>.pdfparse.txt');
    console.log(' <filename>.pdfjs.txt');
    console.log(' <filename>.pdf2json.txt');
    process.exit(args.length === 0 ? 1 : 0);
  }

  const pdfPath = args[0];
  if (!existsSync(pdfPath)) {
    console.error(`Error: File not found: ${pdfPath}`);
    process.exit(1);
  }

  const parsers = [
    { name: 'pdfparse', fn: parseWithPdfParse },
    { name: 'pdfjs', fn: parseWithPdfjs },
    { name: 'pdf2json', fn: parseWithPdf2json },
  ];

  // Parsers run one after another so the log output keeps a stable order.
  for (const { name, fn } of parsers) {
    try {
      const text = await fn(pdfPath);
      const outputPath = getOutputPath(pdfPath, name);
      await writeFile(outputPath, text, 'utf-8');
      console.log(`Output written to: ${outputPath}`);
    } catch (err) {
      // A rejection value is not guaranteed to be an Error, so read
      // `.message` defensively instead of throwing from the handler.
      console.error(`Error with ${name}: ${err?.message ?? err}`);
      // Keep going with the other parsers, but report the failure to the shell.
      process.exitCode = 1;
    }
  }
}
// Run the CLI. Any failure that escapes main() is reported here and turned
// into a failing exit code instead of being left as an unhandled rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});