maybe-ats-parse/parsepdf.js at main · treejamie/maybe-ats-parse · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env node
/**
 * PDF to Text parser - mimics ATS (Applicant Tracking System) text extraction.
 * Uses three different PDF parsing libraries for comparison.
 */

import { readFile, writeFile } from 'fs/promises';
import { existsSync } from 'fs';
import { basename, dirname, join } from 'path';
import { createRequire } from 'module';

// pdf-parse
import pdfParse from 'pdf-parse/lib/pdf-parse.js';

// pdfjs-dist
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';

// pdf2json
const require = createRequire(import.meta.url);
const PDFParser = require('pdf2json');

/**
 * Builds the path of the text file that a given library's output is written to.
 *
 * The output sits next to the input file and is named
 * `<input name without .pdf>.<library>.txt`.
 *
 * @param {string} inputPath - Path of the source PDF.
 * @param {string} library - Short label of the parsing library (used in the file name).
 * @returns {string} Path of the output text file.
 */
function getOutputPath(inputPath, library) {
  const dir = dirname(inputPath);
  const fileName = basename(inputPath);
  // Strip the extension case-insensitively so `resume.PDF` and `resume.pdf`
  // both produce `resume.<library>.txt`.
  const stem = fileName.replace(/\.pdf$/i, '');
  // A file literally named `.pdf` has no stem; keep the full name in that case.
  const name = stem === '' ? fileName : stem;
  return join(dir, `${name}.${library}.txt`);
}

/**
 * Extracts text from a PDF using the pdf-parse library.
 *
 * @param {string} pdfPath - Path of the PDF file to read.
 * @returns {Promise<string>} The `text` property of the pdf-parse result.
 */
async function parseWithPdfParse(pdfPath) {
  const parseOptions = {
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  };
  const pdfBytes = await readFile(pdfPath);
  const { text } = await pdfParse(pdfBytes, parseOptions);
  return text;
}

/**
 * Extracts text from a PDF using pdfjs-dist.
 *
 * Text items of each page are joined with a single space, and pages are
 * joined with a newline.
 *
 * @param {string} pdfPath - Path of the PDF file to read.
 * @returns {Promise<string>} The extracted text, one line per page.
 */
async function parseWithPdfjs(pdfPath) {
  const dataBuffer = await readFile(pdfPath);
  // Hand pdf.js a plain Uint8Array copy rather than the Node Buffer itself.
  const loadingTask = getDocument({ data: new Uint8Array(dataBuffer) });

  try {
    const doc = await loadingTask.promise;

    const textParts = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const { items } = await page.getTextContent();
      textParts.push(items.map((item) => item.str).join(' '));
    }

    return textParts.join('\n');
  } finally {
    // Release the document and its worker resources even when loading or
    // text extraction fails part-way through.
    await loadingTask.destroy();
  }
}

/**
 * Decodes one pdf2json text run.
 *
 * Falls back to the raw string when it is not valid percent-encoding
 * (for example a literal `%`), instead of throwing.
 *
 * @param {string} encoded - The `T` value of a text run.
 * @returns {string} The decoded text, or the input unchanged if undecodable.
 */
function decodePdf2jsonText(encoded) {
  try {
    return decodeURIComponent(encoded);
  } catch (err) {
    if (err instanceof URIError) {
      return encoded;
    }
    throw err;
  }
}

/**
 * Flattens a pdf2json result into a single string.
 *
 * Every non-empty text run is collected, a '\n' entry is appended after each
 * page, and all entries are joined with a single space.
 *
 * @param {object} pdfData - Parsed document as delivered by `pdfParser_dataReady`.
 * @returns {string} The concatenated text.
 */
function collectPdf2jsonText(pdfData) {
  const textParts = [];
  for (const page of pdfData?.Pages ?? []) {
    for (const textItem of page.Texts ?? []) {
      for (const run of textItem.R ?? []) {
        if (run.T) {
          textParts.push(decodePdf2jsonText(run.T));
        }
      }
    }
    textParts.push('\n');
  }
  return textParts.join(' ');
}

/**
 * Extracts text from a PDF using pdf2json.
 *
 * Wraps pdf2json's event-based API in a promise.
 *
 * @param {string} pdfPath - Path of the PDF file to parse.
 * @returns {Promise<string>} The extracted text.
 * @throws {Error} Rejects with an Error when pdf2json reports a parse error
 *   or when the parsed data cannot be processed.
 */
async function parseWithPdf2json(pdfPath) {
  return new Promise((resolve, reject) => {
    // NOTE(review): the original comment read "1 = don't combine text items";
    // this argument is presumably pdf2json's raw-text flag — confirm the intent.
    const pdfParser = new PDFParser(null, 1);

    pdfParser.on('pdfParser_dataError', (err) => {
      // Always reject with an Error so callers can rely on `.message`.
      const cause = err?.parserError ?? err;
      reject(
        cause instanceof Error
          ? cause
          : new Error(String(cause ?? 'pdf2json failed to parse the PDF')),
      );
    });

    pdfParser.on('pdfParser_dataReady', (pdfData) => {
      // An exception thrown inside an event handler would otherwise escape the
      // emitter and leave this promise pending forever.
      try {
        resolve(collectPdf2jsonText(pdfData));
      } catch (err) {
        reject(err);
      }
    });

    // loadPDF may return a promise in some pdf2json versions (TODO confirm);
    // route any rejection into this promise instead of leaving it floating.
    Promise.resolve(pdfParser.loadPDF(pdfPath)).catch(reject);
  });
}

/**
 * CLI entry point.
 *
 * Reads the PDF path from the first command-line argument, runs each parser
 * in turn and writes each result to its own text file next to the input.
 * A failure in one parser is reported and does not stop the others, but it
 * makes the process finish with a non-zero exit code.
 *
 * Exits immediately with code 1 when no argument is given or the file does
 * not exist, and with code 0 after printing usage for `-h` / `--help`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const wantsHelp = args.includes('-h') || args.includes('--help');

  if (args.length === 0 || wantsHelp) {
    console.log('Usage: node parsepdf.js <pdf_file>');
    console.log('Output: Creates three files:');
    console.log('  <filename>.pdfparse.txt');
    console.log('  <filename>.pdfjs.txt');
    console.log('  <filename>.pdf2json.txt');
    process.exit(args.length === 0 ? 1 : 0);
  }

  const [pdfPath] = args;

  if (!existsSync(pdfPath)) {
    console.error(`Error: File not found: ${pdfPath}`);
    process.exit(1);
  }

  const parsers = [
    { name: 'pdfparse', fn: parseWithPdfParse },
    { name: 'pdfjs', fn: parseWithPdfjs },
    { name: 'pdf2json', fn: parseWithPdf2json },
  ];

  for (const { name, fn } of parsers) {
    try {
      const text = await fn(pdfPath);
      const outputPath = getOutputPath(pdfPath, name);
      await writeFile(outputPath, text, 'utf-8');
      console.log(`Output written to: ${outputPath}`);
    } catch (err) {
      // A rejection is not guaranteed to be an Error instance, so fall back
      // to its string form rather than printing "undefined".
      console.error(`Error with ${name}: ${err?.message ?? String(err)}`);
      // Keep going with the remaining parsers, but do not report success.
      process.exitCode = 1;
    }
  }
}

// Do not leave the promise floating: report anything unexpected and fail.
main().catch((err) => {
  console.error(`Error: ${err?.message ?? String(err)}`);
  process.exitCode = 1;
});