-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse
More file actions
executable file
·96 lines (83 loc) · 2.53 KB
/
parse
File metadata and controls
executable file
·96 lines (83 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/bash
#
# Unified PDF parser - runs 8 different PDF parsing libraries
#
# Usage:
# ./parse <pdf_file>
#
# Output (8 files):
# Python:
# <filename>.pdfminer.txt - pdfminer.six
# <filename>.pymupdf.txt - PyMuPDF
# <filename>.pdfplumber.txt - pdfplumber
# JavaScript:
# <filename>.pdfparse.txt - pdf-parse
# <filename>.pdfjs.txt - pdfjs-dist
# <filename>.pdf2json.txt - pdf2json
# Go:
# <filename>.ledongthuc.txt - ledongthuc/pdf
# CLI:
# <filename>.pdftotext.txt - poppler (very common in enterprise ATS)
#
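# Note: A relative <pdf_file> is looked up in the Desktop folder (~/Desktop) by
# default; set PARSEPDF_DIR to use another directory. Absolute paths are used
# as given. The pdftotext output is written next to the input PDF.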
#
set -e

# Directory holding this script; parsepdf.py, parsepdf.js and parsepdf-go are
# invoked from it. CDPATH is cleared so that `cd` can neither print the target
# directory into the substitution nor resolve it against an unrelated path.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"

# Directory that non-absolute PDF names are resolved against.
# Taken from PARSEPDF_DIR when set and non-empty, otherwise ~/Desktop.
DEFAULT_DIR="${PARSEPDF_DIR:-$HOME/Desktop}"

#######################################
# Print the help text.
# Globals:   DEFAULT_DIR (read)
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  cat <<EOF
Usage: ./parse <pdf_file>

Output (8 files):

 Python:
 <filename>.pdfminer.txt - pdfminer.six
 <filename>.pymupdf.txt - PyMuPDF
 <filename>.pdfplumber.txt - pdfplumber

 JavaScript:
 <filename>.pdfparse.txt - pdf-parse
 <filename>.pdfjs.txt - pdfjs-dist
 <filename>.pdf2json.txt - pdf2json

 Go:
 <filename>.ledongthuc.txt - ledongthuc/pdf

 CLI:
 <filename>.pdftotext.txt - poppler (common in enterprise ATS)

Note: Default directory is $DEFAULT_DIR
 ./parse foo.pdf looks for $DEFAULT_DIR/foo.pdf
EOF
}

#######################################
# Pick the Python interpreter command to use.
# `python` is preferred so existing setups keep their interpreter; `python3`
# is the fallback for systems that do not provide an unversioned `python`.
# Arguments: none
# Outputs:   the command name on stdout
# Returns:   0 if an interpreter was found, 1 otherwise
#######################################
find_python() {
  local candidate
  for candidate in python python3; do
    if command -v "$candidate" > /dev/null 2>&1; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

#######################################
# Entry point: run every parser against one PDF.
# Globals:   SCRIPT_DIR, DEFAULT_DIR (read)
# Arguments: $1 - PDF path (relative names are resolved against DEFAULT_DIR),
#                 or -h / --help; with no arguments the help text is shown
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success or help, 1 if the PDF does not exist, 127 if no
#            Python interpreter is available; because of `set -e` a failing
#            parser aborts the script with that parser's exit status
#######################################
main() {
  if [[ $# -eq 0 || "$1" == "-h" || "$1" == "--help" ]]; then
    usage
    return 0
  fi

  local pdf_file="$1"
  # Anything that is not an absolute path is looked up under DEFAULT_DIR,
  # not under the current working directory.
  if [[ "$pdf_file" != /* ]]; then
    pdf_file="$DEFAULT_DIR/$pdf_file"
  fi

  if [[ ! -f "$pdf_file" ]]; then
    # Diagnostics belong on stderr so stdout stays clean for callers.
    echo "Error: File not found: $pdf_file" >&2
    return 1
  fi

  # Output path for pdftotext: the input path with a trailing ".pdf" removed
  # (if present) plus ".pdftotext.txt".
  local pdftotext_output="${pdf_file%.pdf}.pdftotext.txt"

  echo "Parsing: $pdf_file"
  echo ""

  echo "=== Python parsers ==="
  local python_cmd
  if ! python_cmd="$(find_python)"; then
    echo "Error: neither 'python' nor 'python3' was found on PATH" >&2
    return 127
  fi
  "$python_cmd" "$SCRIPT_DIR/parsepdf.py" "$pdf_file"
  echo ""

  echo "=== JavaScript parsers ==="
  node "$SCRIPT_DIR/parsepdf.js" "$pdf_file"
  echo ""

  echo "=== Go parser ==="
  "$SCRIPT_DIR/parsepdf-go" "$pdf_file"
  echo ""

  echo "=== CLI parser (poppler) ==="
  # pdftotext is optional: when it is missing, print a hint and carry on.
  if command -v pdftotext > /dev/null 2>&1; then
    pdftotext -layout "$pdf_file" "$pdftotext_output"
    echo "Output written to: $pdftotext_output"
  else
    echo "pdftotext not found - install with: brew install poppler"
  fi

  echo ""
  echo "Done. Compare the outputs to see how different ATS systems parse your PDF."
}

main "$@"