-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf2text.py
More file actions
63 lines (56 loc) · 2.48 KB
/
pdf2text.py
File metadata and controls
63 lines (56 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/python
"""Module to convert a PDF file to a Text file using OCR."""
import os
import sys
import tempfile
import pytesseract
from PIL import Image
import fitz # this is pymupdf
from tqdm.autonotebook import tqdm
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
# Set the path to the installed tesseract OCR
pytesseract.pytesseract.Py = os.environ.get('TESSERACT_CMD', 'tesseract')
if not os.path.exists(pytesseract.pytesseract.Py):
print(f"Error: Tesseract not found at {pytesseract.pytesseract.Py}")
print("Please install Tesseract OCR and set the TESSERACT_CMD environment variable "
"to the .env file (or to the path of the installed tesseract executable).")
sys.exit(1)
def pdf_to_text(file_path :str, text_path :str) ->None:
"""Convert a PDF file to a Text file using OCR."""
print(f"Converting PDF {file_path} to Text {text_path} ...")
temp_dir = tempfile.gettempdir() # Returns the location of the temporary directory
doc = fitz.open(file_path)
for i in tqdm(range(len(doc)), unit="page", desc="Extracting pages"):
for img in doc.get_page_images(i):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
if pix.n < 5: # this is GRAY or RGB
pix._writeIMG(f"{temp_dir}/p{i}.png", "png", 94)
else: # CMYK: convert to RGB first
pix1 = fitz.Pixmap(fitz.csRGB, pix)
pix1._writeIMG(f"{temp_dir}/p{i}.png", "png", 94)
pix1 = None
pix = None
for i in tqdm(range(len(doc)), unit="page", desc="Extracting text"):
image_path = f"{temp_dir}/p{i}.png"
text = doc.load_page(i).get_text()
if os.path.exists(image_path):
# Recognize the text as string in image using pytesserct
text += pytesseract.image_to_string(Image.open(image_path))
# Save the text in a Text file
with open(text_path, "w" if i == 0 else "a", encoding="utf-8") as f:
f.write(text)
f.write(f"\n\n<!-- Page {i+1} -->\n\n")
if os.path.exists(image_path):
os.remove(image_path) # remove image after text extraction
if __name__ == "__main__":
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <pdf_file_path> [<text_file_path>]")
sys.exit(1)
source = sys.argv[1]
if len(sys.argv) > 2:
target = sys.argv[2]
else:
target = source.replace(".pdf", ".txt")
pdf_to_text(source, target)