Skip to content

Commit 268e4c0

Browse files
authored
Merge pull request #3 from flycatch/dev
[ Dev ] : extract text from scanned pdf
2 parents 4e1fa5a + 5628053 commit 268e4c0

File tree

3 files changed: 25 additions and 6 deletions.

3 files changed: 25 additions and 6 deletions.

Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,13 @@ FROM python:3.10-slim
44
# Set the working directory in the container
55
WORKDIR /app
66

7+
# Install system dependencies
8+
RUN apt-get update && apt-get install -y \
9+
poppler-utils \
10+
tesseract-ocr \
11+
libtesseract-dev \
12+
&& rm -rf /var/lib/apt/lists/*
13+
714
# Copy the current directory contents into the container at /app
815
COPY . /app
916

app/pdf_processor.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,27 @@
1-
import fitz
1+
import fitz # PyMuPDF
22
import re
33
import os
4+
import pytesseract
5+
from pdf2image import convert_from_path
6+
from PIL import Image
7+
48

59
def extract_text_from_pdf(file_path, pattern):
610
try:
711
pdf_file = fitz.open(file_path)
812
trans_numbers = []
913
page_numbers = []
10-
# Create regex dynamically based on the pattern
11-
regex = rf'{re.escape(pattern)}\s+(\d+)'
1214

13-
for number, page in enumerate(pdf_file):
14-
data = page.get_text("text")
15-
matches = re.findall(regex, data)
15+
# Convert each PDF page to an image
16+
images = convert_from_path(file_path, dpi=300) # High DPI for better OCR
17+
18+
for number, image in enumerate(images):
19+
# Convert image to text using Tesseract OCR
20+
text = pytesseract.image_to_string(image)
21+
22+
# Search for pattern in extracted text
23+
regex = rf'{re.escape(pattern)}\s+(\d+)'
24+
matches = re.findall(regex, text)
1625

1726
if matches:
1827
trans_numbers.append(matches[0])

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,5 @@
11
Flask==3.1.0
2+
pdf2image==1.17.0
3+
pillow==11.1.0
24
PyMuPDF==1.25.2
5+
pytesseract==0.3.13

0 commit comments

Comments
 (0)