File tree Expand file tree Collapse file tree 3 files changed +25
-6
lines changed
Expand file tree Collapse file tree 3 files changed +25
-6
lines changed Original file line number Diff line number Diff line change @@ -4,6 +4,13 @@ FROM python:3.10-slim
44# Set the working directory in the container
55WORKDIR /app
66
7+ # Install system dependencies
8+ RUN apt-get update && apt-get install -y \
9+ poppler-utils \
10+ tesseract-ocr \
11+ libtesseract-dev \
12+ && rm -rf /var/lib/apt/lists/*
13+
714# Copy the current directory contents into the container at /app
815COPY . /app
916
Original file line number Diff line number Diff line change 1- import fitz
1+ import fitz # PyMuPDF
22import re
33import os
4+ import pytesseract
5+ from pdf2image import convert_from_path
6+ from PIL import Image
7+
48
59def extract_text_from_pdf (file_path , pattern ):
610 try :
711 pdf_file = fitz .open (file_path )
812 trans_numbers = []
913 page_numbers = []
10- # Create regex dynamically based on the pattern
11- regex = rf'{ re .escape (pattern )} \s+(\d+)'
1214
13- for number , page in enumerate (pdf_file ):
14- data = page .get_text ("text" )
15- matches = re .findall (regex , data )
15+ # Convert each PDF page to an image
16+ images = convert_from_path (file_path , dpi = 300 ) # High DPI for better OCR
17+
18+ for number , image in enumerate (images ):
19+ # Convert image to text using Tesseract OCR
20+ text = pytesseract .image_to_string (image )
21+
22+ # Search for pattern in extracted text
23+ regex = rf'{ re .escape (pattern )} \s+(\d+)'
24+ matches = re .findall (regex , text )
1625
1726 if matches :
1827 trans_numbers .append (matches [0 ])
Original file line number Diff line number Diff line change 11Flask == 3.1.0
2+ pdf2image == 1.17.0
3+ pillow == 11.1.0
24PyMuPDF == 1.25.2
5+ pytesseract == 0.3.13
You can’t perform that action at this time.
0 commit comments