Skip to content

Commit 79cf344

Browse files
authored
Revert "WIP: PDF Processing Improvements - Table Extraction"
1 parent bf13294 commit 79cf344

2 files changed

Lines changed: 8 additions & 142 deletions

File tree

requirements.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,4 @@ numpy
1515
google-api-python-client
1616
ollama
1717
pydantic
18-
camelot-py[base]
19-
opencv-python
20-
pymupdf_layout
2118
xgboost

src/preprocessing/pdf_text_extraction.py

Lines changed: 8 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -6,140 +6,24 @@
66
This script uses PyMuPDF for accurate and efficient text extraction from
77
scientific PDFs. It preserves reading order, handles multi-column text, and
88
automatically applies OCR when a page contains only images (e.g., scanned documents).
9-
Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback.
109
"""
1110

11+
# Extract all text from a PDF using PyMuPDF
1212
import fitz
1313
import pytesseract
1414
from PIL import Image
1515
import io
1616
import argparse
1717
from pathlib import Path
1818
import sys
19-
import json
20-
from typing import List, Dict
21-
import camelot
2219

2320
Image.MAX_IMAGE_PIXELS = None
2421
fitz.TOOLS.mupdf_display_errors(False)
2522

2623

27-
def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
28-
"""Extract tables using camelot-py (fallback method).
29-
30-
Args:
31-
pdf_path: Path to the PDF file
32-
33-
Returns:
34-
List of dictionaries containing table data and metadata
35-
"""
36-
37-
tables_data = []
38-
39-
try:
40-
# Try stream method first with edge detection (better for tables without borders)
41-
tables = camelot.read_pdf(
42-
str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Tolerance for row detection # Tolerance for column detection
43-
)
44-
45-
# If still no tables, try lattice method (for bordered tables)
46-
if len(tables) == 0:
47-
tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40)
48-
49-
for idx, table in enumerate(tables, start=1):
50-
# Convert to list of lists
51-
table_cells = table.df.values.tolist()
52-
53-
# Add header row (pandas columns)
54-
header = table.df.columns.tolist()
55-
table_cells.insert(0, header)
56-
57-
# Skip tables with very few cells (likely detection errors)
58-
if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2:
59-
continue
60-
61-
table_info = {
62-
"table_id": f"Table_P{table.page}_T{idx}",
63-
"page_number": table.page,
64-
"table_index": idx,
65-
"bbox": {
66-
"x0": table._bbox[0] if hasattr(table, '_bbox') else 0,
67-
"y0": table._bbox[1] if hasattr(table, '_bbox') else 0,
68-
"x1": table._bbox[2] if hasattr(table, '_bbox') else 0,
69-
"y1": table._bbox[3] if hasattr(table, '_bbox') else 0,
70-
},
71-
"num_rows": len(table_cells),
72-
"num_cols": len(table_cells[0]) if table_cells else 0,
73-
"cells": table_cells,
74-
"accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0,
75-
"extraction_method": "camelot",
76-
}
77-
78-
tables_data.append(table_info)
79-
80-
except Exception as e:
81-
print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
82-
83-
return tables_data
84-
85-
86-
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
87-
"""Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
88-
89-
Args:
90-
pdf_path: Path to the PDF file
91-
92-
Returns:
93-
List of dictionaries containing table data and metadata
94-
"""
95-
tables_data = []
96-
97-
# Try PyMuPDF first
98-
try:
99-
with fitz.open(pdf_path) as doc:
100-
for page_num, page in enumerate(doc, start=1):
101-
tabs = page.find_tables()
102-
103-
if not tabs.tables:
104-
continue
105-
106-
for table_idx, tab in enumerate(tabs.tables, start=1):
107-
try:
108-
table_cells = tab.extract()
109-
110-
if not table_cells or len(table_cells) == 0:
111-
continue
112-
113-
bbox = tab.bbox
114-
115-
table_info = {
116-
"table_id": f"Table_P{page_num}_T{table_idx}",
117-
"page_number": page_num,
118-
"table_index": table_idx,
119-
"bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1},
120-
"num_rows": len(table_cells),
121-
"num_cols": len(table_cells[0]) if table_cells else 0,
122-
"cells": table_cells,
123-
"extraction_method": "pymupdf",
124-
}
125-
126-
tables_data.append(table_info)
127-
128-
except Exception as e:
129-
tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"})
130-
131-
except Exception as e:
132-
print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)
133-
134-
# If PyMuPDF found no tables, try camelot
135-
if len(tables_data) == 0:
136-
tables_data = extract_tables_with_camelot(pdf_path)
137-
138-
return tables_data
139-
140-
14124
def extract_text_from_pdf(pdf_path: str) -> str:
14225
text = []
26+
print(f"Extracting text from {pdf_path}.")
14327
try:
14428
with fitz.open(pdf_path) as doc:
14529
for page_num, page in enumerate(doc, start=1):
@@ -160,7 +44,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
16044
except Exception as e:
16145
print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
16246
return ""
163-
47+
# Join all pages into a single string separated by newlines
16448
return "\n".join(text)
16549

16650

@@ -184,46 +68,31 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
18468
return "\n".join(text)
18569

18670

71+
# Save extracted text to a file.
18772
def save_to_file(text: str, output_path: str):
188-
"""Save extracted text to a file."""
18973
try:
19074
with open(output_path, "w", encoding="utf-8") as f:
19175
f.write(text)
76+
print(f"[INFO] Text successfully saved to {output_path}")
19277
except Exception as e:
19378
print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr)
19479

19580

19681
def main():
197-
parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
82+
parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.")
19883
parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
199-
parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)")
200-
20184
args = parser.parse_args()
20285

20386
pdf_path = Path(args.pdf)
20487
if not pdf_path.exists():
20588
print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr)
20689
sys.exit(1)
20790

208-
# Extract text
91+
# Perform extraction
20992
text = extract_text_from_pdf(str(pdf_path))
21093

211-
# Extract tables
212-
tables_data = extract_tables_from_pdf(str(pdf_path))
213-
214-
# If tables were found, save JSON
215-
if tables_data:
216-
217-
# Save tables JSON
218-
output_dir = Path(args.output_dir)
219-
output_dir.mkdir(parents=True, exist_ok=True)
220-
221-
tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
222-
with open(tables_json_path, "w", encoding="utf-8") as f:
223-
json.dump(tables_data, f, indent=2)
94+
output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name
22495

225-
# Save combined text
226-
output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name
22796
save_to_file(text, str(output_path))
22897

22998

0 commit comments

Comments
 (0)