From 79cf34497318d4014e62f8506ad72d7a15288809 Mon Sep 17 00:00:00 2001
From: Sean Clayton
Date: Sun, 25 Jan 2026 16:15:34 -0800
Subject: [PATCH] Revert "WIP: PDF Processing Improvements - Table Extraction"

---
 requirements.txt                         |   3 -
 src/preprocessing/pdf_text_extraction.py | 147 ++---------------------
 2 files changed, 8 insertions(+), 142 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 88e2f35..14496ae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,4 @@ numpy
 google-api-python-client
 ollama
 pydantic
-camelot-py[base]
-opencv-python
-pymupdf_layout
 xgboost
\ No newline at end of file
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index a51ee2c..bbd9a6b 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -6,9 +6,9 @@
 This script uses PyMuPDF for accurate and efficient text extraction from
 scientific PDFs. It preserves reading order, handles multi-column text, and
 automatically applies OCR when a page contains only images (e.g., scanned documents).
-Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback.
 """
 
+# Extract all text from a PDF using PyMuPDF
 import fitz
 import pytesseract
 from PIL import Image
@@ -16,130 +16,14 @@ import argparse
 from pathlib import Path
 import sys
-import json
-from typing import List, Dict
-import camelot
 
 Image.MAX_IMAGE_PIXELS = None
 
 fitz.TOOLS.mupdf_display_errors(False)
 
 
-def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
-    """Extract tables using camelot-py (fallback method).
-
-    Args:
-        pdf_path: Path to the PDF file
-
-    Returns:
-        List of dictionaries containing table data and metadata
-    """
-
-    tables_data = []
-
-    try:
-        # Try stream method first with edge detection (better for tables without borders)
-        tables = camelot.read_pdf(
-            str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Ttolerance for row detection # Tolerance for column detection
-        )
-
-        # If still no tables, try lattice method (for bordered tables)
-        if len(tables) == 0:
-            tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40)
-
-        for idx, table in enumerate(tables, start=1):
-            # Convert to list of lists
-            table_cells = table.df.values.tolist()
-
-            # Add header row (pandas columns)
-            header = table.df.columns.tolist()
-            table_cells.insert(0, header)
-
-            # Skip tables with very few cells (likely detection errors)
-            if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2:
-                continue
-
-            table_info = {
-                "table_id": f"Table_P{table.page}_T{idx}",
-                "page_number": table.page,
-                "table_index": idx,
-                "bbox": {
-                    "x0": table._bbox[0] if hasattr(table, '_bbox') else 0,
-                    "y0": table._bbox[1] if hasattr(table, '_bbox') else 0,
-                    "x1": table._bbox[2] if hasattr(table, '_bbox') else 0,
-                    "y1": table._bbox[3] if hasattr(table, '_bbox') else 0,
-                },
-                "num_rows": len(table_cells),
-                "num_cols": len(table_cells[0]) if table_cells else 0,
-                "cells": table_cells,
-                "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0,
-                "extraction_method": "camelot",
-            }
-
-            tables_data.append(table_info)
-
-    except Exception as e:
-        print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
-
-    return tables_data
-
-
-def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
-    """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
-
-    Args:
-        pdf_path: Path to the PDF file
-
-    Returns:
-        List of dictionaries containing table data and metadata
-    """
-    tables_data = []
-
-    # Try PyMuPDF first
-    try:
-        with fitz.open(pdf_path) as doc:
-            for page_num, page in enumerate(doc, start=1):
-                tabs = page.find_tables()
-
-                if not tabs.tables:
-                    continue
-
-                for table_idx, tab in enumerate(tabs.tables, start=1):
-                    try:
-                        table_cells = tab.extract()
-
-                        if not table_cells or len(table_cells) == 0:
-                            continue
-
-                        bbox = tab.bbox
-
-                        table_info = {
-                            "table_id": f"Table_P{page_num}_T{table_idx}",
-                            "page_number": page_num,
-                            "table_index": table_idx,
-                            "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1},
-                            "num_rows": len(table_cells),
-                            "num_cols": len(table_cells[0]) if table_cells else 0,
-                            "cells": table_cells,
-                            "extraction_method": "pymupdf",
-                        }
-
-                        tables_data.append(table_info)
-
-                    except Exception as e:
-                        tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"})
-
-    except Exception as e:
-        print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)
-
-    # If PyMuPDF found no tables, try camelot
-    if len(tables_data) == 0:
-        tables_data = extract_tables_with_camelot(pdf_path)
-
-    return tables_data
-
-
 def extract_text_from_pdf(pdf_path: str) -> str:
     text = []
+    print(f"Extracting text from {pdf_path}.")
     try:
         with fitz.open(pdf_path) as doc:
             for page_num, page in enumerate(doc, start=1):
@@ -160,7 +44,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
-
+    # Join all pages into a single string separated by newlines
     return "\n".join(text)
 
 
@@ -184,20 +68,19 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     return "\n".join(text)
 
 
+# Save extracted text to a file.
 def save_to_file(text: str, output_path: str):
-    """Save extracted text to a file."""
    try:
         with open(output_path, "w", encoding="utf-8") as f:
             f.write(text)
+        print(f"[INFO] Text successfully saved to {output_path}")
     except Exception as e:
         print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr)
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
+    parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.")
     parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
-    parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)")
-
     args = parser.parse_args()
 
     pdf_path = Path(args.pdf)
@@ -205,25 +88,11 @@ def main():
         print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr)
         sys.exit(1)
 
-    # Extract text
+    # Perform extraction
     text = extract_text_from_pdf(str(pdf_path))
 
-    # Extract tables
-    tables_data = extract_tables_from_pdf(str(pdf_path))
-
-    # If tables were found, save JSON
-    if tables_data:
-
-        # Save tables JSON
-        output_dir = Path(args.output_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
-        with open(tables_json_path, "w", encoding="utf-8") as f:
-            json.dump(tables_data, f, indent=2)
+    output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name
 
-    # Save combined text
-    output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name
     save_to_file(text, str(output_path))
 
 