From 916de8fbf2e9e32caa0600133cf8c41a151581f9 Mon Sep 17 00:00:00 2001
From: SeanClay10
Date: Sun, 25 Jan 2026 15:39:58 -0800
Subject: [PATCH 1/2] Feat: PDF processing improvements - structured tables

---
 requirements.txt                         |   3 +
 src/preprocessing/pdf_text_extraction.py | 176 +++++++++++++++++++++--
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 14496ae..88e2f35 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,7 @@ numpy
 google-api-python-client
 ollama
 pydantic
+camelot-py[base]
+opencv-python
+pymupdf_layout
 xgboost
\ No newline at end of file
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index bbd9a6b..3e339dd 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -6,9 +6,9 @@
 This script uses PyMuPDF for accurate and efficient text extraction from
 scientific PDFs. It preserves reading order, handles multi-column text, and
 automatically applies OCR when a page contains only images (e.g., scanned documents).
+Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback.
 """
 
-# Extract all text from a PDF using PyMuPDF
 import fitz
 import pytesseract
 from PIL import Image
@@ -16,14 +16,150 @@
 import argparse
 from pathlib import Path
 import sys
+import json
+from typing import List, Dict
+import camelot
 
 Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
 
+def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
+    """Extract tables using camelot-py (fallback method).
+
+    Args:
+        pdf_path: Path to the PDF file
+
+    Returns:
+        List of dictionaries containing table data and metadata
+    """
+
+    tables_data = []
+
+    try:
+        # Try stream method first with edge detection (better for tables without borders)
+        tables = camelot.read_pdf(
+            str(pdf_path),
+            pages='all',
+            flavor='stream',
+            edge_tol=500,  # Tolerance for detecting table edges
+            row_tol=10,  # Tolerance for row detection
+            column_tol=5  # Tolerance for column detection
+        )
+
+        # If still no tables, try lattice method (for bordered tables)
+        if len(tables) == 0:
+            tables = camelot.read_pdf(
+                str(pdf_path),
+                pages='all',
+                flavor='lattice',
+                line_scale=40
+            )
+
+        for idx, table in enumerate(tables, start=1):
+            # Convert to list of lists
+            table_cells = table.df.values.tolist()
+
+            # Add header row (pandas columns)
+            header = table.df.columns.tolist()
+            table_cells.insert(0, header)
+
+            # Skip tables with very few cells (likely detection errors)
+            if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2:
+                continue
+
+            table_info = {
+                "table_id": f"Table_P{table.page}_T{idx}",
+                "page_number": table.page,
+                "table_index": idx,
+                "bbox": {
+                    "x0": table._bbox[0] if hasattr(table, '_bbox') else 0,
+                    "y0": table._bbox[1] if hasattr(table, '_bbox') else 0,
+                    "x1": table._bbox[2] if hasattr(table, '_bbox') else 0,
+                    "y1": table._bbox[3] if hasattr(table, '_bbox') else 0
+                },
+                "num_rows": len(table_cells),
+                "num_cols": len(table_cells[0]) if table_cells else 0,
+                "cells": table_cells,
+                "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0,
+                "extraction_method": "camelot"
+            }
+
+            tables_data.append(table_info)
+
+    except Exception as e:
+        print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
+
+    return tables_data
+
+
+def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
+    """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
+
+    Args:
+        pdf_path: Path to the PDF file
+
+    Returns:
+        List of dictionaries containing table data and metadata
+    """
+    tables_data = []
+
+    # Try PyMuPDF first
+    try:
+        with fitz.open(pdf_path) as doc:
+            for page_num, page in enumerate(doc, start=1):
+                tabs = page.find_tables()
+
+                if not tabs.tables:
+                    continue
+
+                for table_idx, tab in enumerate(tabs.tables, start=1):
+                    try:
+                        table_cells = tab.extract()
+
+                        if not table_cells or len(table_cells) == 0:
+                            continue
+
+                        bbox = tab.bbox
+
+                        table_info = {
+                            "table_id": f"Table_P{page_num}_T{table_idx}",
+                            "page_number": page_num,
+                            "table_index": table_idx,
+                            "bbox": {
+                                "x0": bbox.x0,
+                                "y0": bbox.y0,
+                                "x1": bbox.x1,
+                                "y1": bbox.y1
+                            },
+                            "num_rows": len(table_cells),
+                            "num_cols": len(table_cells[0]) if table_cells else 0,
+                            "cells": table_cells,
+                            "extraction_method": "pymupdf"
+                        }
+
+                        tables_data.append(table_info)
+
+                    except Exception as e:
+                        tables_data.append({
+                            "table_id": f"Table_P{page_num}_T{table_idx}",
+                            "page_number": page_num,
+                            "table_index": table_idx,
+                            "error": str(e),
+                            "extraction_method": "pymupdf"
+                        })
+
+    except Exception as e:
+        print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)
+
+    # If PyMuPDF found no tables, try camelot
+    if len(tables_data) == 0:
+        tables_data = extract_tables_with_camelot(pdf_path)
+
+    return tables_data
+
 
 def extract_text_from_pdf(pdf_path: str) -> str:
     text = []
-    print(f"Extracting text from {pdf_path}.")
     try:
         with fitz.open(pdf_path) as doc:
             for page_num, page in enumerate(doc, start=1):
@@ -44,7 +44,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
-    # Join all pages into a single string separated by newlines
+
     return "\n".join(text)
 
 
@@ -68,19 +68,23 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     return "\n".join(text)
 
 
-# Save extracted text to a file.
 def save_to_file(text: str, output_path: str):
+    """Save extracted text to a file."""
     try:
         with open(output_path, "w", encoding="utf-8") as f:
             f.write(text)
-        print(f"[INFO] Text successfully saved to {output_path}")
     except Exception as e:
         print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr)
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.")
+    parser = argparse.ArgumentParser(
+        description="Extract text and tables from PDF using PyMuPDF and camelot-py."
+    )
     parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
+    parser.add_argument("--output-dir", type=str, default="data/processed-text",
+                        help="Output directory for extracted text (default: data/processed-text)")
+
     args = parser.parse_args()
 
     pdf_path = Path(args.pdf)
@@ -88,13 +88,27 @@ def main():
         print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr)
         sys.exit(1)
 
-    # Perform extraction
+    # Extract text
     text = extract_text_from_pdf(str(pdf_path))
-
-    output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name
-
+
+    # Extract tables
+    tables_data = extract_tables_from_pdf(str(pdf_path))
+
+    # If tables were found, save JSON
+    if tables_data:
+
+        # Save tables JSON
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
+        with open(tables_json_path, "w", encoding="utf-8") as f:
+            json.dump(tables_data, f, indent=2)
+
+    # Save combined text
+    output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name
    save_to_file(text, str(output_path))
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
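
For reference, a minimal sketch of how the per-table JSON written by patch 1 might be consumed downstream. The record fields (table_id, page_number, num_rows, num_cols, cells, extraction_method, and error for failed extractions) mirror the table_info dictionaries built above; the input name sample.pdf and the default data/processed-text output directory are hypothetical:

    import json
    from pathlib import Path

    # Hypothetical output of: python src/preprocessing/pdf_text_extraction.py sample.pdf
    tables_path = Path("data/processed-text") / "sample_tables.json"

    with open(tables_path, encoding="utf-8") as f:
        records = json.load(f)

    for rec in records:
        if "error" in rec:
            # Failure records carry only metadata plus the error message.
            print(f"{rec['table_id']}: extraction failed: {rec['error']}")
            continue
        header, *rows = rec["cells"]  # first row is the header (camelot) or the top table row (PyMuPDF)
        print(f"{rec['table_id']} (page {rec['page_number']}, "
              f"{rec['num_rows']}x{rec['num_cols']}, via {rec['extraction_method']})")
        print("  header:", header)
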
From bb89baebcf24a84598ed3dfed6a00d7b25ae32d1 Mon Sep 17 00:00:00 2001
From: SeanClay10
Date: Sun, 25 Jan 2026 15:46:34 -0800
Subject: [PATCH 2/2] Fix: Formatting issues.

---
 src/preprocessing/pdf_text_extraction.py | 107 +++++++++-------------
 1 file changed, 42 insertions(+), 65 deletions(-)

diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index 3e339dd..a51ee2c 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -23,50 +23,41 @@ Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
 
+
 def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
     """Extract tables using camelot-py (fallback method).
-
+
     Args:
         pdf_path: Path to the PDF file
-
+
     Returns:
         List of dictionaries containing table data and metadata
     """
-
+
     tables_data = []
-
+
     try:
         # Try stream method first with edge detection (better for tables without borders)
         tables = camelot.read_pdf(
-            str(pdf_path),
-            pages='all',
-            flavor='stream',
-            edge_tol=500,  # Tolerance for detecting table edges
-            row_tol=10,  # Tolerance for row detection
-            column_tol=5  # Tolerance for column detection
+            str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5  # Tolerance for detecting table edges  # Tolerance for row detection  # Tolerance for column detection
         )
-
+
         # If still no tables, try lattice method (for bordered tables)
         if len(tables) == 0:
-            tables = camelot.read_pdf(
-                str(pdf_path),
-                pages='all',
-                flavor='lattice',
-                line_scale=40
-            )
-
+            tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40)
+
         for idx, table in enumerate(tables, start=1):
             # Convert to list of lists
             table_cells = table.df.values.tolist()
-
+
             # Add header row (pandas columns)
             header = table.df.columns.tolist()
             table_cells.insert(0, header)
-
+
             # Skip tables with very few cells (likely detection errors)
             if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2:
                 continue
-
+
             table_info = {
                 "table_id": f"Table_P{table.page}_T{idx}",
                 "page_number": table.page,
@@ -75,86 +66,75 @@ def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
                     "x0": table._bbox[0] if hasattr(table, '_bbox') else 0,
                     "y0": table._bbox[1] if hasattr(table, '_bbox') else 0,
                     "x1": table._bbox[2] if hasattr(table, '_bbox') else 0,
-                    "y1": table._bbox[3] if hasattr(table, '_bbox') else 0
+                    "y1": table._bbox[3] if hasattr(table, '_bbox') else 0,
                 },
                 "num_rows": len(table_cells),
                 "num_cols": len(table_cells[0]) if table_cells else 0,
                 "cells": table_cells,
                 "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0,
-                "extraction_method": "camelot"
+                "extraction_method": "camelot",
             }
-
+
             tables_data.append(table_info)
-
+
     except Exception as e:
         print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr)
-
+
     return tables_data
 
 
 def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
     """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
-
+
     Args:
         pdf_path: Path to the PDF file
-
+
     Returns:
         List of dictionaries containing table data and metadata
     """
     tables_data = []
-
+
     # Try PyMuPDF first
     try:
         with fitz.open(pdf_path) as doc:
             for page_num, page in enumerate(doc, start=1):
                 tabs = page.find_tables()
-
+
                 if not tabs.tables:
                     continue
-
+
                 for table_idx, tab in enumerate(tabs.tables, start=1):
                     try:
                         table_cells = tab.extract()
-
+
                         if not table_cells or len(table_cells) == 0:
                             continue
-
+
                         bbox = tab.bbox
-
+
                         table_info = {
                             "table_id": f"Table_P{page_num}_T{table_idx}",
                             "page_number": page_num,
                             "table_index": table_idx,
-                            "bbox": {
-                                "x0": bbox.x0,
-                                "y0": bbox.y0,
-                                "x1": bbox.x1,
-                                "y1": bbox.y1
-                            },
+                            "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1},
                             "num_rows": len(table_cells),
                             "num_cols": len(table_cells[0]) if table_cells else 0,
                             "cells": table_cells,
-                            "extraction_method": "pymupdf"
+                            "extraction_method": "pymupdf",
                         }
-
+
                         tables_data.append(table_info)
-
+
                     except Exception as e:
-                        tables_data.append({
-                            "table_id": f"Table_P{page_num}_T{table_idx}",
-                            "page_number": page_num,
-                            "table_index": table_idx,
-                            "error": str(e),
-                            "extraction_method": "pymupdf"
-                        })
-
+                        tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"})
+
     except Exception as e:
         print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr)
-
+
     # If PyMuPDF found no tables, try camelot
     if len(tables_data) == 0:
         tables_data = extract_tables_with_camelot(pdf_path)
-
+
     return tables_data
 
 
@@ -180,7 +160,7 @@ def extract_text_from_pdf(pdf_path: str) -> str:
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
-
+
     return "\n".join(text)
 
 
@@ -214,13 +194,10 @@ def main():
-    parser = argparse.ArgumentParser(
-        description="Extract text and tables from PDF using PyMuPDF and camelot-py."
-    )
+    parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.")
     parser.add_argument("pdf", type=str, help="Path to the input PDF file.")
-    parser.add_argument("--output-dir", type=str, default="data/processed-text",
-                        help="Output directory for extracted text (default: data/processed-text)")
-
+    parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)")
+
     args = parser.parse_args()
 
     pdf_path = Path(args.pdf)
@@ -230,17 +207,17 @@ def main():
         print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr)
         sys.exit(1)
 
     # Extract text
     text = extract_text_from_pdf(str(pdf_path))
-
+
     # Extract tables
     tables_data = extract_tables_from_pdf(str(pdf_path))
-
+
     # If tables were found, save JSON
     if tables_data:
-
+
         # Save tables JSON
         output_dir = Path(args.output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
-
+
         tables_json_path = output_dir / (pdf_path.stem + "_tables.json")
         with open(tables_json_path, "w", encoding="utf-8") as f:
             json.dump(tables_data, f, indent=2)
@@ -251,4 +228,4 @@
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
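
After both patches, a minimal usage sketch; it assumes the repository root is on sys.path and that src.preprocessing resolves as a package (packaging is not shown in these patches), with paper.pdf as a hypothetical input file:

    from src.preprocessing.pdf_text_extraction import (
        extract_tables_from_pdf,
        extract_text_from_pdf,
    )

    pdf = "paper.pdf"  # hypothetical input file
    text = extract_text_from_pdf(pdf)
    tables = extract_tables_from_pdf(pdf)
    print(f"{len(text)} characters of text, {len(tables)} table record(s)")

Run as a script, python src/preprocessing/pdf_text_extraction.py paper.pdf --output-dir data/processed-text writes paper.txt to the output directory, plus paper_tables.json whenever at least one table is detected.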