From 09e18b4ec0fb3746551ec1b6c456b3886df7811c Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 17:30:09 -0800 Subject: [PATCH 1/5] Added page number field to metrics --- src/llm/local_llm.py | 5 ++++- src/preprocessing/pdf_text_extraction.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index 981ab46..db51c5b 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -13,7 +13,7 @@ import json import sys from pathlib import Path -from typing import Optional +from typing import Optional, List from ollama import chat from pydantic import BaseModel, Field @@ -28,6 +28,7 @@ class PredatorDietMetrics(BaseModel): num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs") num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs") sample_size: Optional[int] = Field(None, description="Total number of predators surveyed") + source_pages: Optional[List[int]] = Field(None, description="Page numbers where the key data was found (species, location, date, stomach counts)") def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: @@ -52,6 +53,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - Look in tables, methods, and results sections - Empty stomachs: "empty", "vacant", "no prey" - Non-empty stomachs: "with prey", "fed", "containing food" +- Page markers appear as [PAGE N] in the text EXTRACT: - species_name: Scientific name of PRIMARY predator studied (not prey) @@ -60,6 +62,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - num_empty_stomachs: Number with empty stomachs - num_nonempty_stomachs: Number with food in stomachs - sample_size: Total number examined +- source_pages: List of page numbers where you found the key data (look for [PAGE N] markers) TEXT: diff --git 
a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index bbd9a6b..8354ee4 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -40,7 +40,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" @@ -61,7 +62,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return "" From 55b92ff57ff7e2a15cd450153f65d6dfdba6a51f Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 17:50:42 -0800 Subject: [PATCH 2/5] Revert "Merge branch 'feat/table-augment' into feat/llm-page-tracking" This reverts commit 63dd263ba0b898ec3d8d7288b8cb5c7f50ac8781, reversing changes made to 09e18b4ec0fb3746551ec1b6c456b3886df7811c. 
--- requirements.txt | 3 - src/preprocessing/pdf_text_extraction.py | 147 ++--------------------- 2 files changed, 8 insertions(+), 142 deletions(-) diff --git a/requirements.txt b/requirements.txt index 88e2f35..14496ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,4 @@ numpy google-api-python-client ollama pydantic -camelot-py[base] -opencv-python -pymupdf_layout xgboost \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index d3cbf12..8354ee4 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -6,9 +6,9 @@ This script uses PyMuPDF for accurate and efficient text extraction from scientific PDFs. It preserves reading order, handles multi-column text, and automatically applies OCR when a page contains only images (e.g., scanned documents). -Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback. """ +# Extract all text from a PDF using PyMuPDF import fitz import pytesseract from PIL import Image @@ -16,130 +16,14 @@ import argparse from pathlib import Path import sys -import json -from typing import List, Dict -import camelot Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) -def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: - """Extract tables using camelot-py (fallback method). 
- - Args: - pdf_path: Path to the PDF file - - Returns: - List of dictionaries containing table data and metadata - """ - - tables_data = [] - - try: - # Try stream method first with edge detection (better for tables without borders) - tables = camelot.read_pdf( - str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Ttolerance for row detection # Tolerance for column detection - ) - - # If still no tables, try lattice method (for bordered tables) - if len(tables) == 0: - tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40) - - for idx, table in enumerate(tables, start=1): - # Convert to list of lists - table_cells = table.df.values.tolist() - - # Add header row (pandas columns) - header = table.df.columns.tolist() - table_cells.insert(0, header) - - # Skip tables with very few cells (likely detection errors) - if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2: - continue - - table_info = { - "table_id": f"Table_P{table.page}_T{idx}", - "page_number": table.page, - "table_index": idx, - "bbox": { - "x0": table._bbox[0] if hasattr(table, '_bbox') else 0, - "y0": table._bbox[1] if hasattr(table, '_bbox') else 0, - "x1": table._bbox[2] if hasattr(table, '_bbox') else 0, - "y1": table._bbox[3] if hasattr(table, '_bbox') else 0, - }, - "num_rows": len(table_cells), - "num_cols": len(table_cells[0]) if table_cells else 0, - "cells": table_cells, - "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0, - "extraction_method": "camelot", - } - - tables_data.append(table_info) - - except Exception as e: - print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr) - - return tables_data - - -def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: - """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback. 
- - Args: - pdf_path: Path to the PDF file - - Returns: - List of dictionaries containing table data and metadata - """ - tables_data = [] - - # Try PyMuPDF first - try: - with fitz.open(pdf_path) as doc: - for page_num, page in enumerate(doc, start=1): - tabs = page.find_tables() - - if not tabs.tables: - continue - - for table_idx, tab in enumerate(tabs.tables, start=1): - try: - table_cells = tab.extract() - - if not table_cells or len(table_cells) == 0: - continue - - bbox = tab.bbox - - table_info = { - "table_id": f"Table_P{page_num}_T{table_idx}", - "page_number": page_num, - "table_index": table_idx, - "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1}, - "num_rows": len(table_cells), - "num_cols": len(table_cells[0]) if table_cells else 0, - "cells": table_cells, - "extraction_method": "pymupdf", - } - - tables_data.append(table_info) - - except Exception as e: - tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"}) - - except Exception as e: - print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr) - - # If PyMuPDF found no tables, try camelot - if len(tables_data) == 0: - tables_data = extract_tables_with_camelot(pdf_path) - - return tables_data - - def extract_text_from_pdf(pdf_path: str) -> str: text = [] + print(f"Extracting text from {pdf_path}.") try: with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, start=1): @@ -161,7 +45,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - + # Join all pages into a single string separated by newlines return "\n".join(text) @@ -186,20 +70,19 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: return "\n".join(text) +# Save extracted text to a file. 
def save_to_file(text: str, output_path: str): - """Save extracted text to a file.""" try: with open(output_path, "w", encoding="utf-8") as f: f.write(text) + print(f"[INFO] Text successfully saved to {output_path}") except Exception as e: print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr) def main(): - parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.") + parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.") parser.add_argument("pdf", type=str, help="Path to the input PDF file.") - parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)") - args = parser.parse_args() pdf_path = Path(args.pdf) @@ -207,25 +90,11 @@ def main(): print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr) sys.exit(1) - # Extract text + # Perform extraction text = extract_text_from_pdf(str(pdf_path)) - # Extract tables - tables_data = extract_tables_from_pdf(str(pdf_path)) - - # If tables were found, save JSON - if tables_data: - - # Save tables JSON - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - tables_json_path = output_dir / (pdf_path.stem + "_tables.json") - with open(tables_json_path, "w", encoding="utf-8") as f: - json.dump(tables_data, f, indent=2) + output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name - # Save combined text - output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name save_to_file(text, str(output_path)) From 03cafe8d5d9adaea03a3d4464701980c236560bb Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:17:56 -0800 Subject: [PATCH 3/5] Removed field, use regex instead --- src/llm/local_llm.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index db51c5b..9a253d0 100644 --- 
a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -12,6 +12,7 @@ import argparse import json import sys +import re from pathlib import Path from typing import Optional, List @@ -28,7 +29,6 @@ class PredatorDietMetrics(BaseModel): num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs") num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs") sample_size: Optional[int] = Field(None, description="Total number of predators surveyed") - source_pages: Optional[List[int]] = Field(None, description="Page numbers where the key data was found (species, location, date, stomach counts)") def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: @@ -62,7 +62,6 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - num_empty_stomachs: Number with empty stomachs - num_nonempty_stomachs: Number with food in stomachs - sample_size: Total number examined -- source_pages: List of page numbers where you found the key data (look for [PAGE N] markers) TEXT: @@ -141,6 +140,11 @@ def main(): # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) + + # Store original text for page extraction + original_text = text + print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr) + try: metrics = extract_metrics_from_text(text, model=args.model) except Exception as e: @@ -150,6 +154,19 @@ def main(): # Validate and calculate derived metrics metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) + + # Extract page numbers programmatically from where data was found + source_pages = set() + for field, value in metrics_dict.items(): + if value and field not in ['fraction_feeding']: + value_str = str(value) + if value_str in original_text: + pos = original_text.find(value_str) + page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos]) + if page_markers: + 
source_pages.add(int(page_markers[-1])) + + metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None # Prepare output result = {"source_file": text_path.name, "metrics": metrics_dict} From 99a4b3719f7b3e563af64f2cd09ca1c3d5497ac8 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:24:01 -0800 Subject: [PATCH 4/5] reformat --- src/llm/local_llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index 9a253d0..8abc543 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -140,7 +140,7 @@ def main(): # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) - + # Store original text for page extraction original_text = text print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr) @@ -154,7 +154,7 @@ def main(): # Validate and calculate derived metrics metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) - + # Extract page numbers programmatically from where data was found source_pages = set() for field, value in metrics_dict.items(): @@ -165,7 +165,7 @@ def main(): page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos]) if page_markers: source_pages.add(int(page_markers[-1])) - + metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None # Prepare output From 801c701824cba26bea299c6f902beadb5ff093df Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:25:32 -0800 Subject: [PATCH 5/5] Reapply "Merge branch 'feat/table-augment' into feat/llm-page-tracking" This reverts commit 55b92ff57ff7e2a15cd450153f65d6dfdba6a51f. 
--- requirements.txt | 3 + src/preprocessing/pdf_text_extraction.py | 147 +++++++++++++++++++++-- 2 files changed, 142 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 14496ae..88e2f35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,7 @@ numpy google-api-python-client ollama pydantic +camelot-py[base] +opencv-python +pymupdf_layout xgboost \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 8354ee4..d3cbf12 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -6,9 +6,9 @@ This script uses PyMuPDF for accurate and efficient text extraction from scientific PDFs. It preserves reading order, handles multi-column text, and automatically applies OCR when a page contains only images (e.g., scanned documents). +Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback. """ -# Extract all text from a PDF using PyMuPDF import fitz import pytesseract from PIL import Image @@ -16,14 +16,130 @@ import argparse from pathlib import Path import sys +import json +from typing import List, Dict +import camelot Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) +def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: + """Extract tables using camelot-py (fallback method). 
+ + Args: + pdf_path: Path to the PDF file + + Returns: + List of dictionaries containing table data and metadata + """ + + tables_data = [] + + try: + # Try stream method first with edge detection (better for tables without borders) + tables = camelot.read_pdf( + str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Tolerance for row detection # Tolerance for column detection + ) + + # If still no tables, try lattice method (for bordered tables) + if len(tables) == 0: + tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40) + + for idx, table in enumerate(tables, start=1): + # Convert to list of lists + table_cells = table.df.values.tolist() + + # Add header row (pandas columns) + header = table.df.columns.tolist() + table_cells.insert(0, header) + + # Skip tables with very few cells (likely detection errors) + if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2: + continue + + table_info = { + "table_id": f"Table_P{table.page}_T{idx}", + "page_number": table.page, + "table_index": idx, + "bbox": { + "x0": table._bbox[0] if hasattr(table, '_bbox') else 0, + "y0": table._bbox[1] if hasattr(table, '_bbox') else 0, + "x1": table._bbox[2] if hasattr(table, '_bbox') else 0, + "y1": table._bbox[3] if hasattr(table, '_bbox') else 0, + }, + "num_rows": len(table_cells), + "num_cols": len(table_cells[0]) if table_cells else 0, + "cells": table_cells, + "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0, + "extraction_method": "camelot", + } + + tables_data.append(table_info) + + except Exception as e: + print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr) + + return tables_data + + +def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: + """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
+ + Args: + pdf_path: Path to the PDF file + + Returns: + List of dictionaries containing table data and metadata + """ + tables_data = [] + + # Try PyMuPDF first + try: + with fitz.open(pdf_path) as doc: + for page_num, page in enumerate(doc, start=1): + tabs = page.find_tables() + + if not tabs.tables: + continue + + for table_idx, tab in enumerate(tabs.tables, start=1): + try: + table_cells = tab.extract() + + if not table_cells or len(table_cells) == 0: + continue + + bbox = tab.bbox + + table_info = { + "table_id": f"Table_P{page_num}_T{table_idx}", + "page_number": page_num, + "table_index": table_idx, + "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1}, + "num_rows": len(table_cells), + "num_cols": len(table_cells[0]) if table_cells else 0, + "cells": table_cells, + "extraction_method": "pymupdf", + } + + tables_data.append(table_info) + + except Exception as e: + tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"}) + + except Exception as e: + print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr) + + # If PyMuPDF found no tables, try camelot + if len(tables_data) == 0: + tables_data = extract_tables_with_camelot(pdf_path) + + return tables_data + + def extract_text_from_pdf(pdf_path: str) -> str: text = [] - print(f"Extracting text from {pdf_path}.") try: with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, start=1): @@ -45,7 +161,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - # Join all pages into a single string separated by newlines + return "\n".join(text) @@ -70,19 +186,20 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: return "\n".join(text) -# Save extracted text to a file. 
def save_to_file(text: str, output_path: str): + """Save extracted text to a file.""" try: with open(output_path, "w", encoding="utf-8") as f: f.write(text) - print(f"[INFO] Text successfully saved to {output_path}") except Exception as e: print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr) def main(): - parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.") + parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.") parser.add_argument("pdf", type=str, help="Path to the input PDF file.") + parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)") + args = parser.parse_args() pdf_path = Path(args.pdf) @@ -90,11 +207,25 @@ def main(): print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr) sys.exit(1) - # Perform extraction + # Extract text text = extract_text_from_pdf(str(pdf_path)) - output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name + # Extract tables + tables_data = extract_tables_from_pdf(str(pdf_path)) + + # If tables were found, save JSON + if tables_data: + + # Save tables JSON + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + tables_json_path = output_dir / (pdf_path.stem + "_tables.json") + with open(tables_json_path, "w", encoding="utf-8") as f: + json.dump(tables_data, f, indent=2) + # Save combined text + output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name save_to_file(text, str(output_path))