From 09e18b4ec0fb3746551ec1b6c456b3886df7811c Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 17:30:09 -0800 Subject: [PATCH 1/5] Added page number field to metrics --- src/llm/local_llm.py | 5 ++++- src/preprocessing/pdf_text_extraction.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index 981ab46..db51c5b 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -13,7 +13,7 @@ import json import sys from pathlib import Path -from typing import Optional +from typing import Optional, List from ollama import chat from pydantic import BaseModel, Field @@ -28,6 +28,7 @@ class PredatorDietMetrics(BaseModel): num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs") num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs") sample_size: Optional[int] = Field(None, description="Total number of predators surveyed") + source_pages: Optional[List[int]] = Field(None, description="Page numbers where the key data was found (species, location, date, stomach counts)") def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: @@ -52,6 +53,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - Look in tables, methods, and results sections - Empty stomachs: "empty", "vacant", "no prey" - Non-empty stomachs: "with prey", "fed", "containing food" +- Page markers appear as [PAGE N] in the text EXTRACT: - species_name: Scientific name of PRIMARY predator studied (not prey) @@ -60,6 +62,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - num_empty_stomachs: Number with empty stomachs - num_nonempty_stomachs: Number with food in stomachs - sample_size: Total number examined +- source_pages: List of page numbers where you found the key data (look for [PAGE N] markers) TEXT: diff --git 
a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index bbd9a6b..8354ee4 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -40,7 +40,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" @@ -61,7 +62,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return "" From 55b92ff57ff7e2a15cd450153f65d6dfdba6a51f Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 17:50:42 -0800 Subject: [PATCH 2/5] Revert "Merge branch 'feat/table-augment' into feat/llm-page-tracking" This reverts commit 63dd263ba0b898ec3d8d7288b8cb5c7f50ac8781, reversing changes made to 09e18b4ec0fb3746551ec1b6c456b3886df7811c. 
--- requirements.txt | 3 - src/preprocessing/pdf_text_extraction.py | 147 ++--------------------- 2 files changed, 8 insertions(+), 142 deletions(-) diff --git a/requirements.txt b/requirements.txt index 88e2f35..14496ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,4 @@ numpy google-api-python-client ollama pydantic -camelot-py[base] -opencv-python -pymupdf_layout xgboost \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index d3cbf12..8354ee4 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -6,9 +6,9 @@ This script uses PyMuPDF for accurate and efficient text extraction from scientific PDFs. It preserves reading order, handles multi-column text, and automatically applies OCR when a page contains only images (e.g., scanned documents). -Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback. """ +# Extract all text from a PDF using PyMuPDF import fitz import pytesseract from PIL import Image @@ -16,130 +16,14 @@ import argparse from pathlib import Path import sys -import json -from typing import List, Dict -import camelot Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) -def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: - """Extract tables using camelot-py (fallback method). 
- - Args: - pdf_path: Path to the PDF file - - Returns: - List of dictionaries containing table data and metadata - """ - - tables_data = [] - - try: - # Try stream method first with edge detection (better for tables without borders) - tables = camelot.read_pdf( - str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Ttolerance for row detection # Tolerance for column detection - ) - - # If still no tables, try lattice method (for bordered tables) - if len(tables) == 0: - tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40) - - for idx, table in enumerate(tables, start=1): - # Convert to list of lists - table_cells = table.df.values.tolist() - - # Add header row (pandas columns) - header = table.df.columns.tolist() - table_cells.insert(0, header) - - # Skip tables with very few cells (likely detection errors) - if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2: - continue - - table_info = { - "table_id": f"Table_P{table.page}_T{idx}", - "page_number": table.page, - "table_index": idx, - "bbox": { - "x0": table._bbox[0] if hasattr(table, '_bbox') else 0, - "y0": table._bbox[1] if hasattr(table, '_bbox') else 0, - "x1": table._bbox[2] if hasattr(table, '_bbox') else 0, - "y1": table._bbox[3] if hasattr(table, '_bbox') else 0, - }, - "num_rows": len(table_cells), - "num_cols": len(table_cells[0]) if table_cells else 0, - "cells": table_cells, - "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0, - "extraction_method": "camelot", - } - - tables_data.append(table_info) - - except Exception as e: - print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr) - - return tables_data - - -def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: - """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback. 
- - Args: - pdf_path: Path to the PDF file - - Returns: - List of dictionaries containing table data and metadata - """ - tables_data = [] - - # Try PyMuPDF first - try: - with fitz.open(pdf_path) as doc: - for page_num, page in enumerate(doc, start=1): - tabs = page.find_tables() - - if not tabs.tables: - continue - - for table_idx, tab in enumerate(tabs.tables, start=1): - try: - table_cells = tab.extract() - - if not table_cells or len(table_cells) == 0: - continue - - bbox = tab.bbox - - table_info = { - "table_id": f"Table_P{page_num}_T{table_idx}", - "page_number": page_num, - "table_index": table_idx, - "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1}, - "num_rows": len(table_cells), - "num_cols": len(table_cells[0]) if table_cells else 0, - "cells": table_cells, - "extraction_method": "pymupdf", - } - - tables_data.append(table_info) - - except Exception as e: - tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"}) - - except Exception as e: - print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr) - - # If PyMuPDF found no tables, try camelot - if len(tables_data) == 0: - tables_data = extract_tables_with_camelot(pdf_path) - - return tables_data - - def extract_text_from_pdf(pdf_path: str) -> str: text = [] + print(f"Extracting text from {pdf_path}.") try: with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, start=1): @@ -161,7 +45,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - + # Join all pages into a single string separated by newlines return "\n".join(text) @@ -186,20 +70,19 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: return "\n".join(text) +# Save extracted text to a file. 
def save_to_file(text: str, output_path: str): - """Save extracted text to a file.""" try: with open(output_path, "w", encoding="utf-8") as f: f.write(text) + print(f"[INFO] Text successfully saved to {output_path}") except Exception as e: print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr) def main(): - parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.") + parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.") parser.add_argument("pdf", type=str, help="Path to the input PDF file.") - parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)") - args = parser.parse_args() pdf_path = Path(args.pdf) @@ -207,25 +90,11 @@ def main(): print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr) sys.exit(1) - # Extract text + # Perform extraction text = extract_text_from_pdf(str(pdf_path)) - # Extract tables - tables_data = extract_tables_from_pdf(str(pdf_path)) - - # If tables were found, save JSON - if tables_data: - - # Save tables JSON - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - tables_json_path = output_dir / (pdf_path.stem + "_tables.json") - with open(tables_json_path, "w", encoding="utf-8") as f: - json.dump(tables_data, f, indent=2) + output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name - # Save combined text - output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name save_to_file(text, str(output_path)) From 03cafe8d5d9adaea03a3d4464701980c236560bb Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:17:56 -0800 Subject: [PATCH 3/5] Removed field, use regex instead --- src/llm/local_llm.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index db51c5b..9a253d0 100644 --- 
a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -12,6 +12,7 @@ import argparse import json import sys +import re from pathlib import Path from typing import Optional, List @@ -28,7 +29,6 @@ class PredatorDietMetrics(BaseModel): num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs") num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs") sample_size: Optional[int] = Field(None, description="Total number of predators surveyed") - source_pages: Optional[List[int]] = Field(None, description="Page numbers where the key data was found (species, location, date, stomach counts)") def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: @@ -62,7 +62,6 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - num_empty_stomachs: Number with empty stomachs - num_nonempty_stomachs: Number with food in stomachs - sample_size: Total number examined -- source_pages: List of page numbers where you found the key data (look for [PAGE N] markers) TEXT: @@ -141,6 +140,11 @@ def main(): # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) + + # Store original text for page extraction + original_text = text + print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr) + try: metrics = extract_metrics_from_text(text, model=args.model) except Exception as e: @@ -150,6 +154,19 @@ def main(): # Validate and calculate derived metrics metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) + + # Extract page numbers programmatically from where data was found + source_pages = set() + for field, value in metrics_dict.items(): + if value and field not in ['fraction_feeding']: + value_str = str(value) + if value_str in original_text: + pos = original_text.find(value_str) + page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos]) + if page_markers: + 
source_pages.add(int(page_markers[-1])) + + metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None # Prepare output result = {"source_file": text_path.name, "metrics": metrics_dict} From 99a4b3719f7b3e563af64f2cd09ca1c3d5497ac8 Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:24:01 -0800 Subject: [PATCH 4/5] reformat --- src/llm/local_llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index 9a253d0..8abc543 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -140,7 +140,7 @@ def main(): # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) - + # Store original text for page extraction original_text = text print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr) @@ -154,7 +154,7 @@ def main(): # Validate and calculate derived metrics metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) - + # Extract page numbers programmatically from where data was found source_pages = set() for field, value in metrics_dict.items(): @@ -165,7 +165,7 @@ def main(): page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos]) if page_markers: source_pages.add(int(page_markers[-1])) - + metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None # Prepare output From 801c701824cba26bea299c6f902beadb5ff093df Mon Sep 17 00:00:00 2001 From: Raymond Cen Date: Sun, 25 Jan 2026 20:25:32 -0800 Subject: [PATCH 5/5] Reapply "Merge branch 'feat/table-augment' into feat/llm-page-tracking" This reverts commit 55b92ff57ff7e2a15cd450153f65d6dfdba6a51f. 
--- requirements.txt | 3 + src/preprocessing/pdf_text_extraction.py | 147 +++++++++++++++++++++-- 2 files changed, 142 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 14496ae..88e2f35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,7 @@ numpy google-api-python-client ollama pydantic +camelot-py[base] +opencv-python +pymupdf_layout xgboost \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 8354ee4..d3cbf12 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -6,9 +6,9 @@ This script uses PyMuPDF for accurate and efficient text extraction from scientific PDFs. It preserves reading order, handles multi-column text, and automatically applies OCR when a page contains only images (e.g., scanned documents). +Tables are automatically detected and extracted using PyMuPDF first, with camelot-py as fallback. """ -# Extract all text from a PDF using PyMuPDF import fitz import pytesseract from PIL import Image @@ -16,14 +16,130 @@ import argparse from pathlib import Path import sys +import json +from typing import List, Dict +import camelot Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) +def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: + """Extract tables using camelot-py (fallback method). 
+ + Args: + pdf_path: Path to the PDF file + + Returns: + List of dictionaries containing table data and metadata + """ + + tables_data = [] + + try: + # Try stream method first with edge detection (better for tables without borders) + tables = camelot.read_pdf( + str(pdf_path), pages='all', flavor='stream', edge_tol=500, row_tol=10, column_tol=5 # Tolerance for detecting table edges # Tolerance for row detection # Tolerance for column detection + ) + + # If still no tables, try lattice method (for bordered tables) + if len(tables) == 0: + tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice', line_scale=40) + + for idx, table in enumerate(tables, start=1): + # Convert to list of lists + table_cells = table.df.values.tolist() + + # Add header row (pandas columns) + header = table.df.columns.tolist() + table_cells.insert(0, header) + + # Skip tables with very few cells (likely detection errors) + if len(table_cells) < 3 or (len(table_cells[0]) if table_cells else 0) < 2: + continue + + table_info = { + "table_id": f"Table_P{table.page}_T{idx}", + "page_number": table.page, + "table_index": idx, + "bbox": { + "x0": table._bbox[0] if hasattr(table, '_bbox') else 0, + "y0": table._bbox[1] if hasattr(table, '_bbox') else 0, + "x1": table._bbox[2] if hasattr(table, '_bbox') else 0, + "y1": table._bbox[3] if hasattr(table, '_bbox') else 0, + }, + "num_rows": len(table_cells), + "num_cols": len(table_cells[0]) if table_cells else 0, + "cells": table_cells, + "accuracy": float(table.accuracy) if hasattr(table, 'accuracy') else 0.0, + "extraction_method": "camelot", + } + + tables_data.append(table_info) + + except Exception as e: + print(f"[ERROR] Camelot extraction failed: {e}", file=sys.stderr) + + return tables_data + + +def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: + """Extract tables from PDF using PyMuPDF first, then camelot-py as fallback.
+ + Args: + pdf_path: Path to the PDF file + + Returns: + List of dictionaries containing table data and metadata + """ + tables_data = [] + + # Try PyMuPDF first + try: + with fitz.open(pdf_path) as doc: + for page_num, page in enumerate(doc, start=1): + tabs = page.find_tables() + + if not tabs.tables: + continue + + for table_idx, tab in enumerate(tabs.tables, start=1): + try: + table_cells = tab.extract() + + if not table_cells or len(table_cells) == 0: + continue + + bbox = tab.bbox + + table_info = { + "table_id": f"Table_P{page_num}_T{table_idx}", + "page_number": page_num, + "table_index": table_idx, + "bbox": {"x0": bbox.x0, "y0": bbox.y0, "x1": bbox.x1, "y1": bbox.y1}, + "num_rows": len(table_cells), + "num_cols": len(table_cells[0]) if table_cells else 0, + "cells": table_cells, + "extraction_method": "pymupdf", + } + + tables_data.append(table_info) + + except Exception as e: + tables_data.append({"table_id": f"Table_P{page_num}_T{table_idx}", "page_number": page_num, "table_index": table_idx, "error": str(e), "extraction_method": "pymupdf"}) + + except Exception as e: + print(f"[ERROR] PyMuPDF table extraction failed: {e}", file=sys.stderr) + + # If PyMuPDF found no tables, try camelot + if len(tables_data) == 0: + tables_data = extract_tables_with_camelot(pdf_path) + + return tables_data + + def extract_text_from_pdf(pdf_path: str) -> str: text = [] - print(f"Extracting text from {pdf_path}.") try: with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, start=1): @@ -45,7 +161,7 @@ def extract_text_from_pdf(pdf_path: str) -> str: except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - # Join all pages into a single string separated by newlines + return "\n".join(text) @@ -70,19 +186,20 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: return "\n".join(text) -# Save extracted text to a file. 
def save_to_file(text: str, output_path: str): + """Save extracted text to a file.""" try: with open(output_path, "w", encoding="utf-8") as f: f.write(text) - print(f"[INFO] Text successfully saved to {output_path}") except Exception as e: print(f"[ERROR] Could not save text to {output_path}: {e}", file=sys.stderr) def main(): - parser = argparse.ArgumentParser(description="Extract text from PDF using PyMuPDF.") + parser = argparse.ArgumentParser(description="Extract text and tables from PDF using PyMuPDF and camelot-py.") parser.add_argument("pdf", type=str, help="Path to the input PDF file.") + parser.add_argument("--output-dir", type=str, default="data/processed-text", help="Output directory for extracted text (default: data/processed-text)") + args = parser.parse_args() pdf_path = Path(args.pdf) @@ -90,11 +207,25 @@ def main(): print(f"[ERROR] File not found: {pdf_path}", file=sys.stderr) sys.exit(1) - # Perform extraction + # Extract text text = extract_text_from_pdf(str(pdf_path)) - output_path = Path("data/processed-text") / pdf_path.with_suffix(".txt").name + # Extract tables + tables_data = extract_tables_from_pdf(str(pdf_path)) + + # If tables were found, save JSON + if tables_data: + + # Save tables JSON + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + tables_json_path = output_dir / (pdf_path.stem + "_tables.json") + with open(tables_json_path, "w", encoding="utf-8") as f: + json.dump(tables_data, f, indent=2) + # Save combined text + output_path = Path(args.output_dir) / pdf_path.with_suffix(".txt").name save_to_file(text, str(output_path))