98 changes: 34 additions & 64 deletions classify_extract.py
@@ -50,76 +50,32 @@

def _process_single_pdf(
pdf_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
confidence_threshold: float,
max_chars: int,
num_ctx: int,
clf_model,
vectorizer,
encoder,
):
"""Run classify → extract pipeline on one or more PDFs.

For each PDF:
1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py)
2. Classify with XGBoost (pdf_classifier.py)
3. If 'useful': trim text to budget (llm_text.py), run LLM extraction
(llm_client.py), and save result JSON (llm_client.py)
4. Append a row to the summary CSV regardless of classification outcome

Args:
input_path: Path to a single PDF or a directory of PDFs.
model_dir: Directory containing classifier model artifacts.
llm_model: Ollama model name for extraction.
output_dir: Where to write JSON results and the summary CSV.
confidence_threshold: Classifier probability threshold for 'useful'.
max_chars: Max characters to send to the LLM.
num_ctx: Context window size for Ollama.
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if pdf_path.is_dir():
pdf_paths = sorted(pdf_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", pdf_path)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr)
elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf":
pdf_paths = [pdf_path]
else:
print(f"[ERROR] pdf must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr)
log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
sys.exit(1)

# ── Load classifier once (avoid re-reading model artifacts per file) ──
print("[INFO] Loading classifier...", file=sys.stderr)
try:
clf_model, vectorizer, encoder = load_classifier(model_dir)
except FileNotFoundError as e:
print(f"[ERROR] {e}", file=sys.stderr)
log.critical("Classifier artifacts not found: %s", e)
sys.exit(1)
print("[INFO] Classifier loaded.", file=sys.stderr)

"""Classify one PDF and return a summary row dict."""
output_dir.mkdir(parents=True, exist_ok=True)
summary_rows = []

for idx, pdf_path in enumerate(pdf_paths, start=1):
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)

row = {
"filename": pdf_path.name,
"classification": "",
"confidence": "",
"pred_prob": "",
"extraction_status": "",
"species_name": "",
"study_location": "",
"study_date": "",
"sample_size": "",
"num_empty_stomachs": "",
"num_nonempty_stomachs": "",
"fraction_feeding": "",
}
row = {
"filename": pdf_path.name,
"classification": "",
"confidence": "",
"pred_prob": "",
"extraction_status": "",
"species_name": "",
"study_location": "",
"study_date": "",
"sample_size": "",
"num_empty_stomachs": "",
"num_nonempty_stomachs": "",
"fraction_feeding": "",
}

# ── Step 1: Extract text ──────────────────────────────────────────
try:
@@ -239,6 +195,16 @@ def run_pipeline(
sys.exit(1)

output_dir.mkdir(parents=True, exist_ok=True)

print("[INFO] Loading classifier...", file=sys.stderr)
try:
clf_model, vectorizer, encoder = load_classifier(model_dir)
except FileNotFoundError as e:
print(f"[ERROR] {e}", file=sys.stderr)
log.critical("Classifier artifacts not found: %s", e)
sys.exit(1)
print("[INFO] Classifier loaded.", file=sys.stderr)

summary_rows = []

if workers > 1 and len(pdf_paths) > 1:
@@ -248,12 +214,14 @@
executor.submit(
_process_single_pdf,
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
clf_model,
vectorizer,
encoder,
): pdf_path
for pdf_path in pdf_paths
}
@@ -270,12 +238,14 @@
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
row = _process_single_pdf(
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
clf_model,
vectorizer,
encoder,
)
summary_rows.append(row)

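Taken together, the classify_extract.py hunks move classifier loading out of _process_single_pdf and into run_pipeline, so the XGBoost artifacts are read once and handed to every worker. Below is a minimal, self-contained sketch of that "load once, pass to workers" pattern; the stub functions stand in for load_classifier and _process_single_pdf, and the ThreadPoolExecutor / as_completed handling is an assumption, since the hunks only show executor.submit and the futures dict.

from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path


def load_classifier_stub(model_dir: str):
    # Stand-in for load_classifier(model_dir); returns (clf_model, vectorizer, encoder).
    return object(), object(), object()


def process_single_pdf_stub(pdf_path: Path, clf_model, vectorizer, encoder) -> dict:
    # Stand-in for _process_single_pdf: extract text, classify, run LLM
    # extraction when the label is 'useful', and return one summary-CSV row.
    return {"filename": pdf_path.name, "classification": "", "extraction_status": ""}


def run_pipeline_sketch(pdf_paths: list[Path], model_dir: str, workers: int) -> list[dict]:
    # Load the classifier artifacts once, before fanning out to workers.
    clf_model, vectorizer, encoder = load_classifier_stub(model_dir)

    summary_rows = []
    if workers > 1 and len(pdf_paths) > 1:
        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(process_single_pdf_stub, p, clf_model, vectorizer, encoder): p
                for p in pdf_paths
            }
            for future in as_completed(futures):
                summary_rows.append(future.result())
    else:
        for p in pdf_paths:
            summary_rows.append(process_single_pdf_stub(p, clf_model, vectorizer, encoder))
    return summary_rows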
1 change: 1 addition & 0 deletions src/llm/chunked_biomistral_llm.py
@@ -13,6 +13,7 @@
import logging
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Tuple

project_root = Path(__file__).parent.parent.parent
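The one-line change here adds the pathlib import that the module-level project_root = Path(__file__).parent.parent.parent expression needs, preventing a NameError at import time. For context, a sketch of the bootstrap that line typically feeds is below; the sys.path.insert step is an assumption about code outside this hunk, not something the diff shows.

import sys
from pathlib import Path

# Resolve the repository root three levels above this file, then make it
# importable; the sys.path manipulation is assumed, not shown in the hunk.
project_root = Path(__file__).parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))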
4 changes: 2 additions & 2 deletions src/llm/chunked_extraction.py
@@ -92,7 +92,7 @@ def extract_with_chunking(
):
"""Main extraction with chunking pipeline."""

print(f" [CHUNK] Loading classifier...", file=sys.stderr)
print(" [CHUNK] Loading classifier...", file=sys.stderr)
model, vectorizer, _encoder = load_classifier(model_dir)

chunks = chunk_text(text, chunk_size, overlap)
@@ -120,7 +120,7 @@

results = []
for i, (chunk, score) in enumerate(top_chunks):
print(f" [CHUNK] Extracting chunk {i+1}/{len(top_chunks)}...", file=sys.stderr)
print(f" [CHUNK] Extracting chunk {i + 1}/{len(top_chunks)}...", file=sys.stderr)

try:
metrics = extract_metrics_from_text(
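In this file, chunk_text(text, chunk_size, overlap) splits the document into overlapping windows before the classifier scores them and the top chunks are passed to extract_metrics_from_text. A sketch of a character-based sliding-window splitter in that spirit is below; the real chunk_text is not shown in this diff, and the character (rather than token) unit and the default sizes are assumptions.

from typing import List


def chunk_text_sketch(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
    # Overlapping character windows: each chunk starts (chunk_size - overlap)
    # characters after the previous one, so boundary sentences appear in two chunks.
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be larger than overlap")
    chunks: List[str] = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk:
            chunks.append(chunk)
        if start + chunk_size >= len(text):
            break
    return chunks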