From 5793b9a0817cf839e27951c0caafd0ac1deef2f6 Mon Sep 17 00:00:00 2001
From: SeanClay10
Date: Wed, 29 Apr 2026 15:46:58 -0700
Subject: [PATCH 1/3] fix: load classifier once per pipeline run instead of per PDF

---
 classify_extract.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/classify_extract.py b/classify_extract.py
index f44fbbb..66e95b1 100644
--- a/classify_extract.py
+++ b/classify_extract.py
@@ -50,12 +50,14 @@
 
 def _process_single_pdf(
     pdf_path: Path,
-    model_dir: str,
     llm_model: str,
     output_dir: Path,
     confidence_threshold: float,
     max_chars: int,
     num_ctx: int,
+    clf_model,
+    vectorizer,
+    encoder,
 ):
     """Run classify → extract pipeline on one or more PDFs.
 
@@ -90,16 +92,6 @@ def _process_single_pdf(
         log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
         sys.exit(1)
 
-    # ── Load classifier once (avoid re-reading model artifacts per file) ──
-    print("[INFO] Loading classifier...", file=sys.stderr)
-    try:
-        clf_model, vectorizer, encoder = load_classifier(model_dir)
-    except FileNotFoundError as e:
-        print(f"[ERROR] {e}", file=sys.stderr)
-        log.critical("Classifier artifacts not found: %s", e)
-        sys.exit(1)
-    print("[INFO] Classifier loaded.", file=sys.stderr)
-
     output_dir.mkdir(parents=True, exist_ok=True)
 
     summary_rows = []
@@ -239,6 +231,16 @@ def run_pipeline(
         sys.exit(1)
 
     output_dir.mkdir(parents=True, exist_ok=True)
+
+    print("[INFO] Loading classifier...", file=sys.stderr)
+    try:
+        clf_model, vectorizer, encoder = load_classifier(model_dir)
+    except FileNotFoundError as e:
+        print(f"[ERROR] {e}", file=sys.stderr)
+        log.critical("Classifier artifacts not found: %s", e)
+        sys.exit(1)
+    print("[INFO] Classifier loaded.", file=sys.stderr)
+
     summary_rows = []
 
     if workers > 1 and len(pdf_paths) > 1:
@@ -248,12 +250,14 @@ def run_pipeline(
                 executor.submit(
                     _process_single_pdf,
                     pdf_path,
-                    model_dir,
                     llm_model,
                     output_dir,
                     confidence_threshold,
                     max_chars,
                     num_ctx,
+                    clf_model,
+                    vectorizer,
+                    encoder,
                 ): pdf_path
                 for pdf_path in pdf_paths
             }
@@ -270,12 +274,14 @@ def run_pipeline(
         print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
         row = _process_single_pdf(
             pdf_path,
-            model_dir,
             llm_model,
             output_dir,
             confidence_threshold,
             max_chars,
             num_ctx,
+            clf_model,
+            vectorizer,
+            encoder,
         )
         summary_rows.append(row)
 

From 8ff9ac0b341dd2a1ba6c456d906dd99b05ddf1bb Mon Sep 17 00:00:00 2001
From: SeanClay10
Date: Wed, 29 Apr 2026 15:50:18 -0700
Subject: [PATCH 2/3] fix: remove dead code and update docstrings

---
 classify_extract.py | 66 +++++++++++----------------------------------
 1 file changed, 15 insertions(+), 51 deletions(-)

diff --git a/classify_extract.py b/classify_extract.py
index 66e95b1..2c8b5fa 100644
--- a/classify_extract.py
+++ b/classify_extract.py
@@ -59,59 +59,23 @@ def _process_single_pdf(
     vectorizer,
     encoder,
 ):
-    """Run classify → extract pipeline on one or more PDFs.
-
-    For each PDF:
-    1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py)
-    2. Classify with XGBoost (pdf_classifier.py)
-    3. If 'useful': trim text to budget (llm_text.py), run LLM extraction
-       (llm_client.py), and save result JSON (llm_client.py)
-    4. Append a row to the summary CSV regardless of classification outcome
-
-    Args:
-        input_path: Path to a single PDF or a directory of PDFs.
-        model_dir: Directory containing classifier model artifacts.
-        llm_model: Ollama model name for extraction.
-        output_dir: Where to write JSON results and the summary CSV.
-        confidence_threshold: Classifier probability threshold for 'useful'.
-        max_chars: Max characters to send to the LLM.
-        num_ctx: Context window size for Ollama.
-    """
-    # ── Collect PDF paths ─────────────────────────────────────────────────
-    if pdf_path.is_dir():
-        pdf_paths = sorted(pdf_path.glob("*.pdf"))
-        if not pdf_paths:
-            print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr)
-            log.error("No PDF files found in directory: %s", pdf_path)
-            sys.exit(1)
-        print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr)
-    elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf":
-        pdf_paths = [pdf_path]
-    else:
-        print(f"[ERROR] pdf must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr)
-        log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
-        sys.exit(1)
-
+    """Run the classify → extract pipeline on a single PDF and return its summary row dict."""
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    summary_rows = []
-    for idx, pdf_path in enumerate(pdf_paths, start=1):
-        print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
-
-        row = {
-            "filename": pdf_path.name,
-            "classification": "",
-            "confidence": "",
-            "pred_prob": "",
-            "extraction_status": "",
-            "species_name": "",
-            "study_location": "",
-            "study_date": "",
-            "sample_size": "",
-            "num_empty_stomachs": "",
-            "num_nonempty_stomachs": "",
-            "fraction_feeding": "",
-        }
+    row = {
+        "filename": pdf_path.name,
+        "classification": "",
+        "confidence": "",
+        "pred_prob": "",
+        "extraction_status": "",
+        "species_name": "",
+        "study_location": "",
+        "study_date": "",
+        "sample_size": "",
+        "num_empty_stomachs": "",
+        "num_nonempty_stomachs": "",
+        "fraction_feeding": "",
+    }
 
     # ── Step 1: Extract text ──────────────────────────────────────────
     try:

From 455bea5dc636bef7cfc19b5255246a3ddc7aa49c Mon Sep 17 00:00:00 2001
From: SeanClay10
Date: Wed, 29 Apr 2026 15:51:52 -0700
Subject: [PATCH 3/3] fix: linting

---
 src/llm/chunked_biomistral_llm.py | 1 +
 src/llm/chunked_extraction.py     | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llm/chunked_biomistral_llm.py b/src/llm/chunked_biomistral_llm.py
index 89a34b0..9064fc1 100644
--- a/src/llm/chunked_biomistral_llm.py
+++ b/src/llm/chunked_biomistral_llm.py
@@ -13,6 +13,7 @@
 import logging
 import sys
 from collections import Counter
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
 project_root = Path(__file__).parent.parent.parent

diff --git a/src/llm/chunked_extraction.py b/src/llm/chunked_extraction.py
index e1d9e3f..5c8238b 100644
--- a/src/llm/chunked_extraction.py
+++ b/src/llm/chunked_extraction.py
@@ -92,7 +92,7 @@ def extract_with_chunking(
 ):
     """Main extraction with chunking pipeline."""
 
-    print(f" [CHUNK] Loading classifier...", file=sys.stderr)
+    print(" [CHUNK] Loading classifier...", file=sys.stderr)
     model, vectorizer, _encoder = load_classifier(model_dir)
 
     chunks = chunk_text(text, chunk_size, overlap)
@@ -120,7 +120,7 @@ def extract_with_chunking(
 
     results = []
     for i, (chunk, score) in enumerate(top_chunks):
-        print(f" [CHUNK] Extracting chunk {i+1}/{len(top_chunks)}...", file=sys.stderr)
+        print(f" [CHUNK] Extracting chunk {i + 1}/{len(top_chunks)}...", file=sys.stderr)
 
         try:
             metrics = extract_metrics_from_text(
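
Note on PATCH 1, for readers skimming the series: the change is the usual "load once, pass down" refactor. The expensive artifact load is hoisted out of the per-file worker into the driver, and the already-loaded objects are handed to every call, including the executor.submit fan-out. Below is a minimal, self-contained Python sketch of that pattern; all names are illustrative stand-ins, not this repo's actual functions:

    from pathlib import Path

    def load_classifier(model_dir: str):
        # Stand-in for the expensive step: reading model artifacts from disk.
        print(f"loading artifacts from {model_dir} (expensive, do this once)")
        return "clf_model", "vectorizer", "encoder"

    def process_one(pdf_path: Path, clf_model, vectorizer, encoder) -> dict:
        # Per-file worker: receives already-loaded artifacts, reloads nothing.
        return {"filename": pdf_path.name, "classification": "useful"}

    def run(pdf_paths: list[Path], model_dir: str) -> list[dict]:
        # Load once per pipeline run, then pass the objects to each worker call.
        clf_model, vectorizer, encoder = load_classifier(model_dir)
        return [process_one(p, clf_model, vectorizer, encoder) for p in pdf_paths]

    if __name__ == "__main__":
        print(run([Path("a.pdf"), Path("b.pdf")], "models/"))

Under this shape, load_classifier runs exactly once per pipeline invocation regardless of how many PDFs are processed, which is what the workers > 1 path in PATCH 1 relies on as well.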