98 changes: 34 additions & 64 deletions classify_extract.py
@@ -50,76 +50,32 @@

def _process_single_pdf(
pdf_path: Path,
model_dir: str,
llm_model: str,
output_dir: Path,
confidence_threshold: float,
max_chars: int,
num_ctx: int,
clf_model,
vectorizer,
encoder,
):
"""Run classify → extract pipeline on one or more PDFs.

For each PDF:
1. Extract text via PyMuPDF / OCR (pdf_text_extraction.py)
2. Classify with XGBoost (pdf_classifier.py)
3. If 'useful': trim text to budget (llm_text.py), run LLM extraction
(llm_client.py), and save result JSON (llm_client.py)
4. Append a row to the summary CSV regardless of classification outcome

Args:
input_path: Path to a single PDF or a directory of PDFs.
model_dir: Directory containing classifier model artifacts.
llm_model: Ollama model name for extraction.
output_dir: Where to write JSON results and the summary CSV.
confidence_threshold: Classifier probability threshold for 'useful'.
max_chars: Max characters to send to the LLM.
num_ctx: Context window size for Ollama.
"""
# ── Collect PDF paths ─────────────────────────────────────────────────
if pdf_path.is_dir():
pdf_paths = sorted(pdf_path.glob("*.pdf"))
if not pdf_paths:
print(f"[ERROR] No PDF files found in directory: {pdf_path}", file=sys.stderr)
log.error("No PDF files found in directory: %s", pdf_path)
sys.exit(1)
print(f"[INFO] Found {len(pdf_paths)} PDF(s) in {pdf_path}", file=sys.stderr)
elif pdf_path.is_file() and pdf_path.suffix.lower() == ".pdf":
pdf_paths = [pdf_path]
else:
print(f"[ERROR] pdf must be a .pdf file or a directory of PDFs: {pdf_path}", file=sys.stderr)
log.error("pdf must be a .pdf file or a directory of PDFs: %s", pdf_path)
sys.exit(1)

# ── Load classifier once (avoid re-reading model artifacts per file) ──
print("[INFO] Loading classifier...", file=sys.stderr)
try:
clf_model, vectorizer, encoder = load_classifier(model_dir)
except FileNotFoundError as e:
print(f"[ERROR] {e}", file=sys.stderr)
log.critical("Classifier artifacts not found: %s", e)
sys.exit(1)
print("[INFO] Classifier loaded.", file=sys.stderr)

"""Classify one PDF and return a summary row dict."""
output_dir.mkdir(parents=True, exist_ok=True)
summary_rows = []

for idx, pdf_path in enumerate(pdf_paths, start=1):
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)

row = {
"filename": pdf_path.name,
"classification": "",
"confidence": "",
"pred_prob": "",
"extraction_status": "",
"species_name": "",
"study_location": "",
"study_date": "",
"sample_size": "",
"num_empty_stomachs": "",
"num_nonempty_stomachs": "",
"fraction_feeding": "",
}
row = {
"filename": pdf_path.name,
"classification": "",
"confidence": "",
"pred_prob": "",
"extraction_status": "",
"species_name": "",
"study_location": "",
"study_date": "",
"sample_size": "",
"num_empty_stomachs": "",
"num_nonempty_stomachs": "",
"fraction_feeding": "",
}

# ── Step 1: Extract text ──────────────────────────────────────────
try:
@@ -239,6 +195,16 @@ def run_pipeline(
sys.exit(1)

output_dir.mkdir(parents=True, exist_ok=True)

print("[INFO] Loading classifier...", file=sys.stderr)
try:
clf_model, vectorizer, encoder = load_classifier(model_dir)
except FileNotFoundError as e:
print(f"[ERROR] {e}", file=sys.stderr)
log.critical("Classifier artifacts not found: %s", e)
sys.exit(1)
print("[INFO] Classifier loaded.", file=sys.stderr)

summary_rows = []

if workers > 1 and len(pdf_paths) > 1:
@@ -248,12 +214,14 @@
executor.submit(
_process_single_pdf,
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
clf_model,
vectorizer,
encoder,
): pdf_path
for pdf_path in pdf_paths
}
@@ -270,12 +238,14 @@
print(f"\n[{idx}/{len(pdf_paths)}] Processing: {pdf_path.name}", file=sys.stderr)
row = _process_single_pdf(
pdf_path,
model_dir,
llm_model,
output_dir,
confidence_threshold,
max_chars,
num_ctx,
clf_model,
vectorizer,
encoder,
)
summary_rows.append(row)

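Taken together, the classify_extract.py hunks move classifier loading out of _process_single_pdf and into run_pipeline, so the XGBoost artifacts are read once and handed to every worker. Below is a minimal, self-contained sketch of that "load once, pass to workers" pattern; the stub functions stand in for load_classifier and _process_single_pdf, and the ThreadPoolExecutor / as_completed handling is an assumption, since the hunks only show executor.submit and the futures dict.

from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path


def load_classifier_stub(model_dir: str):
    # Stand-in for load_classifier(model_dir); returns (clf_model, vectorizer, encoder).
    return object(), object(), object()


def process_single_pdf_stub(pdf_path: Path, clf_model, vectorizer, encoder) -> dict:
    # Stand-in for _process_single_pdf: extract text, classify, run LLM
    # extraction when the label is 'useful', and return one summary-CSV row.
    return {"filename": pdf_path.name, "classification": "", "extraction_status": ""}


def run_pipeline_sketch(pdf_paths: list[Path], model_dir: str, workers: int) -> list[dict]:
    # Load the classifier artifacts once, before fanning out to workers.
    clf_model, vectorizer, encoder = load_classifier_stub(model_dir)

    summary_rows = []
    if workers > 1 and len(pdf_paths) > 1:
        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(process_single_pdf_stub, p, clf_model, vectorizer, encoder): p
                for p in pdf_paths
            }
            for future in as_completed(futures):
                summary_rows.append(future.result())
    else:
        for p in pdf_paths:
            summary_rows.append(process_single_pdf_stub(p, clf_model, vectorizer, encoder))
    return summary_rows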
1 change: 1 addition & 0 deletions src/llm/chunked_biomistral_llm.py
@@ -13,6 +13,7 @@
import logging
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Tuple

project_root = Path(__file__).parent.parent.parent
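The one-line change here adds the pathlib import that the module-level project_root = Path(__file__).parent.parent.parent expression needs, preventing a NameError at import time. For context, a sketch of the bootstrap that line typically feeds is below; the sys.path.insert step is an assumption about code outside this hunk, not something the diff shows.

import sys
from pathlib import Path

# Resolve the repository root three levels above this file, then make it
# importable; the sys.path manipulation is assumed, not shown in the hunk.
project_root = Path(__file__).parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))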
4 changes: 2 additions & 2 deletions src/llm/chunked_extraction.py
@@ -92,7 +92,7 @@ def extract_with_chunking(
):
"""Main extraction with chunking pipeline."""

print(f" [CHUNK] Loading classifier...", file=sys.stderr)
print(" [CHUNK] Loading classifier...", file=sys.stderr)
model, vectorizer, _encoder = load_classifier(model_dir)

chunks = chunk_text(text, chunk_size, overlap)
@@ -120,7 +120,7 @@

results = []
for i, (chunk, score) in enumerate(top_chunks):
print(f" [CHUNK] Extracting chunk {i+1}/{len(top_chunks)}...", file=sys.stderr)
print(f" [CHUNK] Extracting chunk {i + 1}/{len(top_chunks)}...", file=sys.stderr)

try:
metrics = extract_metrics_from_text(
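In this file, chunk_text(text, chunk_size, overlap) splits the document into overlapping windows before the classifier scores them and the top chunks are passed to extract_metrics_from_text. A sketch of a character-based sliding-window splitter in that spirit is below; the real chunk_text is not shown in this diff, and the character (rather than token) unit and the default sizes are assumptions.

from typing import List


def chunk_text_sketch(text: str, chunk_size: int = 4000, overlap: int = 500) -> List[str]:
    # Overlapping character windows: each chunk starts (chunk_size - overlap)
    # characters after the previous one, so boundary sentences appear in two chunks.
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be larger than overlap")
    chunks: List[str] = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk:
            chunks.append(chunk)
        if start + chunk_size >= len(text):
            break
    return chunks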