diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index 981ab46..8abc543 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -12,8 +12,9 @@ import argparse import json import sys +import re from pathlib import Path -from typing import Optional +from typing import Optional, List from ollama import chat from pydantic import BaseModel, Field @@ -52,6 +53,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator - Look in tables, methods, and results sections - Empty stomachs: "empty", "vacant", "no prey" - Non-empty stomachs: "with prey", "fed", "containing food" +- Page markers appear as [PAGE N] in the text EXTRACT: - species_name: Scientific name of PRIMARY predator studied (not prey) @@ -138,6 +140,11 @@ def main(): # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) + + # Store original text for page extraction + original_text = text + print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr) + try: metrics = extract_metrics_from_text(text, model=args.model) except Exception as e: @@ -148,6 +155,19 @@ def main(): metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) + # Extract page numbers programmatically from where data was found + source_pages = set() + for field, value in metrics_dict.items(): + if value and field not in ['fraction_feeding']: + value_str = str(value) + if value_str in original_text: + pos = original_text.find(value_str) + page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos]) + if page_markers: + source_pages.add(int(page_markers[-1])) + + metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None + # Prepare output result = {"source_file": text_path.name, "metrics": metrics_dict} diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index a51ee2c..d3cbf12 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -156,7 +156,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" @@ -177,7 +178,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) page_text = pytesseract.image_to_string(img) - text.append(page_text) + # Add page marker for tracking + text.append(f"[PAGE {page_num}]\n{page_text}") except Exception as e: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return ""