Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/llm/local_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
import argparse
import json
import sys
import re
from pathlib import Path
from typing import Optional
from typing import Optional, List

from ollama import chat
from pydantic import BaseModel, Field
Expand Down Expand Up @@ -52,6 +53,7 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator
- Look in tables, methods, and results sections
- Empty stomachs: "empty", "vacant", "no prey"
- Non-empty stomachs: "with prey", "fed", "containing food"
- Page markers appear as [PAGE N] in the text

EXTRACT:
- species_name: Scientific name of PRIMARY predator studied (not prey)
Expand Down Expand Up @@ -138,6 +140,11 @@ def main():

# Extract metrics
print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)

# Store original text for page extraction
original_text = text
print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr)

try:
metrics = extract_metrics_from_text(text, model=args.model)
except Exception as e:
Expand All @@ -148,6 +155,19 @@ def main():
metrics_dict = metrics.model_dump()
metrics_dict = validate_and_calculate(metrics_dict)

# Infer source pages: for each extracted value that appears verbatim in the
# text, take its first occurrence and record the nearest preceding [PAGE N] marker
source_pages = set()
for field, value in metrics_dict.items():
if value and field not in ['fraction_feeding']:
value_str = str(value)
if value_str in original_text:
pos = original_text.find(value_str)
page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos])
if page_markers:
source_pages.add(int(page_markers[-1]))

metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None

# Prepare output
result = {"source_file": text_path.name, "metrics": metrics_dict}

Expand Down
6 changes: 4 additions & 2 deletions src/preprocessing/pdf_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def extract_text_from_pdf(pdf_path: str) -> str:
img = Image.open(io.BytesIO(pix.tobytes("png")))
page_text = pytesseract.image_to_string(img)

text.append(page_text)
# Prefix this page's text with a [PAGE N] marker so its page can be identified later
text.append(f"[PAGE {page_num}]\n{page_text}")
except Exception as e:
print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
return ""
Expand All @@ -177,7 +178,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
pix = page.get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))
page_text = pytesseract.image_to_string(img)
text.append(page_text)
# Prefix this page's text with a [PAGE N] marker so its page can be identified later
text.append(f"[PAGE {page_num}]\n{page_text}")
except Exception as e:
print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
return ""
Expand Down