Skip to content

Commit 03cafe8

Browse files
committed
Removed the source_pages field from the LLM schema; derive page numbers from [PAGE N] markers with a regex instead
1 parent 55b92ff commit 03cafe8

1 file changed

Lines changed: 19 additions & 2 deletions

File tree

src/llm/local_llm.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import argparse
1313
import json
1414
import sys
15+
import re
1516
from pathlib import Path
1617
from typing import Optional, List
1718

@@ -28,7 +29,6 @@ class PredatorDietMetrics(BaseModel):
2829
num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs")
2930
num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs")
3031
sample_size: Optional[int] = Field(None, description="Total number of predators surveyed")
31-
source_pages: Optional[List[int]] = Field(None, description="Page numbers where the key data was found (species, location, date, stomach counts)")
3232

3333

3434
def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics:
@@ -62,7 +62,6 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator
6262
- num_empty_stomachs: Number with empty stomachs
6363
- num_nonempty_stomachs: Number with food in stomachs
6464
- sample_size: Total number examined
65-
- source_pages: List of page numbers where you found the key data (look for [PAGE N] markers)
6665
6766
6867
TEXT:
@@ -141,6 +140,11 @@ def main():
141140

142141
# Extract metrics
143142
print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)
143+
144+
# Store original text for page extraction
145+
original_text = text
146+
print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr)
147+
144148
try:
145149
metrics = extract_metrics_from_text(text, model=args.model)
146150
except Exception as e:
@@ -150,6 +154,19 @@ def main():
150154
# Validate and calculate derived metrics
151155
metrics_dict = metrics.model_dump()
152156
metrics_dict = validate_and_calculate(metrics_dict)
157+
158+
# Derive source pages programmatically: for each extracted value, take the last [PAGE N] marker preceding its first occurrence in the text
159+
source_pages = set()
160+
for field, value in metrics_dict.items():
161+
if value and field not in ['fraction_feeding']:
162+
value_str = str(value)
163+
if value_str in original_text:
164+
pos = original_text.find(value_str)
165+
page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos])
166+
if page_markers:
167+
source_pages.add(int(page_markers[-1]))
168+
169+
metrics_dict["source_pages"] = sorted(list(source_pages)) if source_pages else None
153170

154171
# Prepare output
155172
result = {"source_file": text_path.name, "metrics": metrics_dict}

0 commit comments

Comments
 (0)