1212import argparse
1313import json
1414import sys
15+ import re
1516from pathlib import Path
1617from typing import Optional , List
1718
@@ -28,7 +29,6 @@ class PredatorDietMetrics(BaseModel):
2829 num_empty_stomachs : Optional [int ] = Field (None , description = "Number of predators with empty stomachs" )
2930 num_nonempty_stomachs : Optional [int ] = Field (None , description = "Number of predators with non-empty stomachs" )
3031 sample_size : Optional [int ] = Field (None , description = "Total number of predators surveyed" )
31- source_pages : Optional [List [int ]] = Field (None , description = "Page numbers where the key data was found (species, location, date, stomach counts)" )
3232
3333
3434def extract_metrics_from_text (text : str , model : str = "llama3.1:8b" ) -> PredatorDietMetrics :
@@ -62,7 +62,6 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator
6262- num_empty_stomachs: Number with empty stomachs
6363- num_nonempty_stomachs: Number with food in stomachs
6464- sample_size: Total number examined
65- - source_pages: List of page numbers where you found the key data (look for [PAGE N] markers)
6665
6766
6867TEXT:
@@ -141,6 +140,11 @@ def main():
141140
142141 # Extract metrics
143142 print (f"Extracting metrics from { text_path .name } ..." , file = sys .stderr )
143+
144+ # Store original text for page extraction
145+ original_text = text
146+ print (f"[INFO] Text size: { len (text )} chars" , file = sys .stderr )
147+
144148 try :
145149 metrics = extract_metrics_from_text (text , model = args .model )
146150 except Exception as e :
@@ -150,6 +154,19 @@ def main():
150154 # Validate and calculate derived metrics
151155 metrics_dict = metrics .model_dump ()
152156 metrics_dict = validate_and_calculate (metrics_dict )
157+
158+ # Infer source pages: find the first occurrence of each extracted value in the text and record the closest preceding [PAGE N] marker
159+ source_pages = set ()
160+ for field , value in metrics_dict .items ():
161+ if value and field not in ['fraction_feeding' ]:
162+ value_str = str (value )
163+ if value_str in original_text :
164+ pos = original_text .find (value_str )
165+ page_markers = re .findall (r'\[PAGE (\d+)\]' , original_text [:pos ])
166+ if page_markers :
167+ source_pages .add (int (page_markers [- 1 ]))
168+
169+ metrics_dict ["source_pages" ] = sorted (list (source_pages )) if source_pages else None
153170
154171 # Prepare output
155172 result = {"source_file" : text_path .name , "metrics" : metrics_dict }
0 commit comments