From 8dbbc917fec1fe11c2b4071fda34f4139e09508f Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Sun, 18 Jan 2026 13:20:55 -0800
Subject: [PATCH 1/3] Add ollama and pydantic to requirements for initial LLM implementation

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index b940946..14496ae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,6 @@ google-auth-oauthlib
 google-auth-httplib2
 numpy
 google-api-python-client
+ollama
+pydantic
 xgboost
\ No newline at end of file

From 5ef944ca882b0931399720af89a8a092bcfbcd0d Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Sun, 18 Jan 2026 13:24:11 -0800
Subject: [PATCH 2/3] Feat: Add Ollama-based predator diet metric extraction script

---
 src/llm/local_llm.py | 204 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 src/llm/local_llm.py

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
new file mode 100644
index 0000000..af122b0
--- /dev/null
+++ b/src/llm/local_llm.py
@@ -0,0 +1,204 @@
+"""LLM-based metric extraction from preprocessed text files.
+
+Usage:
+    python extract_metrics.py path/to/text_file.txt
+    python extract_metrics.py path/to/text_file.txt --model llama3.1:8b
+    python extract_metrics.py path/to/text_file.txt --output-dir results/
+
+This script uses Ollama to extract structured data from preprocessed predator diet 
+surveys, including species name, study date, location, and stomach content data.
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+from ollama import chat
+from pydantic import BaseModel, Field
+
+
+class PredatorDietMetrics(BaseModel):
+    """Structured schema for extracted predator diet survey metrics."""
+    
+    species_name: Optional[str] = Field(
+        None,
+        description="Scientific name of the predator species studied"
+    )
+    study_location: Optional[str] = Field(
+        None,
+        description="Geographic location where the study was conducted"
+    )
+    study_date: Optional[str] = Field(
+        None,
+        description="Year or date range when the study was conducted"
+    )
+    num_empty_stomachs: Optional[int] = Field(
+        None,
+        description="Number of predators with empty stomachs"
+    )
+    num_nonempty_stomachs: Optional[int] = Field(
+        None,
+        description="Number of predators with non-empty stomachs"
+    )
+    sample_size: Optional[int] = Field(
+        None,
+        description="Total number of predators surveyed"
+    )
+
+
+def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics:
+    """Extract structured metrics from text using Ollama.
+    
+    Args:
+        text: Preprocessed text content from a scientific publication
+        model: Name of the Ollama model to use
+        
+    Returns:
+        PredatorDietMetrics object with extracted data
+    """
+    prompt = f"""You are a scientific data extraction assistant specializing in predator diet surveys.
+
+Extract specific metrics from the text below. Focus on stomach content data where:
+- EMPTY stomachs = no food/prey
+- NON-EMPTY stomachs = contained food/prey
+- SAMPLE SIZE = total number of predators examined
+
+KEY INFORMATION TO FIND:
+- Species names are in Latin format (Genus species)
+- Look in tables, methods, and results sections
+- Empty stomachs: "empty", "vacant", "no prey"
+- Non-empty stomachs: "with prey", "fed", "containing food"
+
+EXTRACT:
+- species_name: Scientific name of PRIMARY predator studied (not prey)
+- study_location: Geographic location of sampling
+- study_date: Year or date range of collection
+- num_empty_stomachs: Number with empty stomachs
+- num_nonempty_stomachs: Number with food in stomachs
+- sample_size: Total number examined
+
+
+TEXT:
+{text}
+"""
+    # Ollama call with structured schema output
+    response = chat(
+        messages=[
+            {
+                'role': 'user',
+                'content': prompt,
+            }
+        ],
+        model=model,
+        format=PredatorDietMetrics.model_json_schema(),
+    )
+    
+    metrics = PredatorDietMetrics.model_validate_json(response.message.content)
+    return metrics
+
+
+def validate_and_calculate(metrics: dict) -> dict:
+    """Validate extracted metrics and calculate derived values.
+    
+    Args:
+        metrics: Dictionary of extracted metrics
+        
+    Returns:
+        Dictionary with validated metrics and calculated fraction_feeding
+    """
+    empty = metrics.get("num_empty_stomachs")
+    nonempty = metrics.get("num_nonempty_stomachs")
+    sample = metrics.get("sample_size")
+    
+    # Validate and fix sample size if needed
+    if empty is not None and nonempty is not None:
+        calculated_sample = empty + nonempty
+        if sample is None:
+            metrics["sample_size"] = calculated_sample
+            sample = calculated_sample
+        elif sample != calculated_sample:
+            # LLM made an error, use calculated value
+            metrics["sample_size"] = calculated_sample
+            sample = calculated_sample
+    
+    # Calculate fraction of feeding predators
+    fraction_feeding = None
+    if nonempty is not None and sample is not None and sample > 0:
+        fraction_feeding = round(nonempty / sample, 4)
+    
+    metrics["fraction_feeding"] = fraction_feeding
+    
+    return metrics
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Extract predator diet metrics from preprocessed text using LLM"
+    )
+    parser.add_argument(
+        "text_file",
+        type=str,
+        help="Path to the preprocessed text file"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="llama3.1:8b",
+        help="Ollama model to use (default: llama3.1:8b)"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/results",
+        help="Output directory for JSON results (default: data/results)"
+    )
+    
+    args = parser.parse_args()
+    
+    # Load text file
+    text_path = Path(args.text_file)
+    if not text_path.exists():
+        print(f"[ERROR] File not found: {text_path}", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        with open(text_path, "r", encoding="utf-8") as f:
+            text = f.read()
+    except Exception as e:
+        print(f"[ERROR] Failed to read file: {e}", file=sys.stderr)
+        sys.exit(1)
+    
+    # Extract metrics
+    print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)
+    try:
+        metrics = extract_metrics_from_text(text, model=args.model)
+    except Exception as e:
+        print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
+        sys.exit(1)
+    
+    # Validate and calculate derived metrics
+    metrics_dict = metrics.model_dump()
+    metrics_dict = validate_and_calculate(metrics_dict)
+    
+    # Prepare output
+    result = {
+        "source_file": text_path.name,
+        "metrics": metrics_dict
+    }
+    
+    # Generate output filename: input_name_results.json
+    output_filename = text_path.stem + "_results.json"
+    output_path = Path(args.output_dir) / output_filename
+    
+    # Save results
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2)
+    
+    print(f"Results saved to {output_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 82905ddf7fa139e4c861c950c08b66d78ea016a0 Mon Sep 17 00:00:00 2001
From: SeanClay10 <zweihander555@gmail.com>
Date: Sun, 18 Jan 2026 13:28:25 -0800
Subject: [PATCH 3/3] Fix: Formatting in src/llm/local_llm.py (whitespace, line wrapping, EOF newline)

---
 src/llm/local_llm.py | 101 ++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 69 deletions(-)

diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
index af122b0..981ab46 100644
--- a/src/llm/local_llm.py
+++ b/src/llm/local_llm.py
@@ -5,7 +5,7 @@
     python extract_metrics.py path/to/text_file.txt --model llama3.1:8b
     python extract_metrics.py path/to/text_file.txt --output-dir results/
 
-This script uses Ollama to extract structured data from preprocessed predator diet 
+This script uses Ollama to extract structured data from preprocessed predator diet
 surveys, including species name, study date, location, and stomach content data.
 """
 
@@ -21,40 +21,22 @@
 
 class PredatorDietMetrics(BaseModel):
     """Structured schema for extracted predator diet survey metrics."""
-    
-    species_name: Optional[str] = Field(
-        None,
-        description="Scientific name of the predator species studied"
-    )
-    study_location: Optional[str] = Field(
-        None,
-        description="Geographic location where the study was conducted"
-    )
-    study_date: Optional[str] = Field(
-        None,
-        description="Year or date range when the study was conducted"
-    )
-    num_empty_stomachs: Optional[int] = Field(
-        None,
-        description="Number of predators with empty stomachs"
-    )
-    num_nonempty_stomachs: Optional[int] = Field(
-        None,
-        description="Number of predators with non-empty stomachs"
-    )
-    sample_size: Optional[int] = Field(
-        None,
-        description="Total number of predators surveyed"
-    )
+
+    species_name: Optional[str] = Field(None, description="Scientific name of the predator species studied")
+    study_location: Optional[str] = Field(None, description="Geographic location where the study was conducted")
+    study_date: Optional[str] = Field(None, description="Year or date range when the study was conducted")
+    num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs")
+    num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs")
+    sample_size: Optional[int] = Field(None, description="Total number of predators surveyed")
 
 
 def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics:
     """Extract structured metrics from text using Ollama.
-    
+
     Args:
         text: Preprocessed text content from a scientific publication
         model: Name of the Ollama model to use
-        
+
     Returns:
         PredatorDietMetrics object with extracted data
     """
@@ -94,24 +76,24 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator
         model=model,
         format=PredatorDietMetrics.model_json_schema(),
     )
-    
+
     metrics = PredatorDietMetrics.model_validate_json(response.message.content)
     return metrics
 
 
 def validate_and_calculate(metrics: dict) -> dict:
     """Validate extracted metrics and calculate derived values.
-    
+
     Args:
         metrics: Dictionary of extracted metrics
-        
+
     Returns:
         Dictionary with validated metrics and calculated fraction_feeding
     """
     empty = metrics.get("num_empty_stomachs")
     nonempty = metrics.get("num_nonempty_stomachs")
     sample = metrics.get("sample_size")
-    
+
     # Validate and fix sample size if needed
     if empty is not None and nonempty is not None:
         calculated_sample = empty + nonempty
@@ -122,54 +104,38 @@ def validate_and_calculate(metrics: dict) -> dict:
             # LLM made an error, use calculated value
             metrics["sample_size"] = calculated_sample
             sample = calculated_sample
-    
+
     # Calculate fraction of feeding predators
     fraction_feeding = None
     if nonempty is not None and sample is not None and sample > 0:
         fraction_feeding = round(nonempty / sample, 4)
-    
+
     metrics["fraction_feeding"] = fraction_feeding
-    
+
     return metrics
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Extract predator diet metrics from preprocessed text using LLM"
-    )
-    parser.add_argument(
-        "text_file",
-        type=str,
-        help="Path to the preprocessed text file"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="llama3.1:8b",
-        help="Ollama model to use (default: llama3.1:8b)"
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        default="data/results",
-        help="Output directory for JSON results (default: data/results)"
-    )
-    
+    parser = argparse.ArgumentParser(description="Extract predator diet metrics from preprocessed text using LLM")
+    parser.add_argument("text_file", type=str, help="Path to the preprocessed text file")
+    parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
+    parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)")
+
     args = parser.parse_args()
-    
+
     # Load text file
     text_path = Path(args.text_file)
     if not text_path.exists():
         print(f"[ERROR] File not found: {text_path}", file=sys.stderr)
         sys.exit(1)
-    
+
     try:
         with open(text_path, "r", encoding="utf-8") as f:
             text = f.read()
     except Exception as e:
         print(f"[ERROR] Failed to read file: {e}", file=sys.stderr)
         sys.exit(1)
-    
+
     # Extract metrics
     print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)
     try:
@@ -177,28 +143,25 @@ def main():
     except Exception as e:
         print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
         sys.exit(1)
-    
+
     # Validate and calculate derived metrics
     metrics_dict = metrics.model_dump()
     metrics_dict = validate_and_calculate(metrics_dict)
-    
+
     # Prepare output
-    result = {
-        "source_file": text_path.name,
-        "metrics": metrics_dict
-    }
-    
+    result = {"source_file": text_path.name, "metrics": metrics_dict}
+
     # Generate output filename: input_name_results.json
     output_filename = text_path.stem + "_results.json"
     output_path = Path(args.output_dir) / output_filename
-    
+
     # Save results
     output_path.parent.mkdir(parents=True, exist_ok=True)
     with open(output_path, "w", encoding="utf-8") as f:
         json.dump(result, f, indent=2)
-    
+
     print(f"Results saved to {output_path}", file=sys.stderr)
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()