From 8dbbc917fec1fe11c2b4071fda34f4139e09508f Mon Sep 17 00:00:00 2001 From: SeanClay10 Date: Sun, 18 Jan 2026 13:20:55 -0800 Subject: [PATCH 1/3] Update requirements for initial LLM implementation. --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index b940946..14496ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,6 @@ google-auth-oauthlib google-auth-httplib2 numpy google-api-python-client +ollama +pydantic xgboost \ No newline at end of file From 5ef944ca882b0931399720af89a8a092bcfbcd0d Mon Sep 17 00:00:00 2001 From: SeanClay10 Date: Sun, 18 Jan 2026 13:24:11 -0800 Subject: [PATCH 2/3] Feat: Initial LLM implementation. --- src/llm/local_llm.py | 204 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 src/llm/local_llm.py diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py new file mode 100644 index 0000000..af122b0 --- /dev/null +++ b/src/llm/local_llm.py @@ -0,0 +1,204 @@ +"""LLM-based metric extraction from preprocessed text files. + +Usage: + python extract_metrics.py path/to/text_file.txt + python extract_metrics.py path/to/text_file.txt --model llama3.1:8b + python extract_metrics.py path/to/text_file.txt --output-dir results/ + +This script uses Ollama to extract structured data from preprocessed predator diet +surveys, including species name, study date, location, and stomach content data. +""" + +import argparse +import json +import sys +from pathlib import Path +from typing import Optional + +from ollama import chat +from pydantic import BaseModel, Field + + +class PredatorDietMetrics(BaseModel): + """Structured schema for extracted predator diet survey metrics.""" + + species_name: Optional[str] = Field( + None, + description="Scientific name of the predator species studied" + ) + study_location: Optional[str] = Field( + None, + description="Geographic location where the study was conducted" + ) + study_date: Optional[str] = Field( + None, + description="Year or date range when the study was conducted" + ) + num_empty_stomachs: Optional[int] = Field( + None, + description="Number of predators with empty stomachs" + ) + num_nonempty_stomachs: Optional[int] = Field( + None, + description="Number of predators with non-empty stomachs" + ) + sample_size: Optional[int] = Field( + None, + description="Total number of predators surveyed" + ) + + +def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: + """Extract structured metrics from text using Ollama. + + Args: + text: Preprocessed text content from a scientific publication + model: Name of the Ollama model to use + + Returns: + PredatorDietMetrics object with extracted data + """ + prompt = f"""You are a scientific data extraction assistant specializing in predator diet surveys. + +Extract specific metrics from the text below. Focus on stomach content data where: +- EMPTY stomachs = no food/prey +- NON-EMPTY stomachs = contained food/prey +- SAMPLE SIZE = total number of predators examined + +KEY INFORMATION TO FIND: +- Species names are in Latin format (Genus species) +- Look in tables, methods, and results sections +- Empty stomachs: "empty", "vacant", "no prey" +- Non-empty stomachs: "with prey", "fed", "containing food" + +EXTRACT: +- species_name: Scientific name of PRIMARY predator studied (not prey) +- study_location: Geographic location of sampling +- study_date: Year or date range of collection +- num_empty_stomachs: Number with empty stomachs +- num_nonempty_stomachs: Number with food in stomachs +- sample_size: Total number examined + + +TEXT: +{text} +""" + # Ollama call with structured schema output + response = chat( + messages=[ + { + 'role': 'user', + 'content': prompt, + } + ], + model=model, + format=PredatorDietMetrics.model_json_schema(), + ) + + metrics = PredatorDietMetrics.model_validate_json(response.message.content) + return metrics + + +def validate_and_calculate(metrics: dict) -> dict: + """Validate extracted metrics and calculate derived values. + + Args: + metrics: Dictionary of extracted metrics + + Returns: + Dictionary with validated metrics and calculated fraction_feeding + """ + empty = metrics.get("num_empty_stomachs") + nonempty = metrics.get("num_nonempty_stomachs") + sample = metrics.get("sample_size") + + # Validate and fix sample size if needed + if empty is not None and nonempty is not None: + calculated_sample = empty + nonempty + if sample is None: + metrics["sample_size"] = calculated_sample + sample = calculated_sample + elif sample != calculated_sample: + # LLM made an error, use calculated value + metrics["sample_size"] = calculated_sample + sample = calculated_sample + + # Calculate fraction of feeding predators + fraction_feeding = None + if nonempty is not None and sample is not None and sample > 0: + fraction_feeding = round(nonempty / sample, 4) + + metrics["fraction_feeding"] = fraction_feeding + + return metrics + + +def main(): + parser = argparse.ArgumentParser( + description="Extract predator diet metrics from preprocessed text using LLM" + ) + parser.add_argument( + "text_file", + type=str, + help="Path to the preprocessed text file" + ) + parser.add_argument( + "--model", + type=str, + default="llama3.1:8b", + help="Ollama model to use (default: llama3.1:8b)" + ) + parser.add_argument( + "--output-dir", + type=str, + default="data/results", + help="Output directory for JSON results (default: data/results)" + ) + + args = parser.parse_args() + + # Load text file + text_path = Path(args.text_file) + if not text_path.exists(): + print(f"[ERROR] File not found: {text_path}", file=sys.stderr) + sys.exit(1) + + try: + with open(text_path, "r", encoding="utf-8") as f: + text = f.read() + except Exception as e: + print(f"[ERROR] Failed to read file: {e}", file=sys.stderr) + sys.exit(1) + + # Extract metrics + print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) + try: + metrics = extract_metrics_from_text(text, model=args.model) + except Exception as e: + print(f"[ERROR] Extraction failed: {e}", file=sys.stderr) + sys.exit(1) + + # Validate and calculate derived metrics + metrics_dict = metrics.model_dump() + metrics_dict = validate_and_calculate(metrics_dict) + + # Prepare output + result = { + "source_file": text_path.name, + "metrics": metrics_dict + } + + # Generate output filename: input_name_results.json + output_filename = text_path.stem + "_results.json" + output_path = Path(args.output_dir) / output_filename + + # Save results + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2) + + print(f"Results saved to {output_path}", file=sys.stderr) + + +if __name__ == "__main__": + main() \ No newline at end of file From 82905ddf7fa139e4c861c950c08b66d78ea016a0 Mon Sep 17 00:00:00 2001 From: SeanClay10 Date: Sun, 18 Jan 2026 13:28:25 -0800 Subject: [PATCH 3/3] Fix: formatting issues. --- src/llm/local_llm.py | 101 ++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 69 deletions(-) diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py index af122b0..981ab46 100644 --- a/src/llm/local_llm.py +++ b/src/llm/local_llm.py @@ -5,7 +5,7 @@ python extract_metrics.py path/to/text_file.txt --model llama3.1:8b python extract_metrics.py path/to/text_file.txt --output-dir results/ -This script uses Ollama to extract structured data from preprocessed predator diet +This script uses Ollama to extract structured data from preprocessed predator diet surveys, including species name, study date, location, and stomach content data. """ @@ -21,40 +21,22 @@ class PredatorDietMetrics(BaseModel): """Structured schema for extracted predator diet survey metrics.""" - - species_name: Optional[str] = Field( - None, - description="Scientific name of the predator species studied" - ) - study_location: Optional[str] = Field( - None, - description="Geographic location where the study was conducted" - ) - study_date: Optional[str] = Field( - None, - description="Year or date range when the study was conducted" - ) - num_empty_stomachs: Optional[int] = Field( - None, - description="Number of predators with empty stomachs" - ) - num_nonempty_stomachs: Optional[int] = Field( - None, - description="Number of predators with non-empty stomachs" - ) - sample_size: Optional[int] = Field( - None, - description="Total number of predators surveyed" - ) + + species_name: Optional[str] = Field(None, description="Scientific name of the predator species studied") + study_location: Optional[str] = Field(None, description="Geographic location where the study was conducted") + study_date: Optional[str] = Field(None, description="Year or date range when the study was conducted") + num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs") + num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs") + sample_size: Optional[int] = Field(None, description="Total number of predators surveyed") def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics: """Extract structured metrics from text using Ollama. - + Args: text: Preprocessed text content from a scientific publication model: Name of the Ollama model to use - + Returns: PredatorDietMetrics object with extracted data """ @@ -94,24 +76,24 @@ def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> Predator model=model, format=PredatorDietMetrics.model_json_schema(), ) - + metrics = PredatorDietMetrics.model_validate_json(response.message.content) return metrics def validate_and_calculate(metrics: dict) -> dict: """Validate extracted metrics and calculate derived values. - + Args: metrics: Dictionary of extracted metrics - + Returns: Dictionary with validated metrics and calculated fraction_feeding """ empty = metrics.get("num_empty_stomachs") nonempty = metrics.get("num_nonempty_stomachs") sample = metrics.get("sample_size") - + # Validate and fix sample size if needed if empty is not None and nonempty is not None: calculated_sample = empty + nonempty @@ -122,54 +104,38 @@ def validate_and_calculate(metrics: dict) -> dict: # LLM made an error, use calculated value metrics["sample_size"] = calculated_sample sample = calculated_sample - + # Calculate fraction of feeding predators fraction_feeding = None if nonempty is not None and sample is not None and sample > 0: fraction_feeding = round(nonempty / sample, 4) - + metrics["fraction_feeding"] = fraction_feeding - + return metrics def main(): - parser = argparse.ArgumentParser( - description="Extract predator diet metrics from preprocessed text using LLM" - ) - parser.add_argument( - "text_file", - type=str, - help="Path to the preprocessed text file" - ) - parser.add_argument( - "--model", - type=str, - default="llama3.1:8b", - help="Ollama model to use (default: llama3.1:8b)" - ) - parser.add_argument( - "--output-dir", - type=str, - default="data/results", - help="Output directory for JSON results (default: data/results)" - ) - + parser = argparse.ArgumentParser(description="Extract predator diet metrics from preprocessed text using LLM") + parser.add_argument("text_file", type=str, help="Path to the preprocessed text file") + parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)") + parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)") + args = parser.parse_args() - + # Load text file text_path = Path(args.text_file) if not text_path.exists(): print(f"[ERROR] File not found: {text_path}", file=sys.stderr) sys.exit(1) - + try: with open(text_path, "r", encoding="utf-8") as f: text = f.read() except Exception as e: print(f"[ERROR] Failed to read file: {e}", file=sys.stderr) sys.exit(1) - + # Extract metrics print(f"Extracting metrics from {text_path.name}...", file=sys.stderr) try: @@ -177,28 +143,25 @@ def main(): except Exception as e: print(f"[ERROR] Extraction failed: {e}", file=sys.stderr) sys.exit(1) - + # Validate and calculate derived metrics metrics_dict = metrics.model_dump() metrics_dict = validate_and_calculate(metrics_dict) - + # Prepare output - result = { - "source_file": text_path.name, - "metrics": metrics_dict - } - + result = {"source_file": text_path.name, "metrics": metrics_dict} + # Generate output filename: input_name_results.json output_filename = text_path.stem + "_results.json" output_path = Path(args.output_dir) / output_filename - + # Save results output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: json.dump(result, f, indent=2) - + print(f"Results saved to {output_path}", file=sys.stderr) if __name__ == "__main__": - main() \ No newline at end of file + main()