workflow-training-data-extraction/outbreak_flagger_argo.py at main · NIAID-BRC-Codeathons/workflow-training-data-extraction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
Outbreak Flagger using ARGO LLM
Analyzes outbreak catalog and uses LLM to generate a comprehensive outbreak report
"""

import csv
import os
import sys
from datetime import datetime, timezone

# Add scripts directory to path to import ARGO
sys.path.append('scripts')
from ARGO import ArgoWrapper

class OutbreakFlaggerARGO:
    """Turn an outbreak catalog CSV into a markdown report written by the ARGO LLM.

    Typical use is ``OutbreakFlaggerARGO().run()``: the catalog is read and
    filtered, its entries are embedded in a prompt, the LLM's answer is taken
    as the finished report and written to ``output_path``.

    Attributes:
        catalog_path: Path of the catalog CSV (needs ``filename`` and
            ``description`` columns).
        output_path: Path the markdown report is written to.
        argo: ``ArgoWrapper`` client used to call the LLM.
        catalog_data: Rows (dicts) kept by the most recent ``read_catalog``.
    """

    # Shared by the prompt header and the console summary. Timestamps are
    # generated in UTC so that the literal "UTC" suffix is actually true.
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S UTC'

    SYSTEM_PROMPT = """You are an expert epidemiologist and outbreak analyst tasked with creating a comprehensive outbreak analysis report. You will analyze outbreak data catalog entries and produce a detailed markdown report identifying potential disease outbreaks.

Your report should be professional, thorough, and actionable for public health officials. Consider factors like geographic spread, case counts, mortality rates, vaccination status, emergence of variants, and unusual disease patterns.

Prioritize outbreaks by risk level based on:
- Severity (mortality, hospitalizations)
- Spread potential (geographic distribution, transmission rate)
- Public health impact (vulnerable populations, healthcare capacity)
- Novel or concerning characteristics"""

    def __init__(self, catalog_path="outbreak_data/catalog.csv",
                 output_path="potential_outbreaks.md", model="gpt4o"):
        """Create a flagger.

        Args:
            catalog_path: Catalog CSV to analyze.
            output_path: Destination of the generated markdown report.
            model: Model name handed to ``ArgoWrapper``.
        """
        self.catalog_path = catalog_path
        self.output_path = output_path
        self.argo = ArgoWrapper(model=model)
        self.catalog_data = []

    @classmethod
    def _utc_timestamp(cls):
        """Return the current time in UTC, formatted with ``TIMESTAMP_FORMAT``."""
        return datetime.now(timezone.utc).strftime(cls.TIMESTAMP_FORMAT)

    def read_catalog(self):
        """Read the catalog CSV and keep only the usable rows.

        A row is kept when it has a non-empty ``filename`` and ``description``,
        its filename does not start with ``test_`` and its description does
        not contain "no entries" (case-insensitive).

        The result replaces ``self.catalog_data``, so calling this method
        again re-reads the file instead of accumulating duplicate rows.

        Returns:
            The list of kept rows (also stored in ``self.catalog_data``).

        Raises:
            OSError: If the catalog file cannot be opened.
        """
        print("Reading catalog...")
        entries = []
        # utf-8-sig transparently drops a leading BOM (otherwise the first
        # header becomes '\ufefffilename' and every row is silently skipped);
        # newline='' is what the csv module requires for quoted line breaks.
        with open(self.catalog_path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                filename = row.get('filename')
                description = row.get('description')
                if not filename or not description:
                    continue
                # Skip test files and empty descriptions
                if filename.startswith('test_') or 'no entries' in description.lower():
                    continue
                entries.append(row)
        # Assign only after a complete read so a failure keeps the old data.
        self.catalog_data = entries
        print(f"Found {len(self.catalog_data)} valid catalog entries")
        return self.catalog_data

    def _format_catalog(self):
        """Render ``self.catalog_data`` as numbered text entries for the prompt."""
        return "\n\n".join(
            f"Entry {i}:\nFile: {entry['filename']}\nDescription: {entry['description']}"
            for i, entry in enumerate(self.catalog_data, 1)
        )

    def _build_user_prompt(self):
        """Build the user prompt: catalog entries plus the requested report layout."""
        catalog_text = self._format_catalog()
        generated_at = self._utc_timestamp()
        return f"""Analyze the following outbreak data catalog with {len(self.catalog_data)} entries and generate a comprehensive markdown report identifying all potential outbreaks that require investigation.

CATALOG DATA:
{catalog_text}

Generate a complete markdown report with the following structure:

# Potential Outbreak Analysis Report

Include:
- Header with generation date ({generated_at}) and metadata
- Executive summary
- Detailed analysis of each identified outbreak including:
  - Location, Date, Disease
  - Supporting data from the catalog
  - Epidemiological hypotheses explaining the outbreak
  - Specific URLs to crawl for verification and monitoring
- Risk assessment and prioritization
- Geographic distribution analysis
- Trend analysis and patterns
- Recommendations for immediate action
- Data gaps and limitations

For each outbreak, provide:
1. Clear identification (disease, location, timeframe)
2. All relevant data points from the descriptions
3. 3-5 hypotheses explaining the outbreak
4. 5-10 specific URLs for investigation including:
   - CDC/WHO disease-specific pages
   - ProMED searches
   - Local health department sites
   - News searches
   - Academic/research resources

Conclude with actionable next steps and recommendations.

Make the report comprehensive, well-structured, and ready for immediate use by outbreak response teams."""

    def generate_report_with_llm(self):
        """Ask the ARGO LLM for the complete markdown report.

        Returns:
            The report text, or ``None`` when the call fails or the reply
            has no ``'response'`` entry (a message is printed in both cases).
        """
        print("Generating comprehensive outbreak report with LLM...")
        user_prompt = self._build_user_prompt()

        try:
            response = self.argo.invoke(
                prompt_system=self.SYSTEM_PROMPT,
                prompt_user=user_prompt,
                temperature=0.1,  # Low temperature for factual, consistent output
                top_p=0.95
            )

            # The reply is expected to be a mapping with a 'response' key;
            # the check stays inside the try so an unexpected reply type is
            # reported like any other ARGO failure.
            if response and 'response' in response:
                return response['response']
            print("Error: Invalid response from ARGO")
            return None

        except Exception as e:
            # Boundary to an external service wrapper: whatever it raises is
            # reported and turned into "no report" rather than a crash.
            print(f"Error calling ARGO: {e}")
            return None

    def save_report(self, report_content):
        """Write the report to ``self.output_path``.

        Args:
            report_content: Markdown text to write.

        Returns:
            ``True`` on success; ``False`` when the content is empty or the
            file cannot be written (a message is printed).
        """
        if not report_content:
            print("No report content to save")
            return False

        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                f.write(report_content)
        except (OSError, UnicodeError, TypeError) as e:
            print(f"Error saving report: {e}")
            return False

        print(f"Report successfully saved to: {self.output_path}")
        return True

    def run(self):
        """Run the whole pipeline: read the catalog, generate, save, summarize.

        Every failure (unreadable catalog, no usable entries, LLM error,
        write error) is reported on stdout and ends the run early; nothing
        is raised to the caller.
        """
        print("=" * 60)
        print("OUTBREAK FLAGGER - ARGO LLM Analysis")
        print("=" * 60)

        # Read catalog
        try:
            self.read_catalog()
        except (OSError, UnicodeError, csv.Error) as e:
            print(f"Error reading catalog: {e}")
            return

        if not self.catalog_data:
            print("No valid catalog entries found")
            return

        # Generate report with LLM
        report = self.generate_report_with_llm()
        if not report:
            print("Failed to generate report")
            return

        # Save the report; save_report has already explained any failure
        if not self.save_report(report):
            return

        print("\n" + "=" * 60)
        print("Analysis complete!")
        print(f"Report saved to: {self.output_path}")
        print("=" * 60)

        # Print summary statistics
        print("\nSummary:")
        print(f"- Catalog entries analyzed: {len(self.catalog_data)}")
        print(f"- Report generated: {self.output_path}")
        print(f"- Timestamp: {self._utc_timestamp()}")


def main():
    """Script entry point: build a flagger with default settings and run it."""
    OutbreakFlaggerARGO().run()


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()