"""
Outbreak Symptom Trend Analysis
Uses FireCrawl to find recent measles and monkeypox outbreaks,
then analyzes Google Trends data for fever and rash symptoms.
"""
import json
import signal
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import dotenv
import pandas as pd
from firecrawl import FirecrawlApp
from pytrends.request import TrendReq

# Project-local helpers
from firecrawl_response_formatter import format_response
from data_repository_writer import write_to_repository
# Get the API key from .env, or fail fast with a clear error
FIRECRAWL_API_KEY = dotenv.get_key(dotenv.find_dotenv(), "FIRECRAWL_API_KEY")
if not FIRECRAWL_API_KEY:
    raise ValueError("FIRECRAWL_API_KEY not set in .env")
# Configuration
FIRECRAWL_CONFIG = {
    "api_key": FIRECRAWL_API_KEY
}
# Timeout handler for searches
def timeout_handler(signum, frame):
    raise TimeoutError("Search operation timed out")
def execute_search(query: str, num_results: int = 5, use_timeout_signal: bool = True) -> List[Dict[str, Any]]:
    """
    Execute a search query using Firecrawl.

    Args:
        query: The search query
        num_results: Number of results to return
        use_timeout_signal: Whether to use a signal-based timeout (only works in the main thread)

    Returns:
        List of search results with content
    """
    print(f"\n🔍 Searching with Firecrawl: {query}")

    # Initialize the Firecrawl client
    firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_CONFIG["api_key"])

    try:
        # Only use a signal-based timeout in the main thread
        if use_timeout_signal:
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(120)  # 2-minute overall guard around the search

        # Execute the search with Firecrawl
        search_response = firecrawl_app.search(
            query=query,
            timeout=30000,  # per-request timeout in milliseconds
            limit=num_results,
            scrape_options={'formats': ['markdown']}
        )
        formatted_results = format_response(query, search_response)
        print(f"✅ Found {len(formatted_results)} results from Firecrawl")
        write_to_repository(formatted_results)
        return formatted_results
    except TimeoutError:
        print(f"⏱️ Search timed out after 120 seconds: {query}")
        return []
    except Exception as e:
        print(f"❌ Error searching with Firecrawl: {e}")
        return []
    finally:
        if use_timeout_signal:
            signal.alarm(0)  # cancel any pending alarm
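
# Illustrative usage of execute_search (a sketch, not executed here; the
# query string is an example):
#
#   results = execute_search("measles outbreak 2025 CDC", num_results=3)
#   for r in results:
#       # format_response is expected to produce dicts with 'content' and
#       # 'snippet' keys, as consumed by extract_outbreak_info below
#       print(r.get('snippet', ''))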
def find_outbreak_data(disease: str, timeframe: str = "2024 OR 2025") -> List[Dict[str, Any]]:
    """
    Search for recent outbreak announcements for a specific disease
    """
    queries = [
        f"{disease} outbreak {timeframe} United States state",
        f"{disease} cases reported {timeframe} US state health department",
        f"CDC {disease} outbreak {timeframe}"
    ]
    all_results = []
    for query in queries:
        results = execute_search(query, num_results=10)
        all_results.extend(results)
        time.sleep(2)  # Be respectful to the API
    return all_results
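
# For disease='measles' with the default timeframe, the queries above expand to:
#   'measles outbreak 2024 OR 2025 United States state'
#   'measles cases reported 2024 OR 2025 US state health department'
#   'CDC measles outbreak 2024 OR 2025'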
def extract_outbreak_info(results: List[Dict[str, Any]], disease: str) -> List[Dict[str, Any]]:
    """
    Parse outbreak information from search results to extract:
    - State(s) affected
    - Date of announcement
    - Number of cases (if available)
    """
    outbreaks = []

    # US state list for reference
    us_states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
        'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
        'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
        'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
        'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
        'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
        'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
        'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
        'West Virginia', 'Wisconsin', 'Wyoming'
    ]

    for result in results:
        content = result.get('content', '').lower()

        # Find mentioned states (naive substring matching; may over-match,
        # e.g. 'Washington' also matches 'Washington Post')
        mentioned_states = [state for state in us_states if state.lower() in content]

        if mentioned_states:
            outbreak_info = {
                'disease': disease,
                'states': mentioned_states,
                'snippet': result.get('snippet', ''),
                'content_preview': content[:1000]
            }
            outbreaks.append(outbreak_info)
    return outbreaks
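
# Illustrative input/output for extract_outbreak_info (the search result
# below is synthetic):
#
#   sample = [{'content': 'Texas health officials confirmed new measles cases',
#              'snippet': 'Texas confirms measles cases'}]
#   extract_outbreak_info(sample, 'measles')
#   # -> [{'disease': 'measles', 'states': ['Texas'],
#   #      'snippet': 'Texas confirms measles cases',
#   #      'content_preview': 'texas health officials confirmed new measles cases'}]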
def get_nearby_states(state: str) -> List[str]:
    """
    Return neighboring states for comparison (simplified mapping)
    """
    # Simplified neighbor mapping for major states; unlisted states
    # return an empty list
    state_neighbors = {
        'California': ['Nevada', 'Arizona', 'Oregon'],
        'Texas': ['Louisiana', 'Oklahoma', 'New Mexico', 'Arkansas'],
        'Florida': ['Georgia', 'Alabama'],
        'New York': ['New Jersey', 'Pennsylvania', 'Connecticut', 'Massachusetts'],
        'Illinois': ['Indiana', 'Wisconsin', 'Iowa', 'Missouri'],
        'Pennsylvania': ['New York', 'New Jersey', 'Ohio', 'Maryland'],
        'Ohio': ['Pennsylvania', 'Indiana', 'Michigan', 'Kentucky'],
        'Georgia': ['Florida', 'Alabama', 'South Carolina', 'Tennessee'],
        'North Carolina': ['South Carolina', 'Virginia', 'Tennessee', 'Georgia'],
        'Michigan': ['Ohio', 'Indiana', 'Wisconsin'],
        'Washington': ['Oregon', 'Idaho'],
        'Arizona': ['California', 'Nevada', 'New Mexico'],
        'Massachusetts': ['New Hampshire', 'Rhode Island', 'Connecticut', 'New York'],
        'Indiana': ['Illinois', 'Ohio', 'Michigan', 'Kentucky'],
        'Tennessee': ['Kentucky', 'Virginia', 'North Carolina', 'Georgia', 'Alabama', 'Mississippi', 'Arkansas', 'Missouri'],
        'Missouri': ['Illinois', 'Iowa', 'Kansas', 'Oklahoma', 'Arkansas', 'Tennessee', 'Kentucky'],
        'Maryland': ['Pennsylvania', 'Delaware', 'Virginia', 'West Virginia'],
        'Wisconsin': ['Michigan', 'Illinois', 'Iowa', 'Minnesota'],
        'Colorado': ['Wyoming', 'Nebraska', 'Kansas', 'Oklahoma', 'New Mexico', 'Utah'],
        'Minnesota': ['Wisconsin', 'Iowa', 'South Dakota', 'North Dakota'],
    }
    return state_neighbors.get(state, [])
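
# Illustrative control-group selection (mirrors the logic used in
# analyze_disease_outbreak below):
#
#   outbreak_states = ['Texas']
#   [n for n in get_nearby_states('Texas') if n not in outbreak_states]
#   # -> ['Louisiana', 'Oklahoma', 'New Mexico', 'Arkansas']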
def analyze_google_trends(
    symptoms: List[str],
    states: List[str],
    end_date: datetime,
    days_before: int = 7,
    max_retries: int = 5,
    initial_delay: int = 30
) -> pd.DataFrame:
    """
    Analyze Google Trends data for symptoms in specific states.

    Args:
        symptoms: List of symptoms to track (e.g., ['fever', 'rash'])
        states: List of US states (2-letter codes or full names)
        end_date: End date for the analysis (typically the outbreak announcement date)
        days_before: Number of days before end_date to analyze
        max_retries: Maximum number of retry attempts for rate-limited requests
        initial_delay: Initial delay in seconds between requests

    Returns:
        DataFrame with trend data
    """
    print(f"\n📊 Analyzing Google Trends for {symptoms} in {states}")
    print(f"   Period: {days_before} days before {end_date.strftime('%Y-%m-%d')}")
    print("   ⚠️ Note: Google Trends has strict rate limits. This may take several minutes...")

    # Map full state names to Google Trends geo codes
    state_code_map = {
        'Alabama': 'US-AL', 'Alaska': 'US-AK', 'Arizona': 'US-AZ', 'Arkansas': 'US-AR',
        'California': 'US-CA', 'Colorado': 'US-CO', 'Connecticut': 'US-CT', 'Delaware': 'US-DE',
        'Florida': 'US-FL', 'Georgia': 'US-GA', 'Hawaii': 'US-HI', 'Idaho': 'US-ID',
        'Illinois': 'US-IL', 'Indiana': 'US-IN', 'Iowa': 'US-IA', 'Kansas': 'US-KS',
        'Kentucky': 'US-KY', 'Louisiana': 'US-LA', 'Maine': 'US-ME', 'Maryland': 'US-MD',
        'Massachusetts': 'US-MA', 'Michigan': 'US-MI', 'Minnesota': 'US-MN', 'Mississippi': 'US-MS',
        'Missouri': 'US-MO', 'Montana': 'US-MT', 'Nebraska': 'US-NE', 'Nevada': 'US-NV',
        'New Hampshire': 'US-NH', 'New Jersey': 'US-NJ', 'New Mexico': 'US-NM', 'New York': 'US-NY',
        'North Carolina': 'US-NC', 'North Dakota': 'US-ND', 'Ohio': 'US-OH', 'Oklahoma': 'US-OK',
        'Oregon': 'US-OR', 'Pennsylvania': 'US-PA', 'Rhode Island': 'US-RI', 'South Carolina': 'US-SC',
        'South Dakota': 'US-SD', 'Tennessee': 'US-TN', 'Texas': 'US-TX', 'Utah': 'US-UT',
        'Vermont': 'US-VT', 'Virginia': 'US-VA', 'Washington': 'US-WA', 'West Virginia': 'US-WV',
        'Wisconsin': 'US-WI', 'Wyoming': 'US-WY'
    }

    # Calculate the date range
    start_date = end_date - timedelta(days=days_before)
    timeframe = f"{start_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"

    # Initialize pytrends with (connect, read) timeouts
    pytrends = TrendReq(hl='en-US', tz=360, timeout=(10, 30))
    all_data = []

    for i, state in enumerate(states):
        # Convert to a geo code if needed
        state_code = state_code_map.get(state, state)
        if not state_code.startswith('US-'):
            state_code = f"US-{state_code}"

        # Retry logic with exponential backoff
        retry_count = 0
        delay = initial_delay
        success = False

        while retry_count < max_retries and not success:
            try:
                # Add a pre-request delay to avoid immediate rate limiting
                if retry_count == 0 and i > 0:
                    # Progressive delay between states, capped at 60 seconds
                    wait_time = min(initial_delay + (i * 10), 60)
                    print(f"   ⏳ Waiting {wait_time} seconds before fetching {state}...")
                    time.sleep(wait_time)

                print(f"   Fetching data for {state} ({state_code})...")

                # Build the payload for this state
                pytrends.build_payload(
                    symptoms,
                    cat=0,
                    timeframe=timeframe,
                    geo=state_code,
                    gprop=''
                )

                # Get interest over time
                interest_df = pytrends.interest_over_time()

                if not interest_df.empty:
                    # Remove the 'isPartial' column if present
                    if 'isPartial' in interest_df.columns:
                        interest_df = interest_df.drop('isPartial', axis=1)

                    # Add state information
                    interest_df['state'] = state
                    interest_df['state_code'] = state_code
                    interest_df.reset_index(inplace=True)
                    all_data.append(interest_df)
                    print(f"   ✅ Successfully fetched data for {state}")
                else:
                    print(f"   ⚠️ No data available for {state}")
                success = True
            except Exception as e:
                error_msg = str(e)
                # Check whether this is a rate-limit error (HTTP 429)
                if '429' in error_msg or 'rate' in error_msg.lower() or 'too many' in error_msg.lower():
                    retry_count += 1
                    if retry_count < max_retries:
                        # Exponential backoff for retries
                        retry_delay = delay * (2 ** retry_count)
                        print(f"   ⚠️ Rate limited for {state}. Waiting {retry_delay} seconds before retry {retry_count}/{max_retries}...")
                        time.sleep(retry_delay)
                    else:
                        print(f"   ❌ Max retries reached for {state}. Skipping...")
                        # Cool down even after max retries
                        print("   ⏳ Cooling down for 60 seconds...")
                        time.sleep(60)
                else:
                    # For other errors, log and move on to the next state
                    print(f"   ❌ Error fetching data for {state}: {e}")
                    break

    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"✅ Collected trend data for {len(all_data)} of {len(states)} states")
        return combined_df
    else:
        print("❌ No trend data collected")
        return pd.DataFrame()
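
# Illustrative call (a sketch, not executed here since real runs can take
# minutes under Google Trends rate limiting; the dates are examples):
#
#   df = analyze_google_trends(['fever', 'rash'], ['Texas', 'New Mexico'],
#                              end_date=datetime(2025, 3, 1), days_before=7)
#   # df holds one row per (date, state) with columns:
#   # date, fever, rash, state, state_code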
def calculate_trend_metrics(df: pd.DataFrame, symptoms: List[str]) -> Dict[str, Any]:
    """
    Calculate key metrics from trend data:
    - Average search volume
    - Rate of increase (net change divided by the number of samples)
    - Peak values
    """
    metrics = {}

    # Nothing to compute for an empty or malformed DataFrame
    if df.empty or 'state' not in df.columns:
        return metrics

    for state in df['state'].unique():
        state_data = df[df['state'] == state].copy()
        state_metrics = {'state': state}

        for symptom in symptoms:
            if symptom in state_data.columns:
                values = state_data[symptom].values

                # Basic summary statistics
                state_metrics[f'{symptom}_avg'] = values.mean()
                state_metrics[f'{symptom}_max'] = values.max()
                state_metrics[f'{symptom}_min'] = values.min()

                # Rate of increase (simple linear approximation)
                if len(values) > 1:
                    rate = (values[-1] - values[0]) / len(values)
                    state_metrics[f'{symptom}_rate'] = rate
                else:
                    state_metrics[f'{symptom}_rate'] = 0

        metrics[state] = state_metrics
    return metrics
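
# Worked example for the rate metric (synthetic values): for daily fever
# interest [10, 20, 40, 80], rate = (80 - 10) / 4 = 17.5, i.e. the net
# change divided by the number of samples, not a fitted regression slope.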
def compare_outbreak_vs_control(
    outbreak_states: List[str],
    control_states: List[str],
    symptoms: List[str],
    outbreak_date: datetime,
    days_before: int = 7
) -> Dict[str, Any]:
    """
    Compare symptom trends between outbreak and control states
    """
    print("\n🔬 Comparing outbreak vs control states")
    print(f"   Outbreak states: {outbreak_states}")
    print(f"   Control states: {control_states}")

    # Get trends for outbreak states with enhanced retry logic
    print("\n📈 Fetching data for OUTBREAK states...")
    print("   ⚠️ Google Trends has very strict rate limits. This process may take 5-10 minutes.")
    outbreak_trends = analyze_google_trends(
        symptoms,
        outbreak_states,
        outbreak_date,
        days_before,
        max_retries=5,
        initial_delay=30
    )

    # Get trends for control states with enhanced retry logic
    print("\n📈 Fetching data for CONTROL states...")
    control_trends = analyze_google_trends(
        symptoms,
        control_states,
        outbreak_date,
        days_before,
        max_retries=5,
        initial_delay=30
    )

    # Calculate metrics
    outbreak_metrics = calculate_trend_metrics(outbreak_trends, symptoms)
    control_metrics = calculate_trend_metrics(control_trends, symptoms)

    # Prepare the comparison
    comparison = {
        'outbreak_states': outbreak_states,
        'control_states': control_states,
        'outbreak_date': outbreak_date.strftime('%Y-%m-%d'),
        'analysis_period_days': days_before,
        'symptoms': symptoms,
        'outbreak_metrics': outbreak_metrics,
        'control_metrics': control_metrics,
        'outbreak_trends_data': outbreak_trends.to_dict('records') if not outbreak_trends.empty else [],
        'control_trends_data': control_trends.to_dict('records') if not control_trends.empty else []
    }
    return comparison
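
# Illustrative usage (a sketch; states and dates are examples):
#
#   comparison = compare_outbreak_vs_control(
#       outbreak_states=['Texas'],
#       control_states=['Oklahoma', 'Louisiana'],
#       symptoms=['fever', 'rash'],
#       outbreak_date=datetime(2025, 3, 1),
#       days_before=7,
#   )
#   print(json.dumps(comparison['outbreak_metrics'], indent=2, default=str))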
def analyze_disease_outbreak(disease: str, symptoms: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Complete pipeline for analyzing a disease outbreak
    """
    if symptoms is None:
        symptoms = ['fever', 'rash']

    print(f"\n{'='*80}")
    print(f"🦠 ANALYZING {disease.upper()} OUTBREAK")
    print(f"{'='*80}")

    # Step 1: Find outbreak data
    print(f"\n📡 Step 1: Searching for {disease} outbreak information...")
    outbreak_results = find_outbreak_data(disease)

    if not outbreak_results:
        print(f"❌ No outbreak data found for {disease}")
        return {'disease': disease, 'status': 'no_data_found'}

    # Step 2: Extract outbreak information
    print(f"\n📋 Step 2: Extracting outbreak details...")
    outbreaks = extract_outbreak_info(outbreak_results, disease)

    if not outbreaks:
        print(f"❌ Could not extract outbreak state information for {disease}")
        return {'disease': disease, 'status': 'no_states_identified'}

    # Display found outbreaks
    print(f"\n✅ Found {len(outbreaks)} potential outbreak reports:")
    for i, outbreak in enumerate(outbreaks[:5], 1):  # Show first 5
        print(f"\n  {i}. States: {outbreak['states']}")

    # Get unique outbreak states
    outbreak_states = list(set([state for outbreak in outbreaks for state in outbreak['states']]))
    print(f"\n🎯 Identified outbreak states: {outbreak_states}")

    # Step 3: Get control states (neighbors)
    control_states = []
    for state in outbreak_states[:3]:  # Use first 3 outbreak states
        neighbors = get_nearby_states(state)
        # Only add neighbors that are NOT in outbreak states
        control_states.extend([n for n in neighbors if n not in outbreak_states])

    control_states = list(set(control_states))[:5]  # Limit to 5 control states

    # If no control states were found, fall back to geographically diverse
    # states that are less likely to all have outbreaks
    if not control_states:
        all_states = ['Montana', 'North Dakota', 'Wyoming', 'Vermont', 'Maine', 'Alaska', 'Hawaii']
        control_states = [s for s in all_states if s not in outbreak_states][:5]

    print(f"🎯 Selected control states: {control_states}")

    # Step 4: Analyze trends
    # Use the current date minus a few days as a proxy for the
    # "outbreak announcement" date
    outbreak_date = datetime.now() - timedelta(days=7)

    print(f"\n📊 Step 4: Analyzing Google Trends data...")
    comparison = compare_outbreak_vs_control(
        outbreak_states[:5],  # Limit to 5 outbreak states
        control_states,
        symptoms,
        outbreak_date,
        days_before=7
    )

    # Add outbreak details to the comparison
    comparison['disease'] = disease
    comparison['outbreak_details'] = outbreaks[:5]
    return comparison
def generate_report(measles_results: Dict[str, Any], monkeypox_results: Dict[str, Any]) -> str:
    """
    Generate a comprehensive analysis report
    """
    report = []
    report.append("=" * 80)
    report.append("MEASLES VS MONKEYPOX: SYMPTOM SEARCH TREND ANALYSIS")
    report.append("=" * 80)
    report.append(f"\nReport Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\nAnalysis Period: 7 days prior to outbreak announcement")
    report.append("Symptoms Analyzed: FEVER and RASH")

    def append_metrics(results: Dict[str, Any], key: str) -> None:
        """Append per-state fever/rash metrics for one result set."""
        for state, metrics in results.get(key, {}).items():
            report.append(f"\n{state}:")
            report.append(f"  Fever - Avg: {metrics.get('fever_avg', 0):.2f}, Max: {metrics.get('fever_max', 0)}, Rate: {metrics.get('fever_rate', 0):.2f}")
            report.append(f"  Rash  - Avg: {metrics.get('rash_avg', 0):.2f}, Max: {metrics.get('rash_max', 0)}, Rate: {metrics.get('rash_rate', 0):.2f}")

    def append_disease_section(title: str, results: Dict[str, Any]) -> None:
        """Append one disease section (measles and monkeypox share this layout)."""
        report.append("\n" + "=" * 80)
        report.append(title)
        report.append("=" * 80)

        if results.get('status') in ['no_data_found', 'no_states_identified']:
            report.append(f"\n⚠️ Status: {results.get('status', 'unknown')}")
            return

        report.append(f"\nOutbreak States: {', '.join(results.get('outbreak_states', []))}")
        report.append(f"Control States: {', '.join(results.get('control_states', []))}")

        report.append("\n--- OUTBREAK STATES METRICS ---")
        append_metrics(results, 'outbreak_metrics')
        report.append("\n--- CONTROL STATES METRICS ---")
        append_metrics(results, 'control_metrics')

    append_disease_section("MEASLES OUTBREAK ANALYSIS", measles_results)
    append_disease_section("MONKEYPOX OUTBREAK ANALYSIS", monkeypox_results)

    # Comparative analysis
    report.append("\n\n" + "=" * 80)
    report.append("COMPARATIVE ANALYSIS")
    report.append("=" * 80)
    report.append("\nThis analysis compares symptom search trends (fever and rash) between:")
    report.append("1. States with reported outbreaks")
    report.append("2. Neighboring states without reported outbreaks")
    report.append("\nKey metrics:")
    report.append("- Average: Mean search interest (0-100 scale)")
    report.append("- Max: Peak search interest")
    report.append("- Rate: Daily rate of change in search interest")
    report.append("\nNote: Higher rates and averages in outbreak states may indicate")
    report.append("increased public concern or symptom prevalence prior to official announcements.")
    report.append("\n" + "=" * 80)

    return "\n".join(report)
def main():
    """
    Main execution function
    """
    print("\n" + "=" * 80)
    print("OUTBREAK SYMPTOM TREND ANALYSIS")
    print("Analyzing fever and rash search trends for measles and monkeypox outbreaks")
    print("=" * 80)

    # Symptoms to analyze
    symptoms = ['fever', 'rash']

    # Analyze measles
    print("\n\n🔴 PART 1: MEASLES ANALYSIS")
    measles_results = analyze_disease_outbreak('measles', symptoms)

    # Save intermediate results
    with open('measles_results.json', 'w') as f:
        json.dump(measles_results, f, indent=2, default=str)
    print("\n💾 Measles results saved to measles_results.json")

    # Analyze monkeypox
    print("\n\n🔴 PART 2: MONKEYPOX ANALYSIS")
    monkeypox_results = analyze_disease_outbreak('monkeypox', symptoms)

    # Save intermediate results
    with open('monkeypox_results.json', 'w') as f:
        json.dump(monkeypox_results, f, indent=2, default=str)
    print("\n💾 Monkeypox results saved to monkeypox_results.json")

    # Generate the comprehensive report
    print("\n\n📝 GENERATING FINAL REPORT...")
    report = generate_report(measles_results, monkeypox_results)

    # Save the report
    with open('outbreak_analysis_report.txt', 'w') as f:
        f.write(report)
    print(report)
    print("\n\n💾 Full report saved to outbreak_analysis_report.txt")

    # Save trend data to CSV if available
    if measles_results.get('outbreak_trends_data'):
        measles_df = pd.DataFrame(measles_results['outbreak_trends_data'])
        measles_df.to_csv('measles_trends.csv', index=False)
        print("📊 Measles trend data saved to measles_trends.csv")

    if monkeypox_results.get('outbreak_trends_data'):
        monkeypox_df = pd.DataFrame(monkeypox_results['outbreak_trends_data'])
        monkeypox_df.to_csv('monkeypox_trends.csv', index=False)
        print("📊 Monkeypox trend data saved to monkeypox_trends.csv")

    print("\n✅ Analysis complete!")


if __name__ == "__main__":
    main()