adaptive_cot_framework/run_lighteval_evaluation.py at main · macto94/adaptive_cot_framework · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""
LightEval evaluation script for Adaptive CoT Framework.

This script uses LightEval for robust evaluation with proper few-shot handling,
regex patterns, and metrics. It integrates with the existing Adaptive CoT framework
while leveraging LightEval's evaluation infrastructure.
"""

import argparse
import sys
import os
import json
from pathlib import Path
from typing import Dict, List, Any, Optional

# Add src to path
sys.path.append(str(Path(__file__).parent / "src"))

from models.model_factory import ModelFactory
from adaptive.adaptive_cot import AdaptiveCoT
from evaluation.lighteval_integration import LightEvalIntegration


def main():
    parser = argparse.ArgumentParser(description="Run LightEval evaluation with Adaptive CoT")

    # Model configuration
    parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
    parser.add_argument("--model-type", type=str, default="deepseek", choices=["deepseek", "generic"], help="Model type")
    parser.add_argument("--gpu-id", type=int, default=0, help="GPU ID to use")

    # Evaluation configuration
    parser.add_argument("--tasks", nargs="+", default=["gsm8k"], help="Tasks to evaluate on")
    parser.add_argument("--num-fewshot", type=int, default=8, help="Number of few-shot examples")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of samples per task")
    parser.add_argument("--output-dir", type=str, default="lighteval_results", help="Output directory")

    # Adaptive CoT configuration
    parser.add_argument("--min-branches", type=int, default=3, help="Minimum branches for adaptive")
    parser.add_argument("--max-branches", type=int, default=8, help="Maximum branches for adaptive")
    parser.add_argument("--default-branches", type=int, default=5, help="Default branches for static")
    parser.add_argument("--adaptive", action="store_true", help="Use adaptive branching (default: static)")

    # Generation parameters
    parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature")
    parser.add_argument("--top-p", type=float, default=0.95, help="Top-p sampling")
    parser.add_argument("--max-tokens", type=int, default=512, help="Maximum tokens to generate")

    args = parser.parse_args()

    print("🔬 LightEval Evaluation with Adaptive CoT")
    print("=" * 50)
    print(f"Model: {args.model_path}")
    print(f"Tasks: {args.tasks}")
    print(f"Few-shot: {args.num_fewshot}")
    print(f"Branches: {'Adaptive' if args.adaptive else 'Static'} ({args.min_branches}-{args.max_branches})")
    print(f"Generation: temp={args.temperature}, top_p={args.top_p}")
    print("=" * 50)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)

    try:
        # Load model
        print(f"🔧 Loading model: {args.model_path}")
        model = ModelFactory.create_model(
            model_type=args.model_type,
            model_name=args.model_path,
            config={"gpu_id": args.gpu_id}
        )
        model.load_model()

        # Create Adaptive CoT configuration
        adaptive_config = {
            "adaptive_branching": args.adaptive,
            "min_branches": args.min_branches,
            "max_branches": args.max_branches,
            "default_branches": args.default_branches,
            "num_fewshot": args.num_fewshot,
            "entropy_threshold": 0.8,
            "kl_threshold": 0.5,
            "confidence_threshold": 0.7,
            "temperature": args.temperature,
            "top_p": args.top_p,
            "max_tokens": args.max_tokens,
        }

        # Create Adaptive CoT instance
        adaptive_cot = AdaptiveCoT(model, adaptive_config)

        # Create LightEval integration
        lighteval_config = {
            "model_name": f"adaptive_cot_{args.model_type}",
            "max_parallel_branches": args.max_branches,
        }

        lighteval_integration = LightEvalIntegration(adaptive_cot, lighteval_config)

        # Run evaluation
        print(f"🚀 Starting LightEval evaluation...")
        results = lighteval_integration.evaluate_math_tasks(
            tasks=args.tasks,
            num_fewshot_examples=args.num_fewshot,
            limit=args.limit
        )

        # Save results
        results_file = output_dir / f"lighteval_results_{args.model_type}_{args.tasks[0]}.json"
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        print(f"✅ Evaluation completed!")
        print(f"📊 Results saved to: {results_file}")

        # Print summary
        if isinstance(results, dict):
            for task_name, task_results in results.items():
                if isinstance(task_results, dict) and "results" in task_results:
                    accuracy = task_results["results"].get("acc", 0.0)
                    print(f"📈 {task_name}: {accuracy:.3f} accuracy")

    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()