run_lighteval_evaluation.py

#!/usr/bin/env python3
"""
LightEval evaluation script for Adaptive CoT Framework.
This script uses LightEval for robust evaluation with proper few-shot handling,
regex patterns, and metrics. It integrates with the existing Adaptive CoT framework
while leveraging LightEval's evaluation infrastructure.
"""
import argparse
import json
import sys
from pathlib import Path

# Make the local src/ package importable before the framework imports below.
sys.path.append(str(Path(__file__).parent / "src"))

from models.model_factory import ModelFactory
from adaptive.adaptive_cot import AdaptiveCoT
from evaluation.lighteval_integration import LightEvalIntegration


def main():
    parser = argparse.ArgumentParser(description="Run LightEval evaluation with Adaptive CoT")

    # Model configuration
    parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
    parser.add_argument("--model-type", type=str, default="deepseek", choices=["deepseek", "generic"], help="Model type")
    parser.add_argument("--gpu-id", type=int, default=0, help="GPU ID to use")

    # Evaluation configuration
    parser.add_argument("--tasks", nargs="+", default=["gsm8k"], help="Tasks to evaluate on")
    parser.add_argument("--num-fewshot", type=int, default=8, help="Number of few-shot examples")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of samples per task")
    parser.add_argument("--output-dir", type=str, default="lighteval_results", help="Output directory")

    # Adaptive CoT configuration
    parser.add_argument("--min-branches", type=int, default=3, help="Minimum branches for adaptive branching")
    parser.add_argument("--max-branches", type=int, default=8, help="Maximum branches for adaptive branching")
    parser.add_argument("--default-branches", type=int, default=5, help="Branch count for static branching")
    parser.add_argument("--adaptive", action="store_true", help="Use adaptive branching (default: static)")

    # Generation parameters
    parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature")
    parser.add_argument("--top-p", type=float, default=0.95, help="Top-p sampling")
    parser.add_argument("--max-tokens", type=int, default=512, help="Maximum tokens to generate")

    args = parser.parse_args()
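
    # Added guard (an assumption, not part of the original CLI): the branch
    # range must be well-formed and contain the static default.
    if args.min_branches > args.max_branches:
        parser.error("--min-branches must be <= --max-branches")
    if not (args.min_branches <= args.default_branches <= args.max_branches):
        parser.error("--default-branches must lie in [--min-branches, --max-branches]")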
print("π¬ LightEval Evaluation with Adaptive CoT")
print("=" * 50)
print(f"Model: {args.model_path}")
print(f"Tasks: {args.tasks}")
print(f"Few-shot: {args.num_fewshot}")
print(f"Branches: {'Adaptive' if args.adaptive else 'Static'} ({args.min_branches}-{args.max_branches})")
print(f"Generation: temp={args.temperature}, top_p={args.top_p}")
print("=" * 50)

    # Create output directory (parents=True handles nested paths)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Load model
        print(f"🔧 Loading model: {args.model_path}")
        model = ModelFactory.create_model(
            model_type=args.model_type,
            model_name=args.model_path,
            config={"gpu_id": args.gpu_id},
        )
        model.load_model()
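        # Note: construction and load_model() are separate steps; presumably
        # this lets configuration errors surface before weights hit the GPU.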

        # Create the Adaptive CoT configuration
        adaptive_config = {
            "adaptive_branching": args.adaptive,
            "min_branches": args.min_branches,
            "max_branches": args.max_branches,
            "default_branches": args.default_branches,
            "num_fewshot": args.num_fewshot,
            "entropy_threshold": 0.8,
            "kl_threshold": 0.5,
            "confidence_threshold": 0.7,
            "temperature": args.temperature,
            "top_p": args.top_p,
            "max_tokens": args.max_tokens,
        }
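        # The entropy/KL/confidence thresholds above are hard-coded defaults;
        # they presumably govern when AdaptiveCoT widens the branch count, but
        # the exact semantics live in adaptive/adaptive_cot.py.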

        # Create the Adaptive CoT instance
        adaptive_cot = AdaptiveCoT(model, adaptive_config)

        # Create the LightEval integration
        lighteval_config = {
            "model_name": f"adaptive_cot_{args.model_type}",
            "max_parallel_branches": args.max_branches,
        }
        lighteval_integration = LightEvalIntegration(adaptive_cot, lighteval_config)

        # Run evaluation
        print("🚀 Starting LightEval evaluation...")
        results = lighteval_integration.evaluate_math_tasks(
            tasks=args.tasks,
            num_fewshot_examples=args.num_fewshot,
            limit=args.limit,
        )

        # Save results (name the file after every task, not just the first)
        results_file = output_dir / f"lighteval_results_{args.model_type}_{'_'.join(args.tasks)}.json"
        with open(results_file, "w") as f:
            json.dump(results, f, indent=2, default=str)
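        # default=str in the dump above stringifies anything that is not
        # natively JSON-serializable (e.g. Path objects) instead of raising.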
print(f"β
Evaluation completed!")
print(f"π Results saved to: {results_file}")
        # Print summary
        if isinstance(results, dict):
            for task_name, task_results in results.items():
                if isinstance(task_results, dict) and "results" in task_results:
                    accuracy = task_results["results"].get("acc", 0.0)
                    print(f"📊 {task_name}: {accuracy:.3f} accuracy")

    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()