1+ import os
2+ import re
3+ import argparse
4+ from tqdm import tqdm
5+
6+ from transformers import pipeline , set_seed
7+ from transformers import AutoTokenizer , AutoModelForCausalLM
8+ from transformers .pipelines .base import Pipeline
9+
10+ from human_eval .data import write_jsonl , read_problems
11+
def load_generation_pipe(model_name_or_path: str, gpu_device: int = 0):
    """Build a HuggingFace text-generation pipeline for a causal LM.

    Loads the model and tokenizer from *model_name_or_path* (a local
    checkpoint directory or a hub id) and places the pipeline on the CUDA
    device *gpu_device*.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

    generation_pipe = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        device=gpu_device,
    )

    # Log a quick summary so runs are easy to audit from stdout.
    print(
        f"load generation pipeline from {model_name_or_path} over, "
        f"vocab size = {len(tokenizer)}, eos id = {tokenizer.eos_token_id}, "
        f"gpu device = {gpu_device}."
    )

    return generation_pipe
28+
def extract_function_block(string):
    """Truncate a generated completion to the first function body.

    Sampled text often runs past the completed function into a new
    top-level construct (``class``/``def``/comment/decorator/``print``/
    ``if``); keep only the text before the first such marker and strip
    trailing whitespace.
    """
    # BUG FIX: the pattern previously contained spurious spaces after each
    # "\n" (e.g. "\n class"), so it never matched real continuations such
    # as "\ndef". Use the canonical marker set, as a raw string so the
    # newline escapes are handled by the regex engine.
    return re.split(r"\nclass|\ndef|\n#|\n@|\nprint|\nif", string)[0].rstrip()
31+
def run_code_generation(pipe, prompt, num_completions=1, **gen_kwargs):
    """Sample *num_completions* completions for *prompt* and truncate each
    one to its first function block."""
    # Fixed seed so repeated runs produce identical samples.
    set_seed(123)

    outputs = pipe(
        prompt,
        num_return_sequences=num_completions,
        **gen_kwargs,
    )

    completions = []
    for output in outputs:
        # Drop the echoed prompt; keep only the newly generated text.
        generated = output["generated_text"][len(prompt):]
        completions.append(extract_function_block(generated))
    return completions
41+
def evaluate_on_human_eval(
    model_name_or_path: str,
    temperature: float,
    top_p: float,
    num_samples_per_task: int,
    max_new_tokens: int,
    gpu_device: int,
    output_dir: str,
) -> str:
    """Sample completions for every HumanEval problem and save them as JSONL.

    Args:
        model_name_or_path: Local checkpoint path or hub id of a causal LM.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        num_samples_per_task: Completions to draw per HumanEval problem.
        max_new_tokens: Generation length limit per sample.
        gpu_device: CUDA device index for the pipeline.
        output_dir: Directory for the output file; when None, falls back to
            the model directory if *model_name_or_path* is a local path.

    Returns:
        Path of the written ``<eval_name>.samples.jsonl`` file.

    Raises:
        ValueError: If *output_dir* is None and the model is not local.
    """
    pipe: Pipeline = load_generation_pipe(model_name_or_path, gpu_device=gpu_device)
    eval_name = f"human_eval.t{temperature}.p{top_p}.l{max_new_tokens}.n{num_samples_per_task}"

    if output_dir is None:
        if os.path.exists(model_name_or_path):
            output_dir = model_name_or_path
        else:
            # BUG FIX: corrected the grammar of this error message
            # ("evaluation" -> "evaluating").
            raise ValueError("Output dir can't be null if you are not evaluating a local model.")

    os.makedirs(output_dir, exist_ok=True)
    saved_path = os.path.join(output_dir, f"{eval_name}.samples.jsonl")

    gen_kwargs = {
        "do_sample": True,
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 0,  # disable top-k so only nucleus (top-p) sampling applies
        # BUG FIX: a pad_token_id of 0 is a valid id but is falsy; test for
        # None instead of truthiness before falling back to eos.
        "pad_token_id": pipe.tokenizer.pad_token_id
        if pipe.tokenizer.pad_token_id is not None
        else pipe.tokenizer.eos_token_id,
        "eos_token_id": pipe.tokenizer.eos_token_id,
    }

    problems = read_problems()
    samples = []
    generate_batch_size = min(50, num_samples_per_task)

    # Some tokenizers define no bos token; fall back to eos as the prefix.
    bos_token = pipe.tokenizer.bos_token if pipe.tokenizer.bos_token else pipe.tokenizer.eos_token

    for task_id in tqdm(problems):
        # Strip operation is important as new tokenizer will not treat '\n'
        # as an independent token.
        prompt = problems[task_id]["prompt"].strip()
        input_prompt = bos_token + prompt

        # BUG FIX: the previous floor division dropped the remainder, so
        # fewer than num_samples_per_task samples were produced when the
        # count was not a multiple of the batch size. Generate until the
        # exact requested count is reached.
        remaining = num_samples_per_task
        while remaining > 0:
            batch = min(generate_batch_size, remaining)
            gen_results = run_code_generation(pipe, input_prompt, num_completions=batch, **gen_kwargs)
            for gen_result in gen_results:
                samples.append(dict(task_id=task_id, completion=gen_result))
            remaining -= batch

    write_jsonl(saved_path, samples)
    print("Run generation over, save {} samples to {}.".format(len(samples), saved_path))
    # BUG FIX: the signature declares `-> str` but the function returned
    # None; return the output path so callers can locate the samples.
    return saved_path
92+
if __name__ == '__main__':
    # CLI entry point: parse sampling hyper-parameters and run generation.
    parser = argparse.ArgumentParser(description='Run evaluation for code generation model on human-eval.')

    parser.add_argument('-model', '--model_name_or_path', type=str, required=True)
    parser.add_argument('-o', '--output_dir', type=str, default=None)
    parser.add_argument('-n', '--num_completions', type=int, default=100)
    parser.add_argument('-t', '--temperature', type=float, default=0.2)
    parser.add_argument('-p', '--top_p', type=float, default=0.95)
    parser.add_argument('-l', '--max_new_tokens', type=int, default=100)
    parser.add_argument('-gpu', '--gpu_device', type=int, default=0)

    args = parser.parse_args()

    evaluate_on_human_eval(
        model_name_or_path=args.model_name_or_path,
        temperature=args.temperature,
        top_p=args.top_p,
        num_samples_per_task=args.num_completions,
        max_new_tokens=args.max_new_tokens,
        gpu_device=args.gpu_device,
        output_dir=args.output_dir,
    )
    # BUG FIX (minor): removed a dead trailing `pass` statement.