@@ -118,40 +118,34 @@ def check_valid(results):
118118
119119
def split_gen():
    """Rebuild ``sanitized_calibrated_samples/`` from the files in ``results/``.

    The output tree is deleted and recreated on every call, laid out as
    ``sanitized_calibrated_samples/{hard,full}/{complete,instruct}/``.

    For every entry of the module-level ``model_info`` mapping, files matching
    ``results/<model>--bigcodebench-*.jsonl`` (with ``/`` in the model key
    replaced by ``--``) are considered.  A file is copied only when:

    * its path contains both ``-sanitized`` and ``calibrated``, and
    * the model's ``info["prompted"]`` is truthy.

    The destination split is ``hard`` when the path contains ``-hard-`` and
    ``full`` otherwise; the task is ``complete`` when the part of the file
    name after ``--bigcodebench-`` starts with ``complete`` or
    ``hard-complete``, and ``instruct`` otherwise.  When ``info["link"]``
    points at ``https://huggingface.co/``, the output file name uses the
    repository path from that link instead of the ``model_info`` key.

    Returns:
        None.  The function is called for its filesystem side effects.
    """
    out_root = "sanitized_calibrated_samples"
    shutil.rmtree(out_root, ignore_errors=True)
    for split_type in ("hard", "full"):
        for task in ("complete", "instruct"):
            os.makedirs(f"{out_root}/{split_type}/{task}", exist_ok=True)

    for model, info in model_info.items():
        model = model.replace("/", "--")
        # Glob with the key-derived name *before* it is possibly replaced by
        # the Hugging Face repository name used for the output files.
        files = glob(f"results/{model}--bigcodebench-*.jsonl")
        if info["link"].startswith("https://huggingface.co/"):
            model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")

        for file in files:
            if "-sanitized" not in file or "calibrated" not in file:
                continue

            _, suffix = os.path.basename(file).split("--bigcodebench-")

            # Nothing is written for non-prompted models, so skip them before
            # touching the file contents.
            if not info["prompted"]:
                continue

            split_type = "hard" if "-hard-" in file else "full"
            if suffix.startswith(("complete", "hard-complete")):
                task = "complete"
            else:
                task = "instruct"

            # Byte-exact copy: no need to decode the samples or hold them in
            # memory just to write them back out unchanged.
            shutil.copyfile(
                file,
                f"{out_root}/{split_type}/{task}/{model}--bigcodebench-{suffix}",
            )
155149
156150def read_task_perf (tids , task = "complete" ):
157151 model_results = dict ()
@@ -302,7 +296,7 @@ def get_perf_df(data_dict):
302296
303297
304298if __name__ == "__main__" :
305- # split_gen()
299+ split_gen ()
306300 bcb_orig = load_dataset ("bigcode/bigcodebench" , split = "v0.1.1" )
307301 bcb_hard = load_dataset ("bigcode/bigcodebench-hard" , split = "v0.1.1" )
308302 bcb_config = {
0 commit comments