-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_simple_datasets.py
More file actions
73 lines (59 loc) · 2.47 KB
/
build_simple_datasets.py
File metadata and controls
73 lines (59 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import sys
from datasets import load_dataset
def clean_columns(dataset):
    """Remove every column that is not one of the four shared fields.

    The fields that survive (when present) are ``canonical_solution``,
    ``test``, ``instance_id`` and ``prompt``.

    Args:
        dataset: Object exposing ``column_names`` and ``remove_columns``.

    Returns:
        The result of ``dataset.remove_columns`` called with the unwanted
        column names, in their original order.
    """
    kept = {"canonical_solution", "test", "instance_id", "prompt"}
    return dataset.remove_columns(
        [name for name in dataset.column_names if name not in kept]
    )
def convert_mbpp_tests(assert_list):
    """Wrap each assert statement in its own numbered test function.

    Args:
        assert_list: Iterable of assert statements, one string per test.

    Returns:
        Source text holding one ``test<N>`` function per entry (numbered
        from 1), separated by blank lines. An empty input gives ``""``.
    """
    return "\n\n".join(
        f"def test{index}():\n {statement}"
        for index, statement in enumerate(assert_list, 1)
    )
def convert_humaneval_tests(test_code, entrypoint):
    """Split a test script into one test function per ``assert`` line.

    Every line whose first non-blank text starts with ``assert`` becomes the
    body of its own ``test<N>`` function (numbered from 1). The output starts
    with ``candidate = <entrypoint>`` so that asserts written against
    ``candidate(...)`` resolve to the function under test.

    Only single-line asserts are supported: every other line of
    ``test_code`` (helper definitions, loops, continuation lines of a
    multi-line assert) is dropped.

    Args:
        test_code: Source text of the original test script.
        entrypoint: Name of the function under test.

    Returns:
        The generated source: the ``candidate`` alias followed by the test
        functions, separated by blank lines. When no assert line is found
        only the alias line is returned.
    """
    # Strip each line completely so the generated body does not depend on
    # how the assert happened to be indented in the source (none, tabs,
    # nested blocks) and so stray "\r" from CRLF input is discarded.
    statements = [
        stripped
        for stripped in (line.strip() for line in test_code.split("\n"))
        if stripped.startswith("assert")
    ]

    pieces = [f"candidate = {entrypoint}"]
    pieces.extend(
        f"def test{index}():\n    {statement}"
        for index, statement in enumerate(statements, 1)
    )
    return "\n\n".join(pieces)
def convert_humaneval():
    """Reshape ``openai/openai_humaneval`` and push it to the hub.

    For every split: ``task_id`` is renamed to ``instance_id``, the ``test``
    column is replaced by the per-assert test functions produced by
    ``convert_humaneval_tests`` (using each row's ``entry_point``), and the
    remaining columns are pruned with ``clean_columns``. The result is
    pushed with ``push_to_hub`` under ``commit0/openai_humaneval``.
    """
    dataset_dict = load_dataset("openai/openai_humaneval")
    for split_name in dataset_dict:
        split_data = dataset_dict[split_name].rename_column('task_id', 'instance_id')
        # Build the replacement column before dropping the original 'test'.
        rewritten_tests = [
            convert_humaneval_tests(row['test'], row['entry_point'])
            for row in split_data
        ]
        split_data = split_data.remove_columns(['test'])
        split_data = split_data.add_column(name='test', column=rewritten_tests)
        dataset_dict[split_name] = clean_columns(split_data)
    dataset_dict.push_to_hub("commit0/openai_humaneval")
def convert_codecontests():
    """Placeholder for the CodeContests conversion; currently does nothing."""
def convert_bigcodebench():
    """Placeholder for the BigCodeBench conversion; currently does nothing."""
def convert_mbpp():
    """Reshape ``google-research-datasets/mbpp`` and push it to the hub.

    For every split: ``task_id``, ``code`` and ``text`` are renamed to
    ``instance_id``, ``canonical_solution`` and ``prompt``; a ``test``
    column is added by running ``convert_mbpp_tests`` over each entry of
    ``test_list``; and the remaining columns are pruned with
    ``clean_columns``. The result is pushed with ``push_to_hub`` under
    ``commit0/mbpp``.
    """
    column_renames = (
        ('task_id', 'instance_id'),
        ('code', 'canonical_solution'),
        ('text', 'prompt'),
    )
    dataset_dict = load_dataset("google-research-datasets/mbpp")
    for split_name in dataset_dict:
        split_data = dataset_dict[split_name]
        for old_name, new_name in column_renames:
            split_data = split_data.rename_column(old_name, new_name)
        generated_tests = [
            convert_mbpp_tests(asserts) for asserts in split_data['test_list']
        ]
        split_data = split_data.add_column(name='test', column=generated_tests)
        dataset_dict[split_name] = clean_columns(split_data)
    dataset_dict.push_to_hub("commit0/mbpp")
if __name__ == "__main__":
    # Command-line entry point: the first argument names the benchmark to
    # convert (matched case-insensitively) and selects the converter to run.
    converters = {
        "mbpp": convert_mbpp,
        "humaneval": convert_humaneval,
        "codecontests": convert_codecontests,
        "bigcodebench": convert_bigcodebench,
    }
    choices = "|".join(converters)

    if len(sys.argv) < 2:
        # A missing argument used to surface as a bare IndexError traceback;
        # exit with a usage line instead (sys.exit prints it to stderr).
        sys.exit(f"usage: {sys.argv[0]} <{choices}>")

    data = sys.argv[1].lower()
    if data not in converters:
        # Same exception type as before, now naming the rejected value.
        raise NotImplementedError(
            f"unknown dataset {sys.argv[1]!r}; expected one of: {choices}"
        )
    converters[data]()