-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_prep.py
More file actions
39 lines (31 loc) · 1.16 KB
/
data_prep.py
File metadata and controls
39 lines (31 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from datasets import load_dataset
from utils import save_data
import re
# Compiled once at import time so the per-line loop does not repeat the lookup.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_lines(lines: str) -> str:
    """Normalize the whitespace of a multi-line text.

    The text is split into lines with ``str.splitlines``. Within each line,
    every run of whitespace (spaces, tabs, ...) is collapsed into a single
    space and leading/trailing whitespace is stripped. Lines that end up
    empty are dropped.

    Args:
        lines: The raw text, possibly spanning several lines.

    Returns:
        The remaining cleaned lines joined with ``"\\n"`` (no trailing
        newline); an empty string when no line has any content.
    """
    normalized = (
        _WHITESPACE_RUN.sub(" ", line).strip() for line in lines.splitlines()
    )
    # Blank and whitespace-only lines normalize to "" and are filtered out here.
    return "\n".join(line for line in normalized if line)
if __name__ == "__main__":
    # Output file per dataset split, listed in the order the files are written.
    destinations = {
        "train": "data/wikitext2_train.txt",
        "test": "data/wikitext2_test.txt",
        "validation": "data/wikitext2_valid.txt",
    }

    # Load WikiText-2 (raw); it exposes 'train', 'validation' and 'test' splits.
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

    # Merge each split's text rows into one string (rows separated by a blank
    # line) and clean it. Every split is processed before anything is saved.
    cleaned = {
        split: clean_lines("\n\n".join(dataset[split]["text"]))
        for split in ("train", "validation", "test")
    }

    # Write the cleaned train, test and validation texts to their files.
    for split, destination in destinations.items():
        save_data(out_src=destination, text=cleaned[split])