-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_jsonl.py
More file actions
29 lines (22 loc) · 1.16 KB
/
convert_jsonl.py
File metadata and controls
29 lines (22 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from datasets import load_from_disk
import jsonlines
from tqdm import tqdm
# Load the saved dataset from disk
train_dataset = load_from_disk("data/train_dataset")

# Split the dataset into train and validation sets with a 7:3 ratio.
# The split is positional: the first 70% of rows (rounded down) go to train
# and the remaining rows go to validation. No shuffling is performed here.
train_size = int(0.7 * len(train_dataset))
train_split = train_dataset.select(range(train_size))
valid_split = train_dataset.select(range(train_size, len(train_dataset)))

# Define output files for train and validation sets
train_output_file = "data/train.jsonl"
valid_output_file = "data/valid.jsonl"


def _write_jsonl(split, output_file, description):
    """Write every row of ``split`` to ``output_file``, one JSON object per line.

    Each row's ``"query"`` value is written under the key ``"input"`` and its
    ``"response"`` value under the key ``"output"``.

    Args:
        split: Iterable of rows; each row must support lookup of the
            ``"query"`` and ``"response"`` keys.
        output_file: Path of the JSONL file, opened in write mode.
        description: Label shown on the tqdm progress bar.
    """
    with jsonlines.open(output_file, mode="w") as writer:
        for item in tqdm(split, desc=description):
            writer.write({"input": item["query"], "output": item["response"]})


# Save the train split to train.jsonl
_write_jsonl(train_split, train_output_file, "Converting train split to JSONL")

# Save the validation split to valid.jsonl
_write_jsonl(
    valid_split, valid_output_file, "Converting validation split to JSONL"
)

print(f"Train dataset successfully converted to {train_output_file}")
print(f"Validation dataset successfully converted to {valid_output_file}")