-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_input_path.py
More file actions
37 lines (24 loc) · 1.11 KB
/
generate_input_path.py
File metadata and controls
37 lines (24 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import random
def list_random_files(input_dir, output_file, num_files, extensions=None):
    """Write a random selection of file paths found under a directory.

    Walks ``input_dir`` recursively with ``os.walk``, keeps the files whose
    names end with one of ``extensions`` (compared case-insensitively), and
    writes at most ``num_files`` of the resulting paths to ``output_file``,
    one path per line. Every written path is also printed to stdout.

    Args:
        input_dir: Root directory to search.
        output_file: Path of the text file to create or overwrite (UTF-8).
        num_files: Maximum number of paths to write. When more candidates
            are found, ``random.sample`` picks ``num_files`` of them;
            otherwise every candidate is written.
        extensions: Optional iterable of filename suffixes such as
            ``['.txt']``; a single string is treated as one suffix. A falsy
            value (``None`` or an empty list) disables filtering.

    Raises:
        ValueError: If ``num_files`` is negative.
    """
    if num_files < 0:
        # random.sample would raise ValueError for a negative count anyway;
        # fail before walking the tree and with a clearer message.
        raise ValueError(f"num_files must be non-negative, got {num_files}")

    # Normalise the suffixes once: file names are lowercased before the
    # comparison, so the suffixes must be lowercased too or an entry such as
    # '.TXT' could never match. A tuple lets str.endswith test them all in
    # one call and also survives one-shot iterables.
    suffixes = None
    if extensions:
        if isinstance(extensions, str):
            extensions = (extensions,)
        suffixes = tuple(ext.lower() for ext in extensions)

    all_files = []
    for root, _dirs, names in os.walk(input_dir):
        for name in names:
            if suffixes is not None and not name.lower().endswith(suffixes):
                continue
            all_files.append(os.path.join(root, name))

    # Sample only when there are more candidates than requested.
    if len(all_files) > num_files:
        selected_files = random.sample(all_files, num_files)
    else:
        selected_files = all_files

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for file_path in selected_files:
            print(file_path)
            f_out.write(f"{file_path}\n")
# Script configuration: fill in the two paths below before running.
input_directory = ''  # directory to scan, e.g. ./files/test
data_file = ''  # output .txt file; it is the input of pretrain_byteT5 and pretrain_byteBERT
file_extensions = ['.txt']
number_of_files = 12000  # total number of files, e.g. train: 40000, test: 12000, validation: 8000

if __name__ == "__main__":
    # Run only when executed as a script, so importing this module does not
    # walk a directory or write a file as a side effect.
    if not input_directory or not data_file:
        # Both paths ship as empty placeholders; stop with a clear message
        # instead of failing later when the empty output path is opened.
        raise SystemExit(
            "Set input_directory and data_file in this script before running it."
        )
    list_random_files(input_directory, data_file, number_of_files, extensions=file_extensions)