-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathsplitTrainTest.py
More file actions
40 lines (31 loc) · 1.19 KB
/
splitTrainTest.py
File metadata and controls
40 lines (31 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import random
import sys
#WARNING: if you use this to split into train and test and use process_data.py to preprocess,
#the label numbers may get mixed up since the order the labels appear in the test set
#is probably different from the order they appear in the training set
# Fraction of the input lines that goes to the training file; the rest is test.
TRAIN_PROPORTION = 0.8

# Command-line spellings that switch shuffling off. The empty string is kept
# here because it was the only falsy value under the old raw-string check.
_FALSE_TOKENS = frozenset(["", "0", "false", "f", "no", "n", "off"])


def _parse_flag(text):
    """Return False for a recognised "off" spelling of *text*, True otherwise.

    Unrecognised values stay truthy so that invocations which previously
    enabled shuffling keep doing so.
    """
    return text.strip().lower() not in _FALSE_TOKENS


def split_train_test(file_path, randomize=True, train_proportion=TRAIN_PROPORTION):
    """Split the lines of *file_path* into a training file and a test file.

    The first ``int(train_proportion * number_of_lines)`` lines (after an
    optional shuffle) are written to ``<base>_train.txt`` and the remaining
    lines to ``<base>_test.txt``, where ``<base>`` is *file_path* without its
    final extension.

    Args:
        file_path: path of the text file to split, one datum per line.
        randomize: when true, shuffle the lines before splitting.
        train_proportion: fraction of the lines assigned to the training file.

    Returns:
        A ``(train_path, test_path)`` tuple naming the files that were written.
    """
    # The context manager closes the input file, which was previously leaked.
    with open(file_path, "r") as data_file:
        data = data_file.readlines()

    num_train = int(train_proportion * len(data))

    if randomize:
        # A final line without a newline would be glued to whichever line
        # follows it after shuffling, so terminate it before reordering.
        if data and not data[-1].endswith("\n"):
            data[-1] += "\n"
        random.shuffle(data)

    # splitext strips only the last extension of the file name, so dots in
    # directory names (or a leading "./") no longer truncate the base path.
    base_name = os.path.splitext(file_path)[0]
    train_path = base_name + "_train.txt"
    test_path = base_name + "_test.txt"

    with open(train_path, "w") as train_file:
        train_file.writelines(data[:num_train])
    with open(test_path, "w") as test_file:
        test_file.writelines(data[num_train:])

    return train_path, test_path


def main(argv=None):
    """Run the split from a command line: ``<script> <data_file> [randomize]``.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        The ``(train_path, test_path)`` tuple from ``split_train_test``.

    Raises:
        SystemExit: when the data file argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: splitTrainTest.py <data_file> [randomize]")

    randomize = True  # by default we randomly shuffle the data
    if len(argv) >= 3:
        # The argument arrives as a string, so it has to be interpreted:
        # a bare truthiness test would treat "False" and "0" as true.
        randomize = _parse_flag(argv[2])

    return split_train_test(argv[1], randomize)


if __name__ == "__main__":
    main()