-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_train_test.py
More file actions
58 lines (51 loc) · 2.13 KB
/
split_train_test.py
File metadata and controls
58 lines (51 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import csv
import numpy as np
def split_data(ifname, delimiter=None, has_header=False, columns=None, seed=42,
               fraction=0.15, num_rows=1599):
    """
    Imports a tab/comma/semi-colon/... separated data file and splits its rows
    into a training set and a test set.

    The split is pseudo-random but reproducible: the seed generates a set of
    test-row indices in [0, num_rows), and each file row whose position falls
    in that set goes to the test partition.

    parameters
    ----------
    ifname -- filename/path of data file.
    delimiter -- delimiter of data values (defaults to tab when None)
    has_header -- does the data-file have a header line
    columns -- a list of integers specifying which columns of the file to import
        (counting from 0); None imports all columns
    seed -- the seed for the pseudo-random number generator
    fraction -- approximate fraction of rows assigned to the test set
        (duplicate sampled indices make the actual fraction slightly smaller)
    num_rows -- number of data rows assumed when sampling test indices
        (defaults to 1599, the value previously hard-coded)

    returns
    -------
    training_data_as_array -- the training data as a numpy.array object
    test_data_as_array -- the test data as a numpy.array object
    field_names -- list of header field names, or None when has_header is False
    """
    np.random.seed(seed)
    # Sample test-row indices; np.unique drops duplicates, so the test set
    # holds *at most* int(fraction * num_rows) rows.
    test_rows = np.unique(
        (np.random.uniform(size=int(fraction * num_rows)) * num_rows).astype(int))
    if delimiter is None:
        delimiter = '\t'
    field_names = None  # stays None when the file has no header line
    training_data = []
    test_data = []
    with open(ifname, 'r') as ifile:
        datareader = csv.reader(ifile, delimiter=delimiter)
        # if the data has a header line we want to avoid trying to import it.
        if has_header:
            field_names = next(datareader)
        for count, row in enumerate(datareader):
            # for each row of data only take the columns we are interested in
            if columns is not None:
                row = [row[c] for c in columns]
            # now store in our data lists
            if count in test_rows:
                test_data.append(row)
            else:
                training_data.append(row)
    print("There are %d training entries" % len(training_data))
    print("There are %d test entries" % len(test_data))
    if training_data:
        print("Each row has %d elements" % len(training_data[0]))
    # convert the data (list object) into a numpy array of floats.
    training_data_as_array = np.array(training_data).astype(float)
    test_data_as_array = np.array(test_data).astype(float)
    # return the two data sets (and any header) to the caller
    return training_data_as_array, test_data_as_array, field_names