-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnormalizeColumnTemplate.py
More file actions
132 lines (105 loc) · 4.11 KB
/
normalizeColumnTemplate.py
File metadata and controls
132 lines (105 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#
# normalize column
#
# This is the template for you to write and test the method
#
# normalizeColumn
#
# You will also need the file athletesTrainingSet.txt
#
# For use with the book Programmer's Guide to Data Mining
# http://guidetodatamining.com
#
# Ron Zacharski
#
class Classifier:
    """Load a tab-separated training file and normalize its numeric columns.

    The first line of the file gives the role of every column: ``num``
    columns are parsed as integers and collected into the instance vector,
    ``comment`` columns are kept aside, and the ``class`` column holds the
    label.  Columns with any other role are skipped.

    After construction:

    * ``format`` is the list of column roles read from the header line.
    * ``data`` is a list of ``(classification, vector, ignore)`` tuples whose
      vectors have been normalized in place with the Modified Standard Score.
    * ``rawData`` holds the same records with independent, un-normalized
      copies of the vectors.
    * ``vlen`` is the number of numeric columns (0 if there are no rows).
    * ``medianAndDeviation`` holds one ``(median, absolute standard
      deviation)`` pair per normalized column, in column order.
    """

    def __init__(self, filename):
        """Read *filename*, build ``self.data`` and normalize every column.

        Raises:
            IndexError: if the file is completely empty (no header line).
            ValueError: if a ``num`` field is not a valid integer.
        """
        self.medianAndDeviation = []
        # The context manager closes the file even if reading fails.
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = stripped.split('\t')
            # Reset per row so a missing class column never reuses the
            # label of the previous row.
            classification = None
            ignore = []
            vector = []
            for role, value in zip(self.format, fields):
                if role == 'num':
                    vector.append(int(value))
                elif role == 'comment':
                    ignore.append(value)
                elif role == 'class':
                    classification = value
            self.data.append((classification, vector, ignore))
        # Copy each vector: normalization mutates the vectors in self.data
        # in place, and a shallow list copy would share those same lists.
        self.rawData = [(label, list(vector), ignore)
                        for label, vector, ignore in self.data]
        # Length of an instance vector; a header-only file has none.
        self.vlen = len(self.data[0][1]) if self.data else 0
        # Now normalize the data, one numeric column at a time.
        for i in range(self.vlen):
            self.normalizeColumn(i)

    def getMedian(self, alist):
        """Return the median of *alist*.

        An empty input returns ``[]`` (kept for backward compatibility).
        For an odd number of items the middle element is returned as is;
        for an even number the two middle elements are averaged, giving a
        float.
        """
        if not alist:
            return []
        blist = sorted(alist)
        middle = len(blist) // 2
        if len(blist) % 2 == 1:
            # Odd length: the middle element is the median.
            return blist[middle]
        # Even length: midpoint of the two central elements.
        return (blist[middle] + blist[middle - 1]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute deviation of *alist* from *median*.

        Raises:
            ZeroDivisionError: if *alist* is empty.
        """
        # Plain accumulation loop kept on purpose so the floating-point
        # summation order (and therefore the result) stays unchanged.
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column *columnNumber* of ``self.data`` in place.

        Uses the Modified Standard Score: ``(value - median) / asd``, where
        ``asd`` is the absolute standard deviation of the column.  The
        ``(median, asd)`` pair is appended to ``self.medianAndDeviation``.

        When ``asd`` is zero every value in the column equals the median,
        so each value is set to ``0.0`` instead of dividing by zero.

        Raises:
            ZeroDivisionError: if ``self.data`` is empty.
        """
        col = [record[1][columnNumber] for record in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        for record in self.data:
            vector = record[1]
            if asd == 0:
                vector[columnNumber] = 0.0
            else:
                vector[columnNumber] = (vector[columnNumber] - median) / asd
def unitTest():
    """Self-check for the Classifier statistics helpers and normalization.

    Builds a Classifier from 'athletesTrainingSet.txt', verifies the median
    and absolute standard deviation of two fixed samples, then compares the
    first two normalized values of each loaded record (rounded to 4 places)
    against a table of expected values.  Any mismatch raises AssertionError.
    """
    classifier = Classifier('athletesTrainingSet.txt')

    # --- median and absolute standard deviation ---
    sample_a = [54, 72, 78, 49, 65, 63, 75, 67, 54, 76, 68,
                61, 58, 70, 70, 70, 63, 65, 66, 61]
    sample_b = [66, 162, 204, 90, 99, 106, 175, 123, 68,
                200, 163, 95, 77, 108, 155, 155, 108, 106, 97, 76]
    median_a = classifier.getMedian(sample_a)
    assert round(median_a, 3) == 65.5
    median_b = classifier.getMedian(sample_b)
    assert round(median_b, 3) == 107
    asd_a = classifier.getAbsoluteStandardDeviation(sample_a, median_a)
    assert round(asd_a, 3) == 5.95
    asd_b = classifier.getAbsoluteStandardDeviation(sample_b, median_b)
    assert round(asd_b, 3) == 33.65
    print("getMedian and getAbsoluteStandardDeviation are OK")

    # --- normalizeColumn ---
    # One [column 0, column 1] pair per record, in file order.
    expected_rows = [
        [-1.9328, -1.2184], [1.0924, 1.6345], [2.1008, 2.8826],
        [-2.7731, -0.5052], [-0.084, -0.2377], [-0.4202, -0.0297],
        [1.5966, 2.0208], [0.2521, 0.4755], [-1.9328, -1.159],
        [1.7647, 2.7637], [0.4202, 1.6642], [-0.7563, -0.3566],
        [-1.2605, -0.8915], [0.7563, 0.0297], [0.7563, 1.4264],
        [0.7563, 1.4264], [-0.4202, 0.0297], [-0.084, -0.0297],
        [0.084, -0.2972], [-0.7563, -0.9212],
    ]
    for row_index, (want_first, want_second) in enumerate(expected_rows):
        assert round(classifier.data[row_index][1][0], 4) == want_first
        assert round(classifier.data[row_index][1][1], 4) == want_second
    print("normalizeColumn is OK")
# Run the self-test whenever this file is executed; as a top-level call it
# also runs if the module is imported.
unitTest()