-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHeuristics.py
More file actions
113 lines (90 loc) · 3.91 KB
/
Heuristics.py
File metadata and controls
113 lines (90 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import math
def entropy(data, targetAttr):
    # Calculates the Shannon entropy (base 2) of the given data set for the
    # target attribute.
    #
    # data: iterable of dict-like records indexed by attribute name.
    # targetAttr: key whose value distribution is measured.
    # Returns 0.0 for an empty data set or a single-class data set.
    valFreq = {}
    dataEntropy = 0.0
    # Calculate the frequency of each of the values in the target attr.
    # NOTE: dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[targetAttr]] = valFreq.get(record[targetAttr], 0.0) + 1.0
    # Calculate the entropy of the data for the target attribute
    total = len(data)
    for freq in valFreq.values():
        dataEntropy += (-freq / total) * math.log(freq / total, 2)
    return dataEntropy
def gini(data, attr, targetAttr):
    # Calculates the Gini impurity of the given data set over the values of
    # `attr` (sum of p * (1 - p) across attribute values).
    #
    # NOTE(review): `targetAttr` is not used by this function — the impurity
    # is computed over `attr` itself. Parameter kept for interface
    # compatibility with existing callers.
    valFreq = {}
    giniInd = 0.0
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Hoist the total out of the loop: it is loop-invariant (the original
    # recomputed sum(valFreq.values()) on every iteration).
    total = sum(valFreq.values())
    for count in valFreq.values():
        valProb = count / total
        giniInd += valProb * (1 - valProb)
    return giniInd
def misclassificationError(data, attr, target):
    # Calculates the misclassification error of the given data set over the
    # values of `attr`: 1 - (frequency of the majority value) / len(data).
    #
    # NOTE(review): `target` is not used by this function — the error is
    # computed over `attr` itself. Parameter kept for interface
    # compatibility with existing callers.
    # Raises ValueError on an empty data set (max() of an empty sequence),
    # matching the original behavior.
    valFreq = {}
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    maxValue = max(valFreq.values())
    return 1.0 - maxValue / len(data)
def gainEntr(data, attr, targetAttr):
    # Calculates the information gain (reduction in entropy) that would
    # result by splitting the data on the chosen attribute (attr).
    #
    # data: iterable of dict-like records; attr: split attribute;
    # targetAttr: class attribute whose entropy is reduced.
    valFreq = {}
    subsetEntropy = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occurring in the training set.
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetEntropy += valProb * entropy(dataSubset, targetAttr)
    # Subtract the weighted subset entropy from the entropy of the whole
    # data set with respect to the target attribute.
    return (entropy(data, targetAttr) - subsetEntropy)
def gainGini(data, attr, targetAttr):
    # Calculates the reduction in Gini impurity that would result by
    # splitting the data on the chosen attribute (attr).
    valFreq = {}
    subsetError = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetError += valProb * gini(dataSubset, attr, targetAttr)
    return (gini(data, attr, targetAttr) - subsetError)
def gainMisclass(data, attr, targetAttr):
    # Calculates the reduction in misclassification error that would
    # result by splitting the data on the chosen attribute (attr).
    valFreq = {}
    subsetError = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetError += valProb * misclassificationError(dataSubset, attr, targetAttr)
    return (misclassificationError(data, attr, targetAttr) - subsetError)