-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHeuristics.py
More file actions
113 lines (90 loc) · 3.91 KB
/
Heuristics.py
File metadata and controls
113 lines (90 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import math
def entropy(data, targetAttr):
    # Calculates the Shannon entropy (base 2) of the given data set for the
    # target attribute.
    #
    # data: iterable of dict-like records indexed by attribute name.
    # targetAttr: key whose value distribution is measured.
    # Returns 0.0 for an empty data set or a single-class data set.
    valFreq = {}
    dataEntropy = 0.0
    # Calculate the frequency of each of the values in the target attr.
    # NOTE: dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[targetAttr]] = valFreq.get(record[targetAttr], 0.0) + 1.0
    # Calculate the entropy of the data for the target attribute
    total = len(data)
    for freq in valFreq.values():
        dataEntropy += (-freq / total) * math.log(freq / total, 2)
    return dataEntropy
def gini(data, attr, targetAttr):
    # Calculates the Gini impurity of the given data set over the values of
    # `attr` (sum of p * (1 - p) across attribute values).
    #
    # NOTE(review): `targetAttr` is not used by this function — the impurity
    # is computed over `attr` itself. Parameter kept for interface
    # compatibility with existing callers.
    valFreq = {}
    giniInd = 0.0
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Hoist the total out of the loop: it is loop-invariant (the original
    # recomputed sum(valFreq.values()) on every iteration).
    total = sum(valFreq.values())
    for count in valFreq.values():
        valProb = count / total
        giniInd += valProb * (1 - valProb)
    return giniInd
def misclassificationError(data, attr, target):
    # Calculates the misclassification error of the given data set over the
    # values of `attr`: 1 - (frequency of the majority value) / len(data).
    #
    # NOTE(review): `target` is not used by this function — the error is
    # computed over `attr` itself. Parameter kept for interface
    # compatibility with existing callers.
    # Raises ValueError on an empty data set (max() of an empty sequence),
    # matching the original behavior.
    valFreq = {}
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    maxValue = max(valFreq.values())
    return 1.0 - maxValue / len(data)
def gainEntr(data, attr, targetAttr):
    # Calculates the information gain (reduction in entropy) that would
    # result by splitting the data on the chosen attribute (attr).
    #
    # data: iterable of dict-like records; attr: split attribute;
    # targetAttr: class attribute whose entropy is reduced.
    valFreq = {}
    subsetEntropy = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occurring in the training set.
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetEntropy += valProb * entropy(dataSubset, targetAttr)
    # Subtract the weighted subset entropy from the entropy of the whole
    # data set with respect to the target attribute.
    return (entropy(data, targetAttr) - subsetEntropy)
def gainGini(data, attr, targetAttr):
    # Calculates the reduction in Gini impurity that would result by
    # splitting the data on the chosen attribute (attr).
    valFreq = {}
    subsetError = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetError += valProb * gini(dataSubset, attr, targetAttr)
    return (gini(data, attr, targetAttr) - subsetError)
def gainMisclass(data, attr, targetAttr):
    # Calculates the reduction in misclassification error that would
    # result by splitting the data on the chosen attribute (attr).
    valFreq = {}
    subsetError = 0.0
    # Calculate the frequency of each of the values in the split attribute.
    # dict.has_key() was removed in Python 3; use .get() instead.
    for record in data:
        valFreq[record[attr]] = valFreq.get(record[attr], 0.0) + 1.0
    # Total is loop-invariant; hoist it (original recomputed it per value).
    total = sum(valFreq.values())
    for val in valFreq:
        valProb = valFreq[val] / total
        dataSubset = [record for record in data if record[attr] == val]
        subsetError += valProb * misclassificationError(dataSubset, attr, targetAttr)
    return (misclassificationError(data, attr, targetAttr) - subsetError)