-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdecision_tree_utils.py
More file actions
104 lines (82 loc) · 3.59 KB
/
decision_tree_utils.py
File metadata and controls
104 lines (82 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import numpy as np
from math import log2
# want to implement a random forest classifier, starting with a decision tree classifier
def entropy(target_column):
    '''
    Calculate the Shannon entropy (base 2) of a binary 0/1 target column.

    Parameters
    ----------
    target_column : pd.Series
        Class labels; assumed to contain only 0s and 1s (any other value
        is ignored by the two class counts below).

    Returns
    -------
    float
        Entropy in bits: 0 for an empty or pure column, up to 1.0 for a
        perfectly balanced one.
    '''
    # NOTE: the original annotated this parameter as `str`, but every call
    # site passes a pandas Series of labels — annotation removed.
    n = len(target_column)
    if n == 0:
        return 0
    # Proportion of samples in each class.
    class0 = len(target_column[target_column == 0]) / n
    class1 = len(target_column[target_column == 1]) / n
    # A pure node has zero entropy (and log2(0) is undefined).
    if class0 == 0 or class1 == 0:
        return 0
    return -(class0 * log2(class0) + class1 * log2(class1))
def info_gain(data, target, split_feature, split_value):
    '''
    Information gain of splitting `data` on `split_feature` at `split_value`.

    Rows with feature value <= split_value go left, strictly greater go
    right. Returns parent entropy minus the size-weighted child entropies;
    0 when the split leaves either side empty.
    '''
    left = data[data[split_feature] <= split_value]
    right = data[data[split_feature] > split_value]
    # A split that puts everything on one side tells us nothing.
    if left.empty or right.empty:
        return 0
    n = len(data)
    # Size-weighted average entropy of the two children.
    weighted_children = (len(left) / n) * entropy(left[target]) \
        + (len(right) / n) * entropy(right[target])
    return entropy(data[target]) - weighted_children
# exhaustive search to find the best split value for each feature
def find_best_split_for_feature(data, target, feature):
    '''
    Exhaustively try each observed value of `feature` as a split threshold.

    Parameters
    ----------
    data : pd.DataFrame
    target : str
        Name of the 0/1 label column.
    feature : str
        Name of the feature column to split on.

    Returns
    -------
    tuple[split_value, info_gain]
        The threshold with the highest information gain; (0, 0) when no
        candidate yields positive gain.
    '''
    best_split_value = 0
    best_info_gain = 0
    # Duplicated feature values produce the identical split, so only test
    # each distinct value once. `unique()` preserves first-occurrence order,
    # keeping the original tie-breaking behavior.
    for split_value in data[feature].unique():
        curr_info_gain = info_gain(data, target, feature, split_value)
        if curr_info_gain > best_info_gain:
            best_split_value = split_value
            best_info_gain = curr_info_gain
    return best_split_value, best_info_gain
def find_best_split(data, target):
    '''
    Search every feature column for the split with the highest information
    gain.

    Returns (feature_name, split_value); feature_name is None and the value
    is 0 when no feature yields positive gain.
    '''
    best_feature, best_value = None, 0
    best_gain = 0
    for column in data.columns:
        # Never consider splitting on the label itself.
        if column == target:
            continue
        value, gain = find_best_split_for_feature(data, target, column)
        if gain > best_gain:
            best_feature, best_value = column, value
            best_gain = gain
    return best_feature, best_value
def split_data(data, split_feature, split_value):
    '''
    Partition `data` on `split_feature`: rows with value <= split_value go
    into the left frame, strictly greater rows into the right frame.
    '''
    at_or_below = data[split_feature] <= split_value
    above = data[split_feature] > split_value
    return data[at_or_below], data[above]
def main():
    '''
    Exploratory driver: load the cancer data set, report the best split
    value and information gain for every attribute, then report the single
    best split over all features.
    '''
    target = 'diagnosis(1=m, 0=b)'
    dataset = pd.read_csv('cancer.csv')
    for attribute in dataset.columns:
        best_split_value, best_info_gain = find_best_split_for_feature(dataset, target, attribute)
        print(f'Attribute: {attribute} split value: {best_split_value}, best info gain: {best_info_gain}')
        # print mean
        print(f'mean: {dataset[attribute].mean()}')
        # print minimum of that attribute
        print(f'min: {dataset[attribute].min()}')
        # number of rows strictly above the best split value
        # (the original comment said "less than", contradicting the code)
        print(f'split num: {len(dataset[dataset[attribute] > best_split_value])}')
    print(f'entropy of entire dataset: {entropy(dataset[target])}')
    best_split_feature, best_split_value = find_best_split(dataset, target)
    # BUG FIX: the original printed `best_info_gain` left over from the last
    # loop iteration above, not the gain of the overall best split —
    # recompute it for the winning (feature, value) pair instead.
    best_info_gain = info_gain(dataset, target, best_split_feature, best_split_value)
    print(f'best split feature: {best_split_feature}, best split value: {best_split_value}, best info gain: {best_info_gain}')

if __name__ == "__main__":
    main()