-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDecisionTree.py
More file actions
122 lines (94 loc) · 3.6 KB
/
DecisionTree.py
File metadata and controls
122 lines (94 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Decision tree classifier: Node structure, recursive tree builder, and a command-line demo.
import numpy as np
import pandas as pd
from pandas.core.frame import pandas
from decision_tree_utils import *
import pickle
class Node:
    """A single node of the decision tree.

    A node stores the rows that reached it, the split chosen for those rows,
    and references to its two child subtrees.
    """

    def __init__(self, data : pandas.DataFrame, split_feature : str, split_value : float):
        """Create a node without children.

        Args:
            data: Rows of the data set that reached this node.
            split_feature: Name of the column this node splits on.
            split_value: Threshold the split column is compared against.
        """
        self.data = data
        self.split_feature = split_feature
        self.split_value = split_value
        # Children start out empty; the tree builder attaches them afterwards.
        self.left = None
        self.right = None

    def display(self):
        """Print a short summary of this node to stdout."""
        print(f'length of data: {len(self.data)}')
        print(f'split feature: {self.split_feature}')
        print(f'split value: {self.split_value}')
        # Identity checks: `== None` would be routed through the child's
        # __eq__, which is not what "is there a child?" should depend on.
        print(f' is left None: {self.left is None}')
        print(f' is right None: {self.right is None}\n')
def build_tree(data : pandas.DataFrame, target, max_depth : int, min_samples_split : int, depth : int):
    """Recursively build the decision (sub)tree for ``data``.

    Args:
        data: Rows that reached this point of the tree.
        target: Label column, forwarded unchanged to ``find_best_split``.
        max_depth: Recursion stops once ``depth`` reaches this value.
        min_samples_split: Minimum number of rows needed to attempt a split.
        depth: Depth of the node being built; the root call passes 0.

    Returns:
        The ``Node`` at the root of the subtree, or ``None`` when ``data`` is
        empty, the depth limit is reached, there are fewer than
        ``min_samples_split`` rows, or no split was found.
    """
    # Stopping conditions. The length is tested explicitly because the truth
    # value of a DataFrame is ambiguous.
    if len(data) == 0 or depth >= max_depth or len(data) < min_samples_split:
        return None

    # NOTE(review): find_best_split / split_data are not defined in this file;
    # presumably they come from `decision_tree_utils` - confirm their contract.
    split_feature, split_value = find_best_split(data, target)

    # `is None` instead of `== None`: an equality test on an array-like value
    # is evaluated element-wise and cannot be used as a plain boolean.
    if split_feature is None or split_value is None:
        return None

    left_data, right_data = split_data(data, split_feature, split_value)
    node = Node(data, split_feature, split_value)
    node.left = build_tree(left_data, target, max_depth, min_samples_split, depth + 1)
    node.right = build_tree(right_data, target, max_depth, min_samples_split, depth + 1)
    return node
class DecisionTree:
    """Decision tree classifier.

    ``fit`` builds the tree with ``build_tree``; ``predict`` walks it for a
    single row and returns the most frequent label stored at the node where
    the walk stops.
    """

    def __init__(self, max_depth : int, min_samples_split : int):
        """Store the hyperparameters; no tree is built until ``fit`` is called.

        Args:
            max_depth: Depth limit handed to ``build_tree``.
            min_samples_split: Minimum row count handed to ``build_tree``.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Name of the label column; recorded by fit() and read by predict().
        self.target = None
        # Root node of the fitted tree; None until fit() produces one.
        self.tree = None

    def fit(self, data : pandas.DataFrame, target : str):
        """Build the tree from ``data``.

        Args:
            data: Training rows, including the label column.
            target: Name of the label column.
        """
        # Remember the label column so predict() works without the caller
        # having to assign `self.target` by hand after fitting.
        self.target = target
        self.tree = build_tree(data, target, self.max_depth, self.min_samples_split, 0)

    def inorder(self, node : "Node | None"):
        """Display the subtree rooted at ``node`` using in-order traversal.

        Args:
            node: Root of the subtree to show; ``None`` prints nothing.
        """
        if node is not None:
            self.inorder(node.left)
            node.display()
            self.inorder(node.right)

    def predict(self, data : pandas.Series):
        """Predict the label for a single row.

        Args:
            data: One row, indexable by feature name (for example the result
                of ``DataFrame.iloc[i]``).

        Returns:
            The first value returned by ``mode()`` over the label column of
            the rows stored at the node where the walk stops.

        Raises:
            ValueError: If there is no tree (``fit`` was not called, or it
                produced no root node) or the label column is unknown.
        """
        if self.tree is None:
            raise ValueError(
                'cannot predict: the tree is empty (call fit() first; fit() '
                'builds no tree when max_depth or min_samples_split rules '
                'out the root split)'
            )
        if self.target is None:
            raise ValueError('cannot predict: the target column is not set')

        current_node = self.tree
        # Descend only while both children exist; a node that is missing
        # either child is treated as a leaf.
        while current_node.left is not None and current_node.right is not None:
            if data[current_node.split_feature] <= current_node.split_value:
                current_node = current_node.left
            else:
                current_node = current_node.right
        return current_node.data[self.target].mode()[0]
def main():
    """Train a DecisionTree on ``cancer.csv`` and print its test accuracy.

    Reads the data set, shuffles it with a fixed seed, uses the first 80% of
    the rows for training and the remainder for testing, asks for the
    hyperparameters on stdin, and prints the fraction of test rows whose
    prediction matches the label.
    """
    target = 'diagnosis(1=m, 0=b)'

    dataset = pd.read_csv('cancer.csv')

    # Manual 80/20 train/test split (no sklearn); the fixed seed keeps the
    # shuffle reproducible between runs.
    shuffled_data = dataset.sample(frac=1, random_state=12)
    train_size = int(0.8 * len(shuffled_data))
    train_data = shuffled_data[:train_size]
    test_data = shuffled_data[train_size:]

    max_depth = int(input('Enter the max depth: '))
    min_samples = int(input('Enter the min samples: '))

    tree = DecisionTree(max_depth, min_samples)
    tree.fit(train_data, target)
    # predict() reads the label column from `tree.target`, so set it explicitly.
    tree.target = target

    # Without test rows there is nothing to score (and nothing to divide by).
    if test_data.empty:
        print('accuracy: n/a (no test rows)')
        return

    # Predict every test row and count the matches.
    correct = sum(
        1
        for _, row in test_data.iterrows()
        if tree.predict(row) == row[target]
    )
    print(f'accuracy: {correct/len(test_data)}')
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()