from __future__ import division, print_function
import numpy as np
from sklearn import datasets
import progressbar

# Import helper functions
from mlfromscratch.utils.data_manipulation import train_test_split, categorical_to_binary
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.supervised_learning import XGBoostRegressionTree
from mlfromscratch.utils.loss_functions import LogisticLoss
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot


class XGBoost(object):
"""The XGBoost classifier.
Reference: http://xgboost.readthedocs.io/en/latest/model.html
Parameters:
-----------
n_estimators: int
The number of classification trees that are used.
learning_rate: float
The step length that will be taken when following the negative gradient during
training.
min_samples_split: int
The minimum number of samples needed to make a split when building a tree.
min_impurity: float
The minimum impurity required to split the tree further.
max_depth: int
The maximum depth of a tree.
    debug: boolean
        True if the training progress should be displayed, false otherwise.
    """
def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
min_impurity=1e-7, max_depth=2, debug=False):
self.n_estimators = n_estimators # Number of trees
self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split # The minimum n of samples to justify split
self.min_impurity = min_impurity # Minimum variance reduction to continue
self.max_depth = max_depth # Maximum depth for tree
self.debug = debug
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        # Log loss for classification
        self.loss = LogisticLoss(grad_wrt_theta=False)

        # Initialize regression trees
        self.trees = []
for _ in range(n_estimators):
tree = XGBoostRegressionTree(
min_samples_split=self.min_samples_split,
min_impurity=min_impurity,
max_depth=self.max_depth,
loss=self.loss)
self.trees.append(tree)

    def fit(self, X, y):
        # One-hot encode the labels so that each class gets its own column
        # of boosted scores
        y = categorical_to_binary(y)

        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            # The tree needs both the targets and the current predictions to
            # compute the gradient statistics, so they are passed concatenated
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)

            y_pred -= np.multiply(self.learning_rate, update_pred)
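
    # A sketch of the statistics each tree consumes, assuming the imported
    # LogisticLoss follows the usual definitions: with p = sigmoid(y_pred),
    #
    #     gradient: g = p - y
    #     hessian:  h = p * (1 - p)
    #
    # Under that assumption the tree's leaf values approximate the unnegated
    # Newton step G / H, which is why both fit() and predict() subtract the
    # scaled updates from y_pred rather than add them.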

    def predict(self, X):
        y_pred = None
        # Accumulate the scaled prediction of every tree, subtracting the
        # updates exactly as in fit() so training and inference agree in sign
        for tree in self.trees:
            update = np.multiply(self.learning_rate, tree.predict(X))
            if y_pred is None:
                y_pred = np.zeros(np.shape(update))
            y_pred -= update

        # Turn the accumulated scores into a probability distribution
        # (softmax); subtracting the row max first avoids overflow in exp
        y_pred = np.exp(y_pred - np.expand_dims(np.max(y_pred, axis=1), axis=1))
        y_pred = y_pred / np.expand_dims(np.sum(y_pred, axis=1), axis=1)
        # Set label to the class that maximizes the probability
        y_pred = np.argmax(y_pred, axis=1)
        return y_pred
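

# A quick sanity check of the softmax decoding in predict(), with made-up
# scores for a single sample:
#     raw scores [0.2, 1.1, -0.3] -> probabilities ~ [0.25, 0.60, 0.15]
#     argmax then returns class 1
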
def main():
    print("-- XGBoost --")

    data = datasets.load_iris()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=2)

    clf = XGBoost(debug=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test, y_pred,
        title="XGBoost",
        accuracy=accuracy,
        legend_labels=data.target_names)


if __name__ == "__main__":
main()