-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgrf_hoeffding_tree.py
More file actions
1412 lines (1177 loc) · 51.4 KB
/
grf_hoeffding_tree.py
File metadata and controls
1412 lines (1177 loc) · 51.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 30 16:44:48 2019
@author: txuslopez
"""
import logging
import textwrap
from abc import ABCMeta
from operator import attrgetter
import numpy as np
from skmultiflow.utils.utils import get_dimensions, normalize_values_in_dict, calculate_object_size
from skmultiflow.core.base import StreamModel
from skmultiflow.trees.numeric_attribute_class_observer_gaussian import NumericAttributeClassObserverGaussian
from skmultiflow.trees.nominal_attribute_class_observer import NominalAttributeClassObserver
from skmultiflow.trees.attribute_class_observer_null import AttributeClassObserverNull
from skmultiflow.trees.attribute_split_suggestion import AttributeSplitSuggestion
from skmultiflow.trees.gini_split_criterion import GiniSplitCriterion
from skmultiflow.trees.info_gain_split_criterion import InfoGainSplitCriterion
from skmultiflow.trees.utils import do_naive_bayes_prediction
GINI_SPLIT = 'gini'
INFO_GAIN_SPLIT = 'info_gain'
MAJORITY_CLASS = 'mc'
NAIVE_BAYES = 'nb'
NAIVE_BAYES_ADAPTIVE = 'nba'
# logger
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
class GRF_HoeffdingTree(StreamModel):
""" Hoeffding Tree or VFDT.
Parameters
----------
max_byte_size: int (default=33554432)
Maximum memory consumed by the tree.
memory_estimate_period: int (default=1000000)
Number of instances between memory consumption checks.
grace_period: int (default=200)
Number of instances a leaf should observe between split attempts.
split_criterion: string (default='info_gain')
| Split criterion to use.
| 'gini' - Gini
| 'info_gain' - Information Gain
split_confidence: float (default=0.0000001)
Allowed error in split decision, a value closer to 0 takes longer to decide.
tie_threshold: float (default=0.05)
Threshold below which a split will be forced to break ties.
binary_split: boolean (default=False)
If True, only allow binary splits.
stop_mem_management: boolean (default=False)
If True, stop growing as soon as memory limit is hit.
remove_poor_atts: boolean (default=False)
If True, disable poor attributes.
no_preprune: boolean (default=False)
If True, disable pre-pruning.
leaf_prediction: string (default='nba')
| Prediction mechanism used at leafs.
| 'mc' - Majority Class
| 'nb' - Naive Bayes
| 'nba' - Naive Bayes Adaptive
nb_threshold: int (default=0)
Number of instances a leaf should observe before allowing Naive Bayes.
nominal_attributes: list, optional
List of Nominal attributes. If emtpy, then assume that all attributes are numerical.
Notes
-----
A Hoeffding Tree [1]_ is an incremental, anytime decision tree induction algorithm that is capable of learning from
massive data streams, assuming that the distribution generating examples does not change over time. Hoeffding trees
exploit the fact that a small sample can often be enough to choose an optimal splitting attribute. This idea is
supported mathematically by the Hoeffding bound, which quantifies the number of observations (in our case, examples)
needed to estimate some statistics within a prescribed precision (in our case, the goodness of an attribute).
A theoretically appealing feature of Hoeffding Trees not shared by other incremental decision tree learners is that
it has sound guarantees of performance. Using the Hoeffding bound one can show that its output is asymptotically
nearly identical to that of a non-incremental learner using infinitely many examples.
Implementation based on MOA [2]_.
References
----------
.. [1] G. Hulten, L. Spencer, and P. Domingos. Mining time-changing data streams.
In KDD’01, pages 97–106, San Francisco, CA, 2001. ACM Press.
.. [2] Albert Bifet, Geoff Holmes, Richard Kirkby, Bernhard Pfahringer.
MOA: Massive Online Analysis; Journal of Machine Learning Research 11: 1601-1604, 2010.
"""
class FoundNode(object):
    """ Lightweight container pairing a located tree node with its parent context.

    Parameters
    ----------
    node: SplitNode or LearningNode or None
        The node that was located (None when the branch has no child yet).
    parent: SplitNode or None
        The located node's parent.
    parent_branch: int or None
        Index of the branch of `parent` that leads to `node`.
    """

    def __init__(self, node=None, parent=None, parent_branch=None):
        """ FoundNode class constructor. """
        self.node, self.parent, self.parent_branch = node, parent, parent_branch
class Node(metaclass=ABCMeta):
    """ Base class for nodes in a Hoeffding Tree.

    Parameters
    ----------
    class_observations: dict (class_value, weight) or None
        Class observations.
    """

    def __init__(self, class_observations=None):
        """ Node class constructor. """
        if class_observations is None:
            class_observations = {}  # Dictionary (class_value, weight)
        self._observed_class_distribution = class_observations

    @staticmethod
    def is_leaf():
        """ Determine if the node is a leaf.

        Returns
        -------
        boolean
            True if leaf, False otherwise.
        """
        return True

    def filter_instance_to_leaf(self, X, parent, parent_branch):
        """ Traverse down the tree to locate the corresponding leaf for an instance.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes.
        parent: GRF_HoeffdingTree.Node or None
            Parent node.
        parent_branch: int
            Parent branch index.

        Returns
        -------
        FoundNode
            The corresponding leaf. A plain Node is itself a leaf, so it wraps itself.
        """
        return GRF_HoeffdingTree.FoundNode(self, parent, parent_branch)

    def get_observed_class_distribution(self):
        """ Get the current observed class distribution at the node.

        Returns
        -------
        dict (class_value, weight)
            Class distribution at the node.
        """
        return self._observed_class_distribution

    def get_class_votes(self, X, ht):
        """ Get the votes per class for a given instance.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes.
        ht: GRF_HoeffdingTree
            The Hoeffding Tree.

        Returns
        -------
        dict (class_value, weight)
            Class votes for the given instance.
        """
        return self._observed_class_distribution

    def observed_class_distribution_is_pure(self):
        """ Check if observed class distribution is pure, i.e. if all samples belong to the same class.

        Returns
        -------
        boolean
            True if fewer than two classes carry non-zero weight, False otherwise.
        """
        count = 0
        for _, weight in self._observed_class_distribution.items():
            # BUGFIX: the original used `weight is not 0`, an identity test that is
            # True for float 0.0 (and raises SyntaxWarning on Python >= 3.8), so
            # zero-weight classes were miscounted as observed. Compare by value.
            if weight != 0:
                count += 1
                if count == 2:  # No need to count beyond this point
                    break
        return count < 2

    def subtree_depth(self):
        """ Calculate the depth of the subtree from this node.

        Returns
        -------
        int
            Subtree depth, 0 if the node is a leaf.
        """
        return 0

    def calculate_promise(self):
        """ Calculate node's promise.

        Returns
        -------
        int
            Total observed weight minus the weight of the most frequent class.
            A small value indicates that the node has seen more samples of a
            given class than the other classes.
        """
        total_seen = sum(self._observed_class_distribution.values())
        if total_seen > 0:
            return total_seen - max(self._observed_class_distribution.values())
        else:
            return 0

    def describe_subtree(self, ht, buffer, indent=0):
        """ Walk the tree and write its structure to a buffer string.

        Parameters
        ----------
        ht: GRF_HoeffdingTree
            The tree to describe.
        buffer: list of str
            One-element list used as a mutable string buffer; text is appended
            to buffer[0].
        indent: int
            Indentation level (number of white spaces for current node).
        """
        buffer[0] += textwrap.indent('Leaf = ', ' ' * indent)
        try:
            class_val = max(self._observed_class_distribution, key=self._observed_class_distribution.get)
            buffer[0] += 'Class {} | {}\n'.format(class_val, self._observed_class_distribution)
        except ValueError:  # max() raises on an empty distribution (regression problems)
            buffer[0] += 'Statistics {}\n'.format(self._observed_class_distribution)

    # TODO
    def get_description(self):
        pass
class SplitNode(Node):
    """ Node that splits the data in a Hoeffding Tree.

    Parameters
    ----------
    split_test: InstanceConditionalTest
        Split test.
    class_observations: dict (class_value, weight) or None
        Class observations.
    """

    def __init__(self, split_test, class_observations):
        """ SplitNode class constructor. """
        super().__init__(class_observations)
        self._split_test = split_test
        # Children are stored in a dict keyed by branch index.
        self._children = {}

    def num_children(self):
        """ Count the number of children for a node. """
        return len(self._children)

    def set_child(self, index, node):
        """ Set node as child.

        Parameters
        ----------
        index: int
            Branch index where the node will be inserted.
        node: GRF_HoeffdingTree.Node
            The node to insert.

        Raises
        ------
        IndexError
            If `index` is not a valid branch of the split test.
        """
        if (self._split_test.max_branches() >= 0) and (index >= self._split_test.max_branches()):
            raise IndexError
        self._children[index] = node

    def get_child(self, index):
        """ Retrieve a node's child given its branch index.

        Parameters
        ----------
        index: int
            Node's branch index.

        Returns
        -------
        GRF_HoeffdingTree.Node or None
            Child node, or None if no child is attached to that branch.
        """
        return self._children.get(index)

    def instance_child_index(self, X):
        """ Get the branch index for a given instance at the current node.

        Returns
        -------
        int
            Branch index, -1 if unknown.
        """
        return self._split_test.branch_for_instance(X)

    @staticmethod
    def is_leaf():
        """ Determine if the node is a leaf.

        Returns
        -------
        boolean
            Always False: a split node is never a leaf.
        """
        return False

    def filter_instance_to_leaf(self, X, parent, parent_branch):
        """ Traverse down the tree to locate the corresponding leaf for an instance.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes.
        parent: GRF_HoeffdingTree.Node or None
            Parent node.
        parent_branch: int
            Parent branch index.

        Returns
        -------
        FoundNode
            Leaf node for the instance; wraps None when the branch has no child
            yet, and wraps this node itself when the branch is unknown (-1).
        """
        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                return child.filter_instance_to_leaf(X, self, child_index)
            else:
                return GRF_HoeffdingTree.FoundNode(None, self, child_index)
        else:
            return GRF_HoeffdingTree.FoundNode(self, parent, parent_branch)

    def subtree_depth(self):
        """ Calculate the depth of the subtree from this node.

        Returns
        -------
        int
            Subtree depth (at least 1 for a split node).
        """
        max_child_depth = 0
        # BUGFIX: _children is a dict, so iterating it directly yields the int
        # branch indices; the original code then called subtree_depth() on an
        # int and raised AttributeError. Iterate the child nodes instead.
        for child in self._children.values():
            if child is not None:
                depth = child.subtree_depth()
                if depth > max_child_depth:
                    max_child_depth = depth
        return max_child_depth + 1

    def describe_subtree(self, ht, buffer, indent=0):
        """ Walk the tree and write its structure to a buffer string.

        Parameters
        ----------
        ht: GRF_HoeffdingTree
            The tree to describe.
        buffer: list of str
            One-element list used as a mutable string buffer.
        indent: int
            Indentation level (number of white spaces for current node).
        """
        # NOTE(review): assumes branch indices are contiguous 0..num_children()-1;
        # sparse indices would be skipped silently — confirm against set_child callers.
        for branch_idx in range(self.num_children()):
            child = self.get_child(branch_idx)
            if child is not None:
                buffer[0] += textwrap.indent('if ', ' ' * indent)
                buffer[0] += self._split_test.describe_condition_for_branch(branch_idx)
                buffer[0] += ':\n'
                child.describe_subtree(ht, buffer, indent + 2)
class LearningNode(Node):
    """ Abstract base for leaf nodes that can learn from incoming instances.

    Parameters
    ----------
    initial_class_observations: dict (class_value, weight) or None
        Initial class observations.
    """

    def __init__(self, initial_class_observations=None):
        """ LearningNode class constructor. """
        super().__init__(initial_class_observations)

    def learn_from_instance(self, X, y, weight, ht):
        """ Update the node with the provided instance.

        Concrete learning-node subclasses override this hook; the base
        implementation does nothing.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes for updating the node.
        y: int
            Instance class.
        weight: float
            Instance weight.
        ht: GRF_HoeffdingTree
            Hoeffding Tree to update.
        """
        pass
class InactiveLearningNode(LearningNode):
    """ Deactivated leaf: keeps class statistics up to date but never grows.

    Parameters
    ----------
    initial_class_observations: dict (class_value, weight) or None
        Initial class observations.
    """

    def __init__(self, initial_class_observations=None):
        """ InactiveLearningNode class constructor. """
        super().__init__(initial_class_observations)

    def learn_from_instance(self, X, y, weight, ht):
        """ Update the node with the provided instance.

        Only the observed class distribution is maintained; no attribute
        statistics are collected, so this node can never propose a split.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes for updating the node.
        y: int
            Instance class.
        weight: float
            Instance weight.
        ht: GRF_HoeffdingTree
            Hoeffding Tree to update.
        """
        dist = self._observed_class_distribution
        dist[y] = dist.get(y, 0) + weight
class ActiveLearningNode(LearningNode):
    """ Leaf node that maintains per-attribute statistics so it can grow.

    Parameters
    ----------
    initial_class_observations: dict (class_value, weight) or None
        Initial class observations.
    """

    def __init__(self, initial_class_observations):
        """ ActiveLearningNode class constructor. """
        super().__init__(initial_class_observations)
        self._weight_seen_at_last_split_evaluation = self.get_weight_seen()
        self._attribute_observers = {}  # attribute index -> observer

    def learn_from_instance(self, X, y, weight, ht):
        """ Update the node with the provided instance.

        Updates both the class distribution and one observer per attribute
        (nominal or Gaussian-numeric, chosen via ht.nominal_attributes).

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes for updating the node.
        y: int
            Instance class.
        weight: float
            Instance weight.
        ht: GRF_HoeffdingTree
            Hoeffding Tree to update.
        """
        dist = self._observed_class_distribution
        dist[y] = dist.get(y, 0) + weight
        label = int(y)
        for att_idx, att_val in enumerate(X):
            observer = self._attribute_observers.get(att_idx)
            if observer is None:
                # Lazily create the right observer type on first sight.
                if att_idx in ht.nominal_attributes:
                    observer = NominalAttributeClassObserver()
                else:
                    observer = NumericAttributeClassObserverGaussian()
                self._attribute_observers[att_idx] = observer
            observer.observe_attribute_class(att_val, label, weight)

    def get_weight_seen(self):
        """ Calculate the total weight seen by the node.

        Returns
        -------
        float
            Total weight seen.
        """
        return sum(self._observed_class_distribution.values())

    def get_weight_seen_at_last_split_evaluation(self):
        """ Retrieve the weight seen at last split evaluation.

        Returns
        -------
        float
            Weight seen at last split evaluation.
        """
        return self._weight_seen_at_last_split_evaluation

    def set_weight_seen_at_last_split_evaluation(self, weight):
        """ Store the weight seen at last split evaluation.

        Parameters
        ----------
        weight: float
            Weight seen at last split evaluation.
        """
        self._weight_seen_at_last_split_evaluation = weight

    def get_best_split_suggestions(self, criterion, ht):
        """ Find possible split candidates.

        Parameters
        ----------
        criterion: SplitCriterion
            The splitting criterion to be used.
        ht: GRF_HoeffdingTree
            Hoeffding Tree.

        Returns
        -------
        list
            Split candidates: the null (no-split) suggestion first unless
            pre-pruning is disabled, then one best suggestion per observer.
        """
        suggestions = []
        pre_split_dist = self._observed_class_distribution
        if not ht.no_preprune:
            # The null split keeps the leaf as-is; it acts as the pre-pruning baseline.
            merit = criterion.get_merit_of_split(pre_split_dist, [pre_split_dist])
            suggestions.append(AttributeSplitSuggestion(None, [{}], merit))
        for att_idx, observer in self._attribute_observers.items():
            candidate = observer.get_best_evaluated_split_suggestion(
                criterion, pre_split_dist, att_idx, ht.binary_split)
            if candidate is not None:
                suggestions.append(candidate)
        return suggestions

    def disable_attribute(self, att_idx):
        """ Disable an attribute observer by replacing it with a null observer.

        Parameters
        ----------
        att_idx: int
            Attribute index.
        """
        if att_idx in self._attribute_observers:
            self._attribute_observers[att_idx] = AttributeClassObserverNull()
class LearningNodeNB(ActiveLearningNode):
    """ Active leaf that predicts with a Naive Bayes model once enough weight is seen.

    Parameters
    ----------
    initial_class_observations: dict (class_value, weight) or None
        Initial class observations.
    """

    def __init__(self, initial_class_observations):
        """ LearningNodeNB class constructor. """
        super().__init__(initial_class_observations)

    def get_class_votes(self, X, ht):
        """ Get the votes per class for a given instance.

        Falls back to the majority-class distribution until the leaf has
        seen at least `ht.nb_threshold` total weight.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes.
        ht: GRF_HoeffdingTree
            Hoeffding Tree.

        Returns
        -------
        dict (class_value, weight)
            Class votes for the given instance.
        """
        if self.get_weight_seen() < ht.nb_threshold:
            return super().get_class_votes(X, ht)
        return do_naive_bayes_prediction(X, self._observed_class_distribution, self._attribute_observers)

    def disable_attribute(self, att_index):
        """ Disable an attribute observer.

        Intentionally a no-op: Naive Bayes leaves keep all observers because
        even "poor" attributes contribute to the NB prediction.

        Parameters
        ----------
        att_index: int
            Attribute index.
        """
        pass
class LearningNodeNBAdaptive(LearningNodeNB):
    """ Leaf that tracks Majority-Class vs Naive Bayes accuracy and votes with the better one.

    Parameters
    ----------
    initial_class_observations: dict (class_value, weight) or None
        Initial class observations.
    """

    def __init__(self, initial_class_observations):
        """ LearningNodeNBAdaptive class constructor. """
        super().__init__(initial_class_observations)
        # Running weights of correctly-classified instances per predictor.
        self._mc_correct_weight = 0.0
        self._nb_correct_weight = 0.0

    def learn_from_instance(self, X, y, weight, ht):
        """ Update the node with the provided instance.

        Before updating the statistics, scores both the majority-class and
        the Naive Bayes predictor against the true label so the node can
        later vote with whichever has been more accurate.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes for updating the node.
        y: int
            Instance class.
        weight: float
            The instance's weight.
        ht: GRF_HoeffdingTree
            The Hoeffding Tree to update.
        """
        dist = self._observed_class_distribution
        if not dist:
            # Empty distribution: all classes tie, so default to class 0.
            if y == 0:
                self._mc_correct_weight += weight
        elif max(dist, key=dist.get) == y:
            self._mc_correct_weight += weight
        nb_votes = do_naive_bayes_prediction(X, dist, self._attribute_observers)
        if max(nb_votes, key=nb_votes.get) == y:
            self._nb_correct_weight += weight
        super().learn_from_instance(X, y, weight, ht)

    def get_class_votes(self, X, ht):
        """ Get the votes per class for a given instance.

        Parameters
        ----------
        X: numpy.ndarray of length equal to the number of features.
            Instance attributes.
        ht: GRF_HoeffdingTree
            Hoeffding Tree.

        Returns
        -------
        dict (class_value, weight)
            Class votes from whichever predictor has accumulated more
            correct weight (ties go to Naive Bayes).
        """
        mc_is_better = self._mc_correct_weight > self._nb_correct_weight
        if mc_is_better:
            return self._observed_class_distribution
        return do_naive_bayes_prediction(X, self._observed_class_distribution, self._attribute_observers)
# ====================================
# == Hoeffding Tree implementation ===
# ====================================
def __init__(self,
             max_byte_size=33554432,
             memory_estimate_period=1000000,
             grace_period=200,
             split_criterion='info_gain',
             split_confidence=0.0000001,
             tie_threshold=0.05,
             binary_split=False,
             stop_mem_management=False,
             remove_poor_atts=False,
             no_preprune=False,
             leaf_prediction='nba',
             nb_threshold=0,
             nominal_attributes=None):
    """ HoeffdingTree class constructor."""
    super().__init__()
    # Each assignment below routes through the matching property setter, which
    # validates the string options (split_criterion, leaf_prediction) and
    # normalizes nominal_attributes (None -> []).
    self.max_byte_size = max_byte_size
    self.memory_estimate_period = memory_estimate_period
    self.grace_period = grace_period
    self.split_criterion = split_criterion
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.binary_split = binary_split
    self.stop_mem_management = stop_mem_management
    self.remove_poor_atts = remove_poor_atts
    self.no_preprune = no_preprune
    self.leaf_prediction = leaf_prediction
    self.nb_threshold = nb_threshold
    self.nominal_attributes = nominal_attributes
    # self._numeric_estimator_option = 'GaussianNumericAttributeClassObserver'  # Numeric estimator to use.
    # self._nominal_estimator_option = 'NominalAttributeClassObserver'  # Nominal estimator to use.
    # Internal model state (reset() restores these to the same values).
    self._tree_root = None                             # root Node, built lazily on first training instance
    self._decision_node_cnt = 0
    self._active_leaf_node_cnt = 0
    self._inactive_leaf_node_cnt = 0
    self._inactive_leaf_byte_size_estimate = 0.0
    self._active_leaf_byte_size_estimate = 0.0
    self._byte_size_estimate_overhead_fraction = 1.0
    self._growth_allowed = True
    self._train_weight_seen_by_model = 0.0
    self.classes = None                                # class values seen in the stream (set in partial_fit)
    # GaussianRF-specific state; presumably consumed by the GRF feature coding
    # elsewhere in this file — not used in the code visible here, confirm.
    self.min_max_data = []
    self.gamma = 0.0
    self.n_gaussianRF = 0
    self.time_coding = 1.0
@property
def max_byte_size(self):
    """ int: maximum memory (in bytes) the tree is allowed to consume. """
    return self._max_byte_size

@max_byte_size.setter
def max_byte_size(self, max_byte_size):
    self._max_byte_size = max_byte_size

@property
def memory_estimate_period(self):
    """ int: number of instances between memory-consumption checks. """
    return self._memory_estimate_period

@memory_estimate_period.setter
def memory_estimate_period(self, memory_estimate_period):
    self._memory_estimate_period = memory_estimate_period

@property
def grace_period(self):
    """ int: number of instances a leaf observes between split attempts. """
    return self._grace_period

@grace_period.setter
def grace_period(self, grace_period):
    self._grace_period = grace_period
@property
def split_criterion(self):
    """ str: split criterion in use ('gini' or 'info_gain'). """
    return self._split_criterion

@split_criterion.setter
def split_criterion(self, split_criterion):
    # Unknown criteria fall back to information gain rather than raising.
    if split_criterion not in (GINI_SPLIT, INFO_GAIN_SPLIT):
        # BUGFIX: the log message was missing the opening quote around the
        # rejected option ("Invalid option {}'...").
        logger.info("Invalid option '{}', will use default '{}'".format(split_criterion, INFO_GAIN_SPLIT))
        self._split_criterion = INFO_GAIN_SPLIT
    else:
        self._split_criterion = split_criterion
@property
def split_confidence(self):
    """ float: allowed error in a split decision (closer to 0 = slower to decide). """
    return self._split_confidence

@split_confidence.setter
def split_confidence(self, split_confidence):
    self._split_confidence = split_confidence

@property
def tie_threshold(self):
    """ float: threshold below which a split is forced to break ties. """
    return self._tie_threshold

@tie_threshold.setter
def tie_threshold(self, tie_threshold):
    self._tie_threshold = tie_threshold

@property
def binary_split(self):
    """ bool: if True, only binary splits are allowed. """
    return self._binary_split

@binary_split.setter
def binary_split(self, binary_split):
    self._binary_split = binary_split

@property
def stop_mem_management(self):
    """ bool: if True, growth stops as soon as the memory limit is hit. """
    return self._stop_mem_management

@stop_mem_management.setter
def stop_mem_management(self, stop_mem_management):
    self._stop_mem_management = stop_mem_management

@property
def remove_poor_atts(self):
    """ bool: if True, poor attributes are disabled. """
    return self._remove_poor_atts

@remove_poor_atts.setter
def remove_poor_atts(self, remove_poor_atts):
    self._remove_poor_atts = remove_poor_atts

@property
def no_preprune(self):
    """ bool: if True, pre-pruning (the null-split option) is disabled. """
    return self._no_preprune

@no_preprune.setter
def no_preprune(self, no_pre_prune):
    self._no_preprune = no_pre_prune
@property
def leaf_prediction(self):
    """ str: leaf prediction mechanism ('mc', 'nb' or 'nba'). """
    return self._leaf_prediction

@leaf_prediction.setter
def leaf_prediction(self, leaf_prediction):
    # Unknown mechanisms fall back to Naive Bayes Adaptive rather than raising.
    if leaf_prediction not in (MAJORITY_CLASS, NAIVE_BAYES, NAIVE_BAYES_ADAPTIVE):
        # BUGFIX: the log message was missing the opening quote around the
        # rejected option ("Invalid option {}'...").
        logger.info("Invalid option '{}', will use default '{}'".format(leaf_prediction, NAIVE_BAYES_ADAPTIVE))
        self._leaf_prediction = NAIVE_BAYES_ADAPTIVE
    else:
        self._leaf_prediction = leaf_prediction
@property
def nb_threshold(self):
    """ int: weight a leaf must observe before Naive Bayes is allowed. """
    return self._nb_threshold

@nb_threshold.setter
def nb_threshold(self, nb_threshold):
    self._nb_threshold = nb_threshold

@property
def nominal_attributes(self):
    """ list: indices of nominal attributes (empty means all numerical). """
    return self._nominal_attributes

@nominal_attributes.setter
def nominal_attributes(self, nominal_attributes):
    # None is normalized to an empty list so membership tests always work.
    if nominal_attributes is None:
        nominal_attributes = []
        logger.debug("No Nominal attributes have been defined, will consider all attributes as numerical.")
    self._nominal_attributes = nominal_attributes

@property
def classes(self):
    """ list or numpy.array or None: class values present in the stream. """
    return self._classes

@classes.setter
def classes(self, value):
    self._classes = value
def measure_byte_size(self):
    """ Calculate the size of the tree.

    Delegates to skmultiflow's `calculate_object_size` utility, which walks
    the whole object graph rooted at this model.

    Returns
    -------
    int
        Size of the tree in bytes.
    """
    return calculate_object_size(self)
def reset(self):
    """ Restore the Hoeffding Tree to its freshly-constructed state.

    Drops the learned structure, clears all node/size bookkeeping, re-enables
    growth and zeroes the weight seen by the model. Configuration parameters
    set at construction are kept, except that poor-attribute removal is
    cleared when leaves do not predict with majority class.
    """
    # Discard the learned structure and counters.
    self._tree_root = None
    self._decision_node_cnt = 0
    self._active_leaf_node_cnt = 0
    self._inactive_leaf_node_cnt = 0
    # Reset memory-management estimates.
    self._inactive_leaf_byte_size_estimate = 0.0
    self._active_leaf_byte_size_estimate = 0.0
    self._byte_size_estimate_overhead_fraction = 1.0
    self._growth_allowed = True
    # Poor-attribute removal only applies to majority-class leaves.
    if self._leaf_prediction != MAJORITY_CLASS:
        self._remove_poor_atts = None
    self._train_weight_seen_by_model = 0.0
def fit(self, X, y, classes=None, weight=None):
    """ Batch fitting is not supported; use `partial_fit` for incremental training.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError
def partial_fit(self, X, y, classes=None, weight=None):
    """ Incrementally train the model on samples X with targets y.

    Before training: instance weights default to uniform 1.0 and are
    broadcast to one weight per row when a single weight is given; the
    stream's class values are recorded on first sight. Each row is then
    fed to `_partial_fit` one at a time, skipping zero-weight rows.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Instance attributes.
    y: array_like
        Classes (targets) for all samples in X.
    classes: list or numpy.array
        Contains the class values in the stream. If defined, will be used to
        define the length of the arrays returned by `predict_proba`.
    weight: float or array-like
        Instance weight. If not provided, uniform weights are assumed.
    """
    if classes is not None and self.classes is None:
        self.classes = classes
    if y is None:
        return
    if weight is None:
        weight = np.array([1.0])
    row_cnt, _ = get_dimensions(X)
    weight_cnt, _ = get_dimensions(weight)
    if weight_cnt != row_cnt:
        # Broadcast a single weight across all rows.
        weight = [weight[0]] * row_cnt
    for idx in range(row_cnt):
        w = weight[idx]
        if w != 0.0:
            self._train_weight_seen_by_model += w
            self._partial_fit(X[idx], y[idx], w)
def _partial_fit(self, X, y, weight):
    """ Trains the model on a single sample X with target y.

    Private function where actual training is carried on: creates the root
    on first call, routes the instance to its leaf, updates the leaf, and
    attempts a split once the leaf has accumulated `grace_period` weight
    since its last split evaluation.

    Parameters
    ----------
    X: numpy.ndarray of length equal to the number of features.
        Instance attributes.
    y: int
        Instance class.
    weight: float
        Instance weight.
    """
    # Lazily create the root as a learning leaf on the first instance.
    if self._tree_root is None:
        self._tree_root = self._new_learning_node()
        self._active_leaf_node_cnt = 1
    found_node = self._tree_root.filter_instance_to_leaf(X, None, -1)
    leaf_node = found_node.node
    if leaf_node is None:
        # The instance reached a branch with no child yet: grow a new leaf there.
        leaf_node = self._new_learning_node()
        found_node.parent.set_child(found_node.parent_branch, leaf_node)
        self._active_leaf_node_cnt += 1
    if isinstance(leaf_node, self.LearningNode):
        learning_node = leaf_node
        learning_node.learn_from_instance(X, y, weight, self)
        if self._growth_allowed and isinstance(learning_node, self.ActiveLearningNode):
            active_learning_node = learning_node
            weight_seen = active_learning_node.get_weight_seen()
            # Only re-evaluate splits after grace_period worth of new weight.
            weight_diff = weight_seen - active_learning_node.get_weight_seen_at_last_split_evaluation()
            if weight_diff >= self.grace_period:
                self._attempt_to_split(active_learning_node, found_node.parent, found_node.parent_branch)
                active_learning_node.set_weight_seen_at_last_split_evaluation(weight_seen)
    # NOTE(review): float modulo — with non-integer instance weights this may
    # never hit exactly 0, skipping memory estimation; confirm intended.
    if self._train_weight_seen_by_model % self.memory_estimate_period == 0:
        self.estimate_model_byte_size()
def get_votes_for_instance(self, X):
""" Get class votes for a single instance.
Parameters
----------
X: numpy.ndarray of length equal to the number of features.
Instance attributes.
Returns
-------
dict (class_value, weight)
"""
if self._tree_root is not None:
found_node = self._tree_root.filter_instance_to_leaf(X, None, -1)
leaf_node = found_node.node
if leaf_node is None:
leaf_node = found_node.parent
return leaf_node.get_class_votes(X, self)