-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathSplitter.py
More file actions
142 lines (104 loc) · 4.4 KB
/
Splitter.py
File metadata and controls
142 lines (104 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
from Predicate import Predicate
import cython
__author__ = 'popka'
from utils.Placement import Placement
import numpy as np
from math import fabs
import time
class Splitter():
"""
Сплиттер находит оптимальное разделение "параметр предиката"
"""
@staticmethod
def split_categorial(x, y, impurity):
combiner = Placement(np.unique(x))
max_impurity = -1
best_c = []
for c in combiner:
mask = np.in1d(x, c)
y_left = y[mask]
y_right = y[np.invert(mask)]
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_c = c
return best_c, max_impurity
@staticmethod
def split_quantitative(x, y, impurity, steps=100):
"""
Функция делит количественную переменную так, чтобы разделение приводило к максимальному уменьшению импюрити
:param x: столбец признака
:param y: столбец целевой функции
:param impurity: объект-наследний класса Impurity
:param steps: количество шагов. Если None, то просмотреть все объекты в отдельности
:return: возвращает лучшее разделение и знаение импюрити
"""
max_impurity = -1
best_value = 0
if steps is None:
argsort = x.argsort()
x = x[argsort]
y = y[argsort]
for value in x[:-1]:
y_left = y[x<=value]
y_right = y[x>value]
if len(y_right) == 0:
continue
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_value = value
else:
current = min(x)
end = max(x)
step = float(end-current)/steps
current += step
# 1.1 только, чтобы оставался в конце хотя бы один элемент. Т.е. Значение было больше одного шага и меньше двух шагов
while (current < end-step*1.1):
y_left = y[x<=current]
y_right = y[x>current]
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_value = current
current += step
return best_value, max_impurity
@staticmethod
def split_quick_quantitative(x, y, impurity):
max_gain = None
best_value = None
argsort = x.argsort()
x = x[argsort]
y = y[argsort]
imp_full = impurity.calculate_node(y)
length = len(y)
#rs - right_split
rs = Split(np.mean(y[1:]), imp_full, impurity.calculate_node(y[1:]), len(y)-1)
#ls - left_split
ls = Split(y[0], 0, 0, 1)
for i in range(1, len(y)-1):
rs.prev_mean = rs.mean
rs.mean = (rs.length*rs.mean - y[i])/(rs.length-1)
rs.var = ((rs.var*rs.length-(y[i]-rs.mean)*(y[i]-rs.prev_mean))/(rs.length-1))
rs.length -= 1.
ls.length += 1.
ls.prev_mean = ls.mean
ls.mean = ls.prev_mean+(y[i]-ls.prev_mean)/ls.length
ls.var = (((ls.length-1)*ls.var + (y[i]-ls.prev_mean)*(y[i]-ls.mean))/ls.length)
if (x[i+1]!=x[i]):
gain = imp_full - rs.var*rs.length/length - ls.var*ls.length/length
if gain > max_gain:
max_gain = gain
best_value = float(x[i]+x[i+1])/2
if i != ls.length-1:
print "error"
# если не нашли максимального (по сути, когда все значения столбца одинаковы)
# возвращаем None, None
return best_value, max_gain
class Split():
def __init__(self, mean, prev_mean, var, length):
self.mean = float(mean)
self.prev_mean = float(prev_mean)
self.var = float(var)
self.length = float(length)