Skip to content

Commit 2e7c070

Browse files
authored
[MRG] EHN add voting parameter for ClusterCentroids (#318)
* EHN POC sparse handling for RandomUnderSampler * EHN support sparse ENN * iter * EHN sparse indexing IHT * EHN sparse support nearmiss * EHN support sparse matrices for NCR * EHN support sparse Tomek and OSS * EHN support sparsity for CNN * EHN support sparse for SMOTE * EHN support sparse adasyn * EHN support sparsity for combine methods * EHN support sparsity BC * DOC update docstring * DOC fix example topic classification * FIX fix test and class clustercentroids * TST add common test * TST add ensemble * TST use allclose * TST install conda with ubuntu container * TST increase tolerance * TST increase tolerance * TST test all versions NearMiss and SMOTE * TST set the algorithm of KMeans * DOC add entry in user guide * DOC add entry sparse for CC * DOC whatsnew entry * EHN add voting parameter for ClusterCentroids * TST fix common test fixing voting
1 parent cddf39b commit 2e7c070

File tree

6 files changed

+148
-42
lines changed

6 files changed

+148
-42
lines changed

doc/introduction.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ Introduction
99
API's of imbalanced-learn samplers
1010
----------------------------------
1111

12-
The available samplers follow the scikit-learn API using the base estimator and adding a sampling functionality through the ``sample`` method::
12+
The available samplers follow the scikit-learn API using the base estimator
13+
and adding a sampling functionality through the ``sample`` method::
1314

1415
:Estimator:
1516

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ Bug fixes
4040
New features
4141
~~~~~~~~~~~~
4242

43+
- :class:`under_sampling.ClusterCentroids` accepts a parameter ``voting``
44+
allowing to use nearest-neighbors of centroids instead of centroids
45+
themselves. It is more efficient for sparse input. By `Guillaume Lemaitre`_.
46+
4347
- Turn off steps in :class:`pipeline.Pipeline` using the `None`
4448
object. By `Christos Aridas`_.
4549

examples/under-sampling/plot_cluster_centroids.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
2525
n_informative=3, n_redundant=1, flip_y=0,
2626
n_features=20, n_clusters_per_class=1,
27-
n_samples=200, random_state=10)
27+
n_samples=50, random_state=10)
2828

2929
# Instantiate a PCA object for the sake of easy visualisation
3030
pca = PCA(n_components=2)
@@ -34,25 +34,46 @@
3434
# Apply Cluster Centroids
3535
cc = ClusterCentroids()
3636
X_resampled, y_resampled = cc.fit_sample(X, y)
37-
X_res_vis = pca.transform(X_resampled)
37+
X_res_vis_soft = pca.transform(X_resampled)
38+
39+
# Use hard voting instead of soft voting
40+
cc = ClusterCentroids(voting='hard')
41+
X_resampled, y_resampled = cc.fit_sample(X, y)
42+
X_res_vis_hard = pca.transform(X_resampled)
3843

3944
# Two subplots, unpack the axes array immediately
40-
f, (ax1, ax2) = plt.subplots(1, 2)
45+
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
4146

4247
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
4348
alpha=0.5)
4449
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
4550
alpha=0.5)
4651
ax1.set_title('Original set')
4752

48-
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
53+
ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
54+
X_res_vis_soft[y_resampled == 0, 1],
55+
label="Class #0", alpha=.5)
56+
ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
57+
X_res_vis_soft[y_resampled == 1, 1],
58+
label="Class #1", alpha=.5)
59+
c2 = ax2.scatter(X_vis[y == 1, 0],
60+
X_vis[y == 1, 1], label="Original #1",
61+
alpha=0.2)
62+
ax2.set_title('Cluster centroids with soft voting')
63+
64+
ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
65+
X_res_vis_hard[y_resampled == 0, 1],
4966
label="Class #0", alpha=.5)
50-
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
67+
ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
68+
X_res_vis_hard[y_resampled == 1, 1],
5169
label="Class #1", alpha=.5)
52-
ax2.set_title('Cluster centroids')
70+
ax3.scatter(X_vis[y == 1, 0],
71+
X_vis[y == 1, 1],
72+
alpha=0.2)
73+
ax3.set_title('Cluster centroids with hard voting')
5374

5475
# make nice plotting
55-
for ax in (ax1, ax2):
76+
for ax in (ax1, ax2, ax3):
5677
ax.spines['top'].set_visible(False)
5778
ax.spines['right'].set_visible(False)
5879
ax.get_xaxis().tick_bottom()
@@ -62,7 +83,8 @@
6283
ax.set_xlim([-6, 8])
6384
ax.set_ylim([-6, 6])
6485

65-
plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center',
66-
ncol=2, labelspacing=0.)
86+
plt.figlegend((c0, c1), ('Class #0', 'Class #1', 'Original Class #1'),
87+
loc='lower center',
88+
ncol=3, labelspacing=0.)
6789
plt.tight_layout(pad=3)
6890
plt.show()

imblearn/under_sampling/prototype_generation/cluster_centroids.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,13 @@
1212
from scipy import sparse
1313

1414
from sklearn.cluster import KMeans
15+
from sklearn.neighbors import NearestNeighbors
1516
from sklearn.utils import safe_indexing
1617

1718
from ..base import BaseUnderSampler
1819

20+
VOTING_KIND = ('auto', 'hard', 'soft')
21+
1922

2023
class ClusterCentroids(BaseUnderSampler):
2124
"""Perform under-sampling by generating centroids based on
@@ -58,6 +61,18 @@ class ClusterCentroids(BaseUnderSampler):
5861
estimator : object, optional(default=KMeans())
5962
Pass a :class:`sklearn.cluster.KMeans` estimator.
6063
64+
voting : str, optional (default='auto')
65+
Voting strategy to generate the new samples:
66+
67+
- If ``'hard'``, the nearest-neighbors of the centroids found using the
68+
clustering algorithm will be used.
69+
- If ``'soft'``, the centroids found by the clustering algorithm will
70+
be used.
71+
- If ``'auto'``, if the input is sparse, it will default to ``'hard'``
72+
otherwise, ``'soft'`` will be used.
73+
74+
.. versionadded:: 0.3.0
75+
6176
n_jobs : int, optional (default=1)
6277
The number of threads to open if possible.
6378
@@ -91,10 +106,12 @@ def __init__(self,
91106
ratio='auto',
92107
random_state=None,
93108
estimator=None,
109+
voting='auto',
94110
n_jobs=1):
95111
super(ClusterCentroids, self).__init__(
96112
ratio=ratio, random_state=random_state)
97113
self.estimator = estimator
114+
self.voting = voting
98115
self.n_jobs = n_jobs
99116

100117
def _validate_estimator(self):
@@ -108,6 +125,22 @@ def _validate_estimator(self):
108125
raise ValueError('`estimator` has to be a KMeans clustering.'
109126
' Got {} instead.'.format(type(self.estimator)))
110127

128+
def _generate_sample(self, X, y, centroids, target_class):
129+
if self.voting_ == 'hard':
130+
nearest_neighbors = NearestNeighbors(n_neighbors=1)
131+
nearest_neighbors.fit(X, y)
132+
indices = nearest_neighbors.kneighbors(centroids,
133+
return_distance=False)
134+
X_new = safe_indexing(X, np.squeeze(indices))
135+
else:
136+
if sparse.issparse(X):
137+
X_new = sparse.csr_matrix(centroids)
138+
else:
139+
X_new = centroids
140+
y_new = np.array([target_class] * centroids.shape[0])
141+
142+
return X_new, y_new
143+
111144
def _sample(self, X, y):
112145
"""Resample the dataset.
113146
@@ -131,28 +164,37 @@ def _sample(self, X, y):
131164
"""
132165
self._validate_estimator()
133166

134-
idx_under = np.empty((0, ), dtype=int)
135-
centroids, y_resampled = [], []
167+
if self.voting == 'auto':
168+
if sparse.issparse(X):
169+
self.voting_ = 'hard'
170+
else:
171+
self.voting_ = 'soft'
172+
else:
173+
if self.voting in VOTING_KIND:
174+
self.voting_ = self.voting
175+
else:
176+
raise ValueError("'voting' needs to be one of {}. Got {}"
177+
" instead.".format(VOTING_KIND, self.voting))
178+
179+
X_resampled, y_resampled = [], []
136180
for target_class in np.unique(y):
137181
if target_class in self.ratio_.keys():
138182
n_samples = self.ratio_[target_class]
139183
self.estimator_.set_params(**{'n_clusters': n_samples})
140184
self.estimator_.fit(X[y == target_class])
141-
centroids.append(self.estimator_.cluster_centers_)
142-
y_resampled += [target_class] * n_samples
143-
185+
X_new, y_new = self._generate_sample(
186+
X, y, self.estimator_.cluster_centers_, target_class)
187+
X_resampled.append(X_new)
188+
y_resampled.append(y_new)
144189
else:
145190
target_class_indices = np.flatnonzero(y == target_class)
146-
idx_under = np.concatenate(
147-
(idx_under, target_class_indices), axis=0)
148-
149-
X_resampled = np.concatenate((centroids))
191+
X_resampled.append(safe_indexing(X, target_class_indices))
192+
y_resampled.append(safe_indexing(y, target_class_indices))
150193

151194
if sparse.issparse(X):
152-
X_resampled = sparse.vstack([sparse.csr_matrix(X_resampled),
153-
safe_indexing(X, idx_under)])
195+
X_resampled = sparse.vstack(X_resampled)
154196
else:
155-
X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under)))
156-
y_resampled = np.hstack((y_resampled, safe_indexing(y, idx_under)))
197+
X_resampled = np.vstack(X_resampled)
198+
y_resampled = np.hstack(y_resampled)
157199

158200
return X_resampled, np.array(y_resampled)

imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py

Lines changed: 55 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
from collections import Counter
55

66
import numpy as np
7+
from scipy import sparse
78
from pytest import raises
89

910
from sklearn.utils.testing import assert_allclose
1011
from sklearn.utils.testing import assert_array_equal
11-
1212
from sklearn.cluster import KMeans
1313

1414
from imblearn.under_sampling import ClusterCentroids
@@ -23,17 +23,26 @@
2323
R_TOL = 1e-4
2424

2525

26+
def test_fit_sample_check_voting():
27+
cc = ClusterCentroids(random_state=RND_SEED)
28+
cc.fit_sample(X, Y)
29+
assert cc.voting_ == 'soft'
30+
cc = ClusterCentroids(random_state=RND_SEED)
31+
cc.fit_sample(sparse.csr_matrix(X), Y)
32+
assert cc.voting_ == 'hard'
33+
34+
2635
def test_fit_sample_auto():
2736
ratio = 'auto'
2837
cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
2938
X_resampled, y_resampled = cc.fit_sample(X, Y)
30-
X_gt = np.array([[0.06738818, -0.529627],
31-
[0.17901516, 0.69860992],
32-
[0.094035, -2.55298982],
33-
[0.92923648, 0.76103773],
39+
X_gt = np.array([[0.92923648, 0.76103773],
3440
[0.47104475, 0.44386323],
35-
[0.13347175, 0.12167502]])
36-
y_gt = np.array([1, 1, 1, 0, 0, 0])
41+
[0.13347175, 0.12167502],
42+
[0.06738818, -0.529627],
43+
[0.17901516, 0.69860992],
44+
[0.094035, -2.55298982]])
45+
y_gt = np.array([0, 0, 0, 1, 1, 1])
3746
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
3847
assert_array_equal(y_resampled, y_gt)
3948

@@ -42,16 +51,16 @@ def test_fit_sample_half():
4251
ratio = .5
4352
cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
4453
X_resampled, y_resampled = cc.fit_sample(X, Y)
45-
X_gt = np.array([[0.09125309, -0.85409574],
54+
X_gt = np.array([[0.92923648, 0.76103773],
55+
[0.47104475, 0.44386323],
56+
[0.13347175, 0.12167502],
57+
[0.09125309, -0.85409574],
4658
[0.19220316, 0.32337101],
4759
[0.094035, -2.55298982],
4860
[0.20792588, 1.49407907],
4961
[0.04352327, -0.20515826],
50-
[0.12372842, 0.6536186],
51-
[0.92923648, 0.76103773],
52-
[0.47104475, 0.44386323],
53-
[0.13347175, 0.12167502]])
54-
y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0])
62+
[0.12372842, 0.6536186]])
63+
y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
5564
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
5665
assert_array_equal(y_resampled, y_gt)
5766

@@ -75,21 +84,48 @@ def test_fit_sample_object():
7584
ratio=ratio, random_state=RND_SEED, estimator=cluster)
7685

7786
X_resampled, y_resampled = cc.fit_sample(X, Y)
78-
X_gt = np.array([[0.06738818, -0.529627],
87+
X_gt = np.array([[0.92923648, 0.76103773],
88+
[0.47104475, 0.44386323],
89+
[0.13347175, 0.12167502],
90+
[0.06738818, -0.529627],
7991
[0.17901516, 0.69860992],
80-
[0.094035, -2.55298982],
81-
[0.92923648, 0.76103773],
92+
[0.094035, -2.55298982]])
93+
y_gt = np.array([0, 0, 0, 1, 1, 1])
94+
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
95+
assert_array_equal(y_resampled, y_gt)
96+
97+
98+
def test_fit_hard_voting():
99+
ratio = 'auto'
100+
voting = 'hard'
101+
cluster = KMeans(random_state=RND_SEED)
102+
cc = ClusterCentroids(
103+
ratio=ratio, random_state=RND_SEED, estimator=cluster,
104+
voting=voting)
105+
106+
X_resampled, y_resampled = cc.fit_sample(X, Y)
107+
X_gt = np.array([[0.92923648, 0.76103773],
82108
[0.47104475, 0.44386323],
83-
[0.13347175, 0.12167502]])
84-
y_gt = np.array([1, 1, 1, 0, 0, 0])
109+
[0.13347175, 0.12167502],
110+
[0.09125309, -0.85409574],
111+
[0.12372842, 0.6536186],
112+
[0.094035, -2.55298982]])
113+
y_gt = np.array([0, 0, 0, 1, 1, 1])
85114
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
86115
assert_array_equal(y_resampled, y_gt)
116+
for x in X_resampled:
117+
assert np.any(np.all(x == X, axis=1))
87118

88119

89-
def test_fit_sample_wrong_object():
120+
def test_fit_sample_error():
90121
ratio = 'auto'
91122
cluster = 'rnd'
92123
cc = ClusterCentroids(
93124
ratio=ratio, random_state=RND_SEED, estimator=cluster)
94125
with raises(ValueError, match="has to be a KMeans clustering"):
95126
cc.fit_sample(X, Y)
127+
128+
voting = 'unknown'
129+
cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
130+
with raises(ValueError, match="needs to be one of"):
131+
cc.fit_sample(X, Y)

imblearn/utils/estimator_checks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ def check_samplers_sparse(name, Sampler):
271271
elif isinstance(Sampler(), ClusterCentroids):
272272
# set KMeans to full since it support sparse and dense
273273
samplers = [Sampler(random_state=0,
274+
voting='soft',
274275
estimator=KMeans(random_state=1,
275276
algorithm='full'))]
276277
else:

0 commit comments

Comments
 (0)