-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
cluster.py
282 lines (226 loc) · 9.87 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""SMOTE variant employing some clustering before the generation."""
# Authors: Guillaume Lemaitre <[email protected]>
# Fernando Nogueira
# Christos Aridas
# License: MIT
import math
import numpy as np
from scipy import sparse
from sklearn.base import clone
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances
from sklearn.utils import _safe_indexing
from ..base import BaseOverSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
from ...utils._validation import _deprecate_positional_args
from .base import BaseSMOTE
@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class KMeansSMOTE(BaseSMOTE):
    """Apply a KMeans clustering before over-sampling using SMOTE.

    This is an implementation of the algorithm described in [1]_.

    Read more in the :ref:`User Guide <smote_adasyn>`.

    .. versionadded:: 0.5

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    k_neighbors : int or object, default=2
        If ``int``, number of nearest neighbours to use to construct synthetic
        samples. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    {n_jobs}

    kmeans_estimator : int or object, default=None
        A KMeans instance or the number of clusters to be used. By default,
        we used a :class:`~sklearn.cluster.MiniBatchKMeans` which tend to be
        better with large number of samples.

    cluster_balance_threshold : "auto" or float, default="auto"
        The threshold at which a cluster is called balanced and where samples
        of the class selected for SMOTE will be oversampled. If "auto", this
        will be determined by the ratio for each class, or it can be set
        manually.

    density_exponent : "auto" or float, default="auto"
        This exponent is used to determine the density of a cluster. Leaving
        this to "auto" will use a feature-length based exponent.

    Attributes
    ----------
    kmeans_estimator_ : estimator
        The fitted clustering method used before to apply SMOTE.

    nn_k_ : estimator
        The fitted k-NN estimator used in SMOTE.

    cluster_balance_threshold_ : float
        The threshold used during ``fit`` for calling a cluster balanced.

    See Also
    --------
    SMOTE : Over-sample using SMOTE.

    SMOTENC : Over-sample using SMOTE for continuous and categorical features.

    SMOTEN : Over-sample using the SMOTE variant specifically for categorical
        features only.

    SVMSMOTE : Over-sample using SVM-SMOTE variant.

    BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.

    ADASYN : Over-sample using ADASYN.

    References
    ----------
    .. [1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for
       Imbalanced Learning Based on K-Means and SMOTE"
       https://arxiv.org/abs/1711.00837

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.over_sampling import KMeansSMOTE
    >>> from sklearn.datasets import make_blobs
    >>> blobs = [100, 800, 100]
    >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
    >>> # Add a single 0 sample in the middle blob
    >>> X = np.concatenate([X, [[0, 0]]])
    >>> y = np.append(y, 0)
    >>> # Make this a binary classification problem
    >>> y = y == 1
    >>> sm = KMeansSMOTE(random_state=42)
    >>> X_res, y_res = sm.fit_resample(X, y)
    >>> # Find the number of new samples in the middle blob
    >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
    >>> print("Samples in the middle blob: %s" % n_res_in_middle)
    Samples in the middle blob: 801
    >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
    Middle blob unchanged: True
    >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
    More 0 samples: True
    """

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        k_neighbors=2,
        n_jobs=None,
        kmeans_estimator=None,
        cluster_balance_threshold="auto",
        density_exponent="auto",
    ):
        super().__init__(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
            k_neighbors=k_neighbors,
            n_jobs=n_jobs,
        )
        self.kmeans_estimator = kmeans_estimator
        self.cluster_balance_threshold = cluster_balance_threshold
        self.density_exponent = density_exponent

    def _validate_estimator(self):
        """Create and validate the clustering estimator and parameters."""
        super()._validate_estimator()
        # accept either nothing (default MiniBatchKMeans), an int (number of
        # clusters), or a pre-configured clustering estimator
        if self.kmeans_estimator is None:
            self.kmeans_estimator_ = MiniBatchKMeans(random_state=self.random_state)
        elif isinstance(self.kmeans_estimator, int):
            self.kmeans_estimator_ = MiniBatchKMeans(
                n_clusters=self.kmeans_estimator,
                random_state=self.random_state,
            )
        else:
            self.kmeans_estimator_ = clone(self.kmeans_estimator)

        # validate the parameters: string values other than "auto" are invalid
        for param_name in ("cluster_balance_threshold", "density_exponent"):
            param = getattr(self, param_name)
            if isinstance(param, str) and param != "auto":
                raise ValueError(
                    f"'{param_name}' should be 'auto' when a string is passed."
                    f" Got {repr(param)} instead."
                )

        # with a single cluster, any balance threshold would reject it, so
        # disable the threshold entirely
        self.cluster_balance_threshold_ = (
            self.cluster_balance_threshold
            if self.kmeans_estimator_.n_clusters != 1
            else -np.inf
        )

    def _find_cluster_sparsity(self, X):
        """Compute the cluster sparsity."""
        euclidean_distances = pairwise_distances(
            X, metric="euclidean", n_jobs=self.n_jobs
        )
        # zero out the diagonal so self-distances do not contribute to the
        # mean pairwise distance
        np.fill_diagonal(euclidean_distances, 0)
        non_diag_elements = (X.shape[0] ** 2) - X.shape[0]
        mean_distance = euclidean_distances.sum() / non_diag_elements
        # feature-length based exponent heuristic from the reference paper
        exponent = (
            math.log(X.shape[0], 1.6) ** 1.8 * 0.16
            if self.density_exponent == "auto"
            else self.density_exponent
        )
        return (mean_distance ** exponent) / X.shape[0]

    def _fit_resample(self, X, y):
        """Cluster ``X`` and apply SMOTE inside each eligible cluster."""
        self._validate_estimator()
        X_resampled = X.copy()
        y_resampled = y.copy()
        total_inp_samples = sum(self.sampling_strategy_.values())

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            X_clusters = self.kmeans_estimator_.fit_predict(X)
            valid_clusters = []
            cluster_sparsities = []

            # the balance threshold is identical for every cluster of this
            # class, so compute it once outside the cluster loop
            if self.cluster_balance_threshold_ == "auto":
                balance_threshold = n_samples / total_inp_samples / 2
            else:
                balance_threshold = self.cluster_balance_threshold_

            # identify clusters which are answering the requirements
            for cluster_idx in range(self.kmeans_estimator_.n_clusters):
                cluster_mask = np.flatnonzero(X_clusters == cluster_idx)
                X_cluster = _safe_indexing(X, cluster_mask)
                y_cluster = _safe_indexing(y, cluster_mask)

                cluster_class_mean = (y_cluster == class_sample).mean()

                # the cluster is already considered balanced
                if cluster_class_mean < balance_threshold:
                    continue

                # not enough samples to apply SMOTE
                anticipated_samples = cluster_class_mean * X_cluster.shape[0]
                if anticipated_samples < self.nn_k_.n_neighbors:
                    continue

                X_cluster_class = _safe_indexing(
                    X_cluster, np.flatnonzero(y_cluster == class_sample)
                )

                valid_clusters.append(cluster_mask)
                cluster_sparsities.append(self._find_cluster_sparsity(X_cluster_class))

            # fail early before computing weights from an empty array
            if not valid_clusters:
                raise RuntimeError(
                    f"No clusters found with sufficient samples of "
                    f"class {class_sample}. Try lowering the "
                    f"cluster_balance_threshold or increasing the number of "
                    f"clusters."
                )

            # sparser clusters receive proportionally more synthetic samples
            cluster_sparsities = np.array(cluster_sparsities)
            cluster_weights = cluster_sparsities / cluster_sparsities.sum()

            for valid_cluster_idx, valid_cluster in enumerate(valid_clusters):
                X_cluster = _safe_indexing(X, valid_cluster)
                y_cluster = _safe_indexing(y, valid_cluster)

                X_cluster_class = _safe_indexing(
                    X_cluster, np.flatnonzero(y_cluster == class_sample)
                )

                self.nn_k_.fit(X_cluster_class)
                # drop the first neighbour: it is the sample itself
                nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[
                    :, 1:
                ]

                cluster_n_samples = int(
                    math.ceil(n_samples * cluster_weights[valid_cluster_idx])
                )
                X_new, y_new = self._make_samples(
                    X_cluster_class,
                    y.dtype,
                    class_sample,
                    X_cluster_class,
                    nns,
                    cluster_n_samples,
                    1.0,
                )

                stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))]
                X_resampled = stack((X_resampled, X_new))
                y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled, y_resampled