Skip to content

Commit

Permalink
Merge pull request #174 from kklein/kmodes_sample_weight
Browse files Browse the repository at this point in the history
Draft implementation of `sample_weight` for kmodes.
  • Loading branch information
nicodv authored Mar 30, 2022
2 parents 4de7bf7 + 03a92c8 commit 247f193
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 36 deletions.
60 changes: 44 additions & 16 deletions kmodes/kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,22 @@ def __init__(self, n_clusters=8, max_iter=100, cat_dissim=matching_dissim,
"Setting n_init to 1.")
self.n_init = 1

def fit(self, X, y=None, **kwargs):
def fit(self, X, y=None, sample_weight=None, **kwargs):
"""Compute k-modes clustering.
Parameters
----------
X : array-like, shape=[n_samples, n_features]
sample_weight : sequence, default: None
The weight that is assigned to each individual data point when
updating the centroids.
"""
X = pandas_to_numpy(X)

random_state = check_random_state(self.random_state)
_validate_sample_weight(sample_weight, n_samples=X.shape[0])

self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
self.n_iter_, self.epoch_costs_ = k_modes(
X,
Expand All @@ -134,6 +140,7 @@ def fit(self, X, y=None, **kwargs):
self.verbose,
random_state,
self.n_jobs,
sample_weight
)
return self

Expand Down Expand Up @@ -179,7 +186,7 @@ def cluster_centroids_(self):
"because the model is not yet fitted.")


def labels_cost(X, centroids, dissim, membship=None):
def labels_cost(X, centroids, dissim, membship=None, sample_weight=None):
"""Calculate labels and cost function given a matrix of points and
a list of centroids for the k-modes algorithm.
"""
Expand All @@ -190,15 +197,17 @@ def labels_cost(X, centroids, dissim, membship=None):
cost = 0.
labels = np.empty(n_points, dtype=np.uint16)
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
diss = dissim(centroids, curpoint, X=X, membship=membship)
clust = np.argmin(diss)
labels[ipoint] = clust
cost += diss[clust]
cost += diss[clust] * weight

return labels, cost


def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs):
def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs,
sample_weight=None):
"""k-modes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
Expand Down Expand Up @@ -229,13 +238,13 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state
if n_jobs == 1:
for init_no in range(n_init):
results.append(_k_modes_single(
X, n_clusters, n_points, n_attrs, max_iter,
dissim, init, init_no, verbose, seeds[init_no]
X, n_clusters, n_points, n_attrs, max_iter, dissim, init, init_no,
verbose, seeds[init_no], sample_weight
))
else:
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(_k_modes_single)(X, n_clusters, n_points, n_attrs, max_iter,
dissim, init, init_no, verbose, seed)
dissim, init, init_no, verbose, seed, sample_weight)
for init_no, seed in enumerate(seeds))
all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)

Expand All @@ -248,7 +257,7 @@ def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state


def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, init_no,
verbose, random_state):
verbose, random_state, sample_weight=None):
random_state = check_random_state(random_state)
# _____ INIT _____
if verbose:
Expand Down Expand Up @@ -282,12 +291,13 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
cl_attr_freq = [[defaultdict(int) for _ in range(n_attrs)]
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
# Initial assignment to clusters
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
cl_attr_freq[clust][iattr][curattr] += 1
cl_attr_freq[clust][iattr][curattr] += weight
# Perform an initial centroid update.
for ik in range(n_clusters):
for iattr in range(n_attrs):
Expand All @@ -304,7 +314,7 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
labels = None
converged = False

_, cost = labels_cost(X, centroids, dissim, membship)
_, cost = labels_cost(X, centroids, dissim, membship, sample_weight)

epoch_costs = [cost]
while itr < max_iter and not converged:
Expand All @@ -315,10 +325,11 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
cl_attr_freq,
membship,
dissim,
random_state
random_state,
sample_weight
)
# All points seen in this iteration
labels, ncost = labels_cost(X, centroids, dissim, membship)
labels, ncost = labels_cost(X, centroids, dissim, membship, sample_weight)
converged = (moves == 0) or (ncost >= cost)
epoch_costs.append(ncost)
cost = ncost
Expand All @@ -329,10 +340,12 @@ def _k_modes_single(X, n_clusters, n_points, n_attrs, max_iter, dissim, init, in
return centroids, labels, cost, itr, epoch_costs


def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state,
sample_weight):
"""Single iteration of k-modes clustering algorithm"""
moves = 0
for ipoint, curpoint in enumerate(X):
weight = sample_weight[ipoint] if sample_weight is not None else 1
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
if membship[clust, ipoint]:
# Point is already in its right place.
Expand All @@ -343,7 +356,8 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
old_clust = np.argwhere(membship[:, ipoint])[0][0]

cl_attr_freq, membship, centroids = _move_point_cat(
curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids
curpoint, ipoint, clust, old_clust, cl_attr_freq, membship, centroids,
weight
)

# In case of an empty cluster, reinitialize with a random point
Expand All @@ -354,14 +368,15 @@ def _k_modes_iter(X, centroids, cl_attr_freq, membship, dissim, random_state):
rindx = random_state.choice(choices)

cl_attr_freq, membship, centroids = _move_point_cat(
X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship, centroids
X[rindx], rindx, old_clust, from_clust, cl_attr_freq, membship,
centroids, weight
)

return centroids, cl_attr_freq, membship, moves


def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
membship, centroids, sample_weight=1):
membship, centroids, sample_weight):
"""Move point between clusters, categorical attributes."""
membship[to_clust, ipoint] = 1
membship[from_clust, ipoint] = 0
Expand Down Expand Up @@ -390,3 +405,16 @@ def _move_point_cat(point, ipoint, to_clust, from_clust, cl_attr_freq,
centroids[from_clust][iattr] = get_max_value_key(from_attr_counts)

return cl_attr_freq, membship, centroids


def _validate_sample_weight(sample_weight, n_samples):
    """Validate the optional per-sample weights passed to ``fit``.

    Parameters
    ----------
    sample_weight : sequence or None
        One weight per data point. ``None`` means no weights were supplied,
        in which case nothing is checked.
    n_samples : int
        Number of data points the weights have to line up with.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n_samples``, if any weight is
        not a real number, or if any weight is negative.
    """
    # Function-scope import so this block does not depend on the module header.
    import numbers

    if sample_weight is None:
        return
    if len(sample_weight) != n_samples:
        raise ValueError("sample_weight should be of equal size as samples.")
    # numbers.Real covers the builtin int and float as well as NumPy scalar
    # types such as np.int64 or np.float32, which are not subclasses of the
    # builtins and would be rejected by a plain int/float isinstance check.
    if not all(isinstance(weight, numbers.Real) for weight in sample_weight):
        raise ValueError("sample_weight elements should either be int or floats.")
    # Zero passes this check; only strictly negative weights are rejected.
    if any(weight < 0 for weight in sample_weight):
        raise ValueError("sample_weight elements should be positive.")
22 changes: 5 additions & 17 deletions kmodes/kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def fit(self, X, y=None, categorical=None, sample_weight=None):
X = pandas_to_numpy(X)

random_state = check_random_state(self.random_state)
_validate_sample_weight(sample_weight, n_samples=X.shape[0])
kmodes._validate_sample_weight(sample_weight, n_samples=X.shape[0])

# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
Expand Down Expand Up @@ -495,17 +495,18 @@ def _k_prototypes_iter(Xnum, Xcat, centroids, cl_attr_sum, cl_memb_sum, cl_attr_
rindx = random_state.choice(choices)

cl_attr_sum, cl_memb_sum = _move_point_num(
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum,
weight
)
cl_attr_freq, membship, centroids[1] = kmodes._move_point_cat(
Xcat[rindx], rindx, old_clust, from_clust,
cl_attr_freq, membship, centroids[1]
cl_attr_freq, membship, centroids[1], weight
)

return centroids, cl_attr_sum, cl_memb_sum, cl_attr_freq, membship, moves


def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight=1):
def _move_point_num(point, to_clust, from_clust, cl_attr_sum, cl_memb_sum, sample_weight):
"""Move point between clusters, numerical attributes."""
# Update sum of attributes in cluster.
for iattr, curattr in enumerate(point):
Expand All @@ -528,16 +529,3 @@ def _split_num_cat(X, categorical):
if ii not in categorical]]).astype(np.float64)
Xcat = np.asanyarray(X[:, categorical])
return Xnum, Xcat


def _validate_sample_weight(sample_weight, n_samples):
    """Validate the optional per-sample weights passed to ``fit``.

    Parameters
    ----------
    sample_weight : sequence or None
        One weight per data point. ``None`` means no weights were supplied,
        in which case nothing is checked.
    n_samples : int
        Number of data points the weights have to line up with.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n_samples``, if any weight is
        not a real number, or if any weight is negative.
    """
    # Function-scope import so this block does not depend on the module header.
    import numbers

    if sample_weight is None:
        return
    if len(sample_weight) != n_samples:
        raise ValueError("sample_weight should be of equal size as samples.")
    # numbers.Real covers the builtin int and float as well as NumPy scalar
    # types such as np.int64 or np.float32, which are not subclasses of the
    # builtins and would be rejected by a plain int/float isinstance check.
    if not all(isinstance(weight, numbers.Real) for weight in sample_weight):
        raise ValueError("sample_weight elements should either be int or floats.")
    # Zero passes this check; only strictly negative weights are rejected.
    if any(weight < 0 for weight in sample_weight):
        raise ValueError("sample_weight elements should be positive.")
46 changes: 46 additions & 0 deletions kmodes/tests/test_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,3 +521,49 @@ def test_kmodes_epoch_costs(self):
kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
kmodes.fit(SOYBEAN)
self.assertEqual(kmodes.epoch_costs_, [206.0, 204.0, 199.0, 199.0])

def test_kmodes_sample_weights_validation(self):
    """Check that invalid ``sample_weight`` arguments raise a ValueError."""
    model = KModes(n_clusters=4, init='Cao', random_state=42)
    n_rows = SOYBEAN.shape[0]
    # Each case pairs an invalid weight sequence with the message it must
    # trigger, in order: wrong length, negative entry, non-numeric entry.
    invalid_cases = (
        (
            [1] * (n_rows - 1),
            "sample_weight should be of equal size as samples.",
        ),
        (
            [-1] + [1] * (n_rows - 1),
            "sample_weight elements should be positive.",
        ),
        (
            [None] + [1] * (n_rows - 1),
            "sample_weight elements should either be int or floats.",
        ),
    )
    for bad_weights, expected_message in invalid_cases:
        with self.assertRaisesRegex(ValueError, expected_message):
            model.fit_predict(SOYBEAN, sample_weight=bad_weights)

def test_kmodes_sample_weights_all_but_one_zero(self):
    """Check that a lone non-zero weight pins the single centroid to that point."""
    estimator = KModes(n_clusters=1, init='Cao', random_state=42)
    n_samples = 10
    subset = TEST_DATA[:n_samples, :]
    for hot_index in range(n_samples):
        # Weight vector that is zero everywhere except at ``hot_index``.
        weights = np.zeros(n_samples)
        weights[hot_index] = 1
        fitted = estimator.fit(subset, sample_weight=weights)
        centroid = fitted.cluster_centroids_[0, :]
        matches = centroid == TEST_DATA[hot_index, :]
        self.assertTrue(matches.all())

def test_k_modes_sample_weight_unchanged(self):
    """Test whether centroid definition remains unchanged when scaling uniformly.

    Fits an unweighted baseline and then refits with every sample given the
    same weight; the resulting set of centroids must be identical.
    """
    kmodes_baseline = KModes(n_clusters=4, init='Cao', random_state=42)
    model_baseline = kmodes_baseline.fit(SOYBEAN)
    expected = {tuple(row) for row in model_baseline.cluster_centroids_}
    for weight in [.5, 1, 1., 2]:
        # subTest reports which uniform weight broke the invariant.
        with self.subTest(weight=weight):
            sample_weight = [weight] * SOYBEAN.shape[0]
            kmodes_weighted = KModes(n_clusters=4, init='Cao', random_state=42)
            model_weighted = kmodes_weighted.fit(SOYBEAN, sample_weight=sample_weight)
            factual = {tuple(row) for row in model_weighted.cluster_centroids_}
            # Centroids might be ordered differently, so compare them as sets.
            # Unlike zipping two sorted sequences, a set comparison cannot
            # silently drop a missing or extra centroid, and it produces a
            # readable diff instead of a TypeError when tuples differ.
            self.assertSetEqual(expected, factual)
8 changes: 5 additions & 3 deletions kmodes/tests/test_kprototypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def test_kprototypes_sample_weights_validation(self):

def test_k_prototypes_sample_weight_all_but_one_zero(self):
"""Test whether centroid collapses to single datapoint with non-zero weight."""
kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', verbose=2)
kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao', random_state=42)
n_samples = 2
for indicator in range(n_samples):
sample_weight = np.zeros(n_samples)
Expand All @@ -367,13 +367,15 @@ def test_k_prototypes_sample_weight_all_but_one_zero(self):
def test_k_prototypes_sample_weight_unchanged(self):
"""Test whether centroid definition remains unchanged when scaling uniformly."""
categorical = [1, 2]
kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao')
kproto_baseline = kprototypes.KPrototypes(n_clusters=3, init='Cao', random_state=42)
model_baseline = kproto_baseline.fit(STOCKS, categorical=categorical)
expected = set(tuple(row) for row in model_baseline.cluster_centroids_)
# The exact value of a weight shouldn't matter if equal for all samples.
for weight in [.5, .1, 1, 1., 2]:
sample_weight = [weight] * STOCKS.shape[0]
kproto_weighted = kprototypes.KPrototypes(n_clusters=3, init='Cao')
kproto_weighted = kprototypes.KPrototypes(
n_clusters=3, init='Cao', random_state=42
)
model_weighted = kproto_weighted.fit(
STOCKS, categorical=categorical, sample_weight=sample_weight
)
Expand Down

0 comments on commit 247f193

Please sign in to comment.