Skip to content

Commit

Permalink
feat: add ml KMeans model params (#477)
Browse files Browse the repository at this point in the history
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
  • Loading branch information
GarrettWu authored Mar 21, 2024
1 parent 0b3f8e5 commit 23a8d9a
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 9 deletions.
61 changes: 54 additions & 7 deletions bigframes/ml/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from __future__ import annotations

from typing import Dict, List, Optional, Union
from typing import List, Literal, Optional, Union

import bigframes_vendored.sklearn.cluster._kmeans
from google.cloud import bigquery
Expand All @@ -27,6 +27,16 @@
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd

# Maps KMeans constructor parameter names (keys) to the names of the matching
# entries in a BQML model's "trainingOptions" record (values). `KMeans._from_bq`
# looks each instance attribute up here to rebuild the constructor kwargs from
# the last training run of a saved model.
# NOTE(review): "early_stop" has no matching parameter in the KMeans.__init__
# visible here, so that entry is never consulted by `_from_bq` — confirm
# whether it is intentional.
_BQML_PARAMS_MAPPING = {
"n_clusters": "numClusters",
"init": "kmeansInitializationMethod",
"init_col": "kmeansInitializationColumn",
"distance_type": "distanceType",
"max_iter": "maxIterations",
"early_stop": "earlyStop",
"tol": "minRelativeProgress",
}


@log_adapter.class_logger
class KMeans(
Expand All @@ -36,30 +46,67 @@ class KMeans(

__doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__

def __init__(
    self,
    n_clusters: int = 8,
    *,
    init: Literal["kmeans++", "random", "custom"] = "kmeans++",
    init_col: Optional[str] = None,
    distance_type: Literal["euclidean", "cosine"] = "euclidean",
    max_iter: int = 20,
    tol: float = 0.01,
    warm_start: bool = False,
):
    # Everything after `n_clusters` is keyword-only. The hyperparameters are
    # stored verbatim, without validation; `_bqml_options` later turns them
    # into BQML model options.
    #
    # Keep the assignment order stable: `_from_bq` walks `__dict__` of a
    # default-constructed instance to discover parameter names and the
    # Python type of each default.
    self.n_clusters = n_clusters
    self.init = init
    self.init_col = init_col
    self.distance_type = distance_type
    self.max_iter = max_iter
    self.tol = tol
    self.warm_start = warm_start

    # No BQML model is attached at construction; `_from_bq` assigns one
    # when the estimator is rebuilt from a saved model.
    self._bqml_model: Optional[core.BqmlModel] = None
    self._bqml_model_factory = globals.bqml_model_factory()

@classmethod
def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans:
    """Rebuild a KMeans estimator from an existing BigQuery ML model.

    Constructor arguments are recovered from the training options of the
    model's most recent training run; any option that is absent there keeps
    the constructor default.

    Args:
        session: Session handed to ``core.BqmlModel`` together with ``model``.
        model: The BigQuery model; its ``model_type`` must be ``"KMEANS"``.

    Returns:
        A new KMeans instance whose ``_bqml_model`` wraps ``model``.

    Raises:
        AssertionError: If ``model.model_type`` is not ``"KMEANS"``.
    """
    assert model.model_type == "KMEANS"

    kwargs: dict = {}

    # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun
    last_fitting = model.training_runs[-1]["trainingOptions"]

    # A default-constructed instance tells us both the parameter names and
    # the Python type each one is expected to have.
    dummy_kmeans = cls()
    for bf_param, bf_value in dummy_kmeans.__dict__.items():
        bqml_param = _BQML_PARAMS_MAPPING.get(bf_param)
        if bqml_param is None or bqml_param not in last_fitting:
            continue

        # Convert types. A parameter whose default is None (``init_col``)
        # has no usable default type: ``type(None)(value)`` raises
        # TypeError. Such parameters are treated as strings, matching the
        # ``Optional[str]`` annotation on the constructor.
        converter = str if bf_value is None else type(bf_value)
        kwargs[bf_param] = converter(last_fitting[bqml_param])

    new_kmeans = cls(**kwargs)
    new_kmeans._bqml_model = core.BqmlModel(session, model)
    return new_kmeans

@property
def _bqml_options(self) -> dict:
    """The model options as they will be set for BQML"""
    # Options that are always sent, taken straight from the instance
    # attributes.
    always_set = {
        "model_type": "KMEANS",
        "num_clusters": self.n_clusters,
        "KMEANS_INIT_METHOD": self.init,
        "DISTANCE_TYPE": self.distance_type,
        "MAX_ITERATIONS": self.max_iter,
        "MIN_REL_PROGRESS": self.tol,
        "WARM_START": self.warm_start,
    }

    # The initialization column is only included when one was provided.
    if self.init_col is None:
        return always_set
    return {**always_set, "KMEANS_INIT_COL": self.init_col}

def _fit(
self,
Expand Down
48 changes: 46 additions & 2 deletions tests/system/large/ml/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
from tests.system.utils import assert_pandas_df_equal


@pytest.mark.flaky(retries=2, delay=120)
@pytest.mark.flaky(retries=2)
def test_cluster_configure_fit_score_predict(
session, penguins_df_default_index, dataset_id
):
model = cluster.KMeans(n_clusters=3)
model = cluster.KMeans(n_clusters=3, init="random")

df = penguins_df_default_index.dropna()[
[
Expand Down Expand Up @@ -118,3 +118,47 @@ def test_cluster_configure_fit_score_predict(
in reloaded_model._bqml_model.model_name
)
assert reloaded_model.n_clusters == 3
assert reloaded_model.init == "RANDOM"
assert reloaded_model.distance_type == "EUCLIDEAN"
assert reloaded_model.max_iter == 20
assert reloaded_model.tol == 0.01


def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id):
    """Fit a KMeans with non-default params, save it, and check they reload."""
    model = cluster.KMeans(
        n_clusters=4,
        init="random",
        distance_type="cosine",
        max_iter=30,
        tol=0.001,
    )

    feature_columns = [
        "culmen_length_mm",
        "culmen_depth_mm",
        "flipper_length_mm",
        "sex",
    ]
    df = penguins_df_default_index.dropna()[feature_columns]

    # TODO(swast): How should we handle the default index? Currently, we get:
    # "Column bigframes_index_0_z is not found in the input data to the
    # EVALUATE function."
    df = df.reset_index(drop=True)

    model.fit(df)

    # Save and load, then check that every configured parameter was kept.
    saved_model_name = f"{dataset_id}.temp_configured_cluster_model"
    reloaded_model = model.to_gbq(saved_model_name, replace=True)
    assert saved_model_name in reloaded_model._bqml_model.model_name

    # String-valued options are expected back upper-cased after the reload.
    assert reloaded_model.n_clusters == 4
    assert reloaded_model.init == "RANDOM"
    assert reloaded_model.distance_type == "COSINE"
    assert reloaded_model.max_iter == 30
    assert reloaded_model.tol == 0.001
28 changes: 28 additions & 0 deletions third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,34 @@ class KMeans(_BaseKMeans):
n_clusters (int, default 8):
The number of clusters to form as well as the number of centroids to generate.
Default to 8.
init ("kmeans++", "random" or "custom", default "kmeans++"):
The method of initializing the clusters. Default to "kmeans++".
kmeans++: Initializes a number of centroids equal to the n_clusters value by using the k-means++ algorithm. Using this approach usually trains a better model than using random cluster initialization.
random: Initializes the centroids by randomly selecting a number of data points equal to the n_clusters value from the input data.
custom: Initializes the centroids using a provided column of type bool. Uses the rows with a value of True as the initial centroids. You specify the column to use by using the init_col option.
init_col (str or None, default None):
The name of the column to use to initialize the centroids. This column must have a type of bool. If this column contains a value of True for a given row, then uses that row as an initial centroid. The number of True rows in this column must be equal to the value you have specified for the n_clusters option.
Only works with init method "custom". Default to None.
distance_type ("euclidean" or "cosine", default "euclidean"):
The type of metric to use to compute the distance between two points.
Default to "euclidean".
max_iter (int, default 20):
The maximum number of training iterations, where one iteration represents a single pass of the entire training data. Default to 20.
tol (float, default 0.01):
The minimum relative loss improvement that is necessary to continue training. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue.
Default to 0.01.
warm_start (bool, default False):
Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run.
Default to False.
"""

def fit(
Expand Down

0 comments on commit 23a8d9a

Please sign in to comment.