diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py
index c294d1f424..ac5b6f1e3b 100644
--- a/bigframes/ml/cluster.py
+++ b/bigframes/ml/cluster.py
@@ -17,7 +17,7 @@

 from __future__ import annotations

-from typing import Dict, List, Optional, Union
+from typing import List, Literal, Optional, Union

 import bigframes_vendored.sklearn.cluster._kmeans
 from google.cloud import bigquery
@@ -27,6 +27,16 @@
 from bigframes.ml import base, core, globals, utils
 import bigframes.pandas as bpd

+_BQML_PARAMS_MAPPING = {
+    "n_clusters": "numClusters",
+    "init": "kmeansInitializationMethod",
+    "init_col": "kmeansInitializationColumn",
+    "distance_type": "distanceType",
+    "max_iter": "maxIterations",
+    "early_stop": "earlyStop",
+    "tol": "minRelativeProgress",
+}
+

 @log_adapter.class_logger
 class KMeans(
@@ -36,8 +46,24 @@ class KMeans(

     __doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__

-    def __init__(self, n_clusters: int = 8):
+    def __init__(
+        self,
+        n_clusters: int = 8,
+        *,
+        init: Literal["kmeans++", "random", "custom"] = "kmeans++",
+        init_col: Optional[str] = None,
+        distance_type: Literal["euclidean", "cosine"] = "euclidean",
+        max_iter: int = 20,
+        tol: float = 0.01,
+        warm_start: bool = False,
+    ):
         self.n_clusters = n_clusters
+        self.init = init
+        self.init_col = init_col
+        self.distance_type = distance_type
+        self.max_iter = max_iter
+        self.tol = tol
+        self.warm_start = warm_start
         self._bqml_model: Optional[core.BqmlModel] = None
         self._bqml_model_factory = globals.bqml_model_factory()

@@ -45,21 +71,42 @@ def __init__(self, n_clusters: int = 8):
     def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans:
         assert model.model_type == "KMEANS"

-        kwargs = {}
+        kwargs: dict = {}

         # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun
         last_fitting = model.training_runs[-1]["trainingOptions"]
-        if "numClusters" in last_fitting:
-            kwargs["n_clusters"] = int(last_fitting["numClusters"])
+
+        dummy_kmeans = cls()
+        for bf_param, bf_value in dummy_kmeans.__dict__.items():
+            bqml_param = _BQML_PARAMS_MAPPING.get(bf_param)
+            if bqml_param in last_fitting:
+                # Convert types
+                kwargs[bf_param] = (
+                    str(last_fitting[bqml_param])
+                    if bf_param in ["init"]
+                    else type(bf_value)(last_fitting[bqml_param])
+                )

         new_kmeans = cls(**kwargs)
         new_kmeans._bqml_model = core.BqmlModel(session, model)
         return new_kmeans

     @property
-    def _bqml_options(self) -> Dict[str, str | int | float | List[str]]:
+    def _bqml_options(self) -> dict:
         """The model options as they will be set for BQML"""
-        return {"model_type": "KMEANS", "num_clusters": self.n_clusters}
+        options = {
+            "model_type": "KMEANS",
+            "num_clusters": self.n_clusters,
+            "KMEANS_INIT_METHOD": self.init,
+            "DISTANCE_TYPE": self.distance_type,
+            "MAX_ITERATIONS": self.max_iter,
+            "MIN_REL_PROGRESS": self.tol,
+            "WARM_START": self.warm_start,
+        }
+
+        if self.init_col is not None:
+            options["KMEANS_INIT_COL"] = self.init_col
+
+        return options

     def _fit(
         self,
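For orientation, here is a hedged sketch of the forward mapping that the new _bqml_options property performs: the constructor arguments are collected under the option names that BQML's CREATE MODEL statement understands, with KMEANS_INIT_COL emitted only when init_col is set. The expected output is read off the property as written above rather than captured from a live session, and the init_col value ("is_seed_row") is purely illustrative.

from bigframes.ml import cluster

# Constructor arguments surface in _bqml_options under BQML option names;
# "is_seed_row" is a hypothetical boolean column marking the initial centroids.
model = cluster.KMeans(
    n_clusters=4,
    init="custom",
    init_col="is_seed_row",
    distance_type="cosine",
    max_iter=30,
    tol=0.001,
)
print(model._bqml_options)
# Expected, per the property above:
# {'model_type': 'KMEANS', 'num_clusters': 4, 'KMEANS_INIT_METHOD': 'custom',
#  'DISTANCE_TYPE': 'cosine', 'MAX_ITERATIONS': 30, 'MIN_REL_PROGRESS': 0.001,
#  'WARM_START': False, 'KMEANS_INIT_COL': 'is_seed_row'}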
diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py
index 9244c4b9f1..b633ca4ea2 100644
--- a/tests/system/large/ml/test_cluster.py
+++ b/tests/system/large/ml/test_cluster.py
@@ -19,11 +19,11 @@
 from tests.system.utils import assert_pandas_df_equal


-@pytest.mark.flaky(retries=2, delay=120)
+@pytest.mark.flaky(retries=2)
 def test_cluster_configure_fit_score_predict(
     session, penguins_df_default_index, dataset_id
 ):
-    model = cluster.KMeans(n_clusters=3)
+    model = cluster.KMeans(n_clusters=3, init="random")
     df = penguins_df_default_index.dropna()[
         [
@@ -118,3 +118,47 @@ def test_cluster_configure_fit_score_predict(
         in reloaded_model._bqml_model.model_name
     )
     assert reloaded_model.n_clusters == 3
+    assert reloaded_model.init == "RANDOM"
+    assert reloaded_model.distance_type == "EUCLIDEAN"
+    assert reloaded_model.max_iter == 20
+    assert reloaded_model.tol == 0.01
+
+
+def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id):
+    model = cluster.KMeans(
+        n_clusters=4,
+        init="random",
+        distance_type="cosine",
+        max_iter=30,
+        tol=0.001,
+    )
+
+    df = penguins_df_default_index.dropna()[
+        [
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+            "sex",
+        ]
+    ]
+
+    # TODO(swast): How should we handle the default index? Currently, we get:
+    # "Column bigframes_index_0_z is not found in the input data to the
+    # EVALUATE function."
+    df = df.reset_index(drop=True)
+
+    model.fit(df)
+
+    # save, load, and check the configured params to ensure they were kept
+    reloaded_model = model.to_gbq(
+        f"{dataset_id}.temp_configured_cluster_model", replace=True
+    )
+    assert (
+        f"{dataset_id}.temp_configured_cluster_model"
+        in reloaded_model._bqml_model.model_name
+    )
+    assert reloaded_model.n_clusters == 4
+    assert reloaded_model.init == "RANDOM"
+    assert reloaded_model.distance_type == "COSINE"
+    assert reloaded_model.max_iter == 30
+    assert reloaded_model.tol == 0.001
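The reloaded assertions above expect "RANDOM", "EUCLIDEAN" and "COSINE" because _from_bq reads values back from the model's BQML trainingOptions, which report enum-like options in upper case, and only re-types them. Below is a standalone sketch of that conversion loop, using an explicit dict of constructor defaults in place of the dummy instance's __dict__ and a hypothetical trainingOptions payload (the field values are illustrative, not real API output).

# Mirrors the _from_bq loop above, run against a hypothetical payload.
_BQML_PARAMS_MAPPING = {
    "n_clusters": "numClusters",
    "init": "kmeansInitializationMethod",
    "init_col": "kmeansInitializationColumn",
    "distance_type": "distanceType",
    "max_iter": "maxIterations",
    "early_stop": "earlyStop",
    "tol": "minRelativeProgress",
}
defaults = {
    "n_clusters": 8,
    "init": "kmeans++",
    "init_col": None,
    "distance_type": "euclidean",
    "max_iter": 20,
    "tol": 0.01,
    "warm_start": False,
}
last_fitting = {
    "numClusters": "4",
    "kmeansInitializationMethod": "RANDOM",
    "distanceType": "COSINE",
    "maxIterations": "30",
    "minRelativeProgress": "0.001",
}

kwargs = {}
for bf_param, bf_value in defaults.items():
    bqml_param = _BQML_PARAMS_MAPPING.get(bf_param)
    if bqml_param in last_fitting:
        # "init" stays a string as returned (upper case); everything else is
        # cast to the type of its default (int, float, ...).
        kwargs[bf_param] = (
            str(last_fitting[bqml_param])
            if bf_param in ["init"]
            else type(bf_value)(last_fitting[bqml_param])
        )

print(kwargs)
# {'n_clusters': 4, 'init': 'RANDOM', 'distance_type': 'COSINE',
#  'max_iter': 30, 'tol': 0.001}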
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
index d72b9b7bd5..2a0acc8cfe 100644
--- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
+++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
@@ -31,6 +31,34 @@ class KMeans(_BaseKMeans):
         n_clusters (int, default 8):
             The number of clusters to form as well as the number of
             centroids to generate. Default to 8.
+
+        init ("kmeans++", "random" or "custom", default "kmeans++"):
+            The method of initializing the clusters. Default to "kmeans++".
+
+            kmeans++: Initializes a number of centroids equal to the n_clusters value by using the k-means++ algorithm. Using this approach usually trains a better model than using random cluster initialization.
+            random: Initializes the centroids by randomly selecting a number of data points equal to the n_clusters value from the input data.
+            custom: Initializes the centroids using a provided column of type bool. Uses the rows with a value of True as the initial centroids. You specify the column to use with the init_col option.
+
+        init_col (str or None, default None):
+            The name of the column to use to initialize the centroids. This column must have a type of bool. If this column contains a value of True for a given row, then that row is used as an initial centroid. The number of True rows in this column must be equal to the value you have specified for the n_clusters option.
+            Only works with init method "custom". Default to None.
+
+        distance_type ("euclidean" or "cosine", default "euclidean"):
+            The type of metric to use to compute the distance between two points.
+            Default to "euclidean".
+
+        max_iter (int, default 20):
+            The maximum number of training iterations, where one iteration represents a single pass of the entire training data. Default to 20.
+
+        tol (float, default 0.01):
+            The minimum relative loss improvement that is necessary to continue training. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue.
+            Default to 0.01.
+
+        warm_start (bool, default False):
+            Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run.
+            Default to False.
+
     """

     def fit(
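Taken together, the documented parameters give the following hedged end-to-end usage sketch. The public penguins table and the column choices are illustrative stand-ins for the test fixture, not part of this change.

import bigframes.pandas as bpd
from bigframes.ml import cluster

# Illustrative source data; the system tests use a penguins fixture instead.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()
features = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]

model = cluster.KMeans(
    n_clusters=4,
    init="random",          # "custom" would also require init_col
    distance_type="cosine",
    max_iter=30,
    tol=0.001,
)
model.fit(features)
assignments = model.predict(features)  # per-row cluster assignments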