From 4b7e853f17423f098512f3f2127cf5852fabe79b Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 29 Mar 2024 23:37:32 +0000 Subject: [PATCH 1/2] feat: add transformers save/load --- bigframes/ml/base.py | 30 ++++- bigframes/ml/compose.py | 54 +------- bigframes/ml/loader.py | 10 +- bigframes/ml/preprocessing.py | 10 ++ tests/system/large/ml/test_compose.py | 1 + tests/system/large/ml/test_pipeline.py | 6 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_llm.py | 24 ++-- tests/system/small/ml/test_preprocessing.py | 130 ++++++++++++++++---- 9 files changed, 172 insertions(+), 95 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index e58ed4feef..5e7aada8de 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -178,7 +178,33 @@ def fit( return self._fit(X, y) -class Transformer(BaseEstimator): +class BaseTransformer(BaseEstimator): + """Transformer base class.""" + + def __init__(self): + self._bqml_model: Optional[core.BqmlModel] = None + + _T = TypeVar("_T", bound="BaseTransformer") + + def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: + """Save the transformer as a BigQuery model. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + Saved transformer.""" + if not self._bqml_model: + raise RuntimeError("A transformer must be fitted before it can be saved") + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + +class Transformer(BaseTransformer): """A BigQuery DataFrames Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" @@ -199,7 +225,7 @@ def fit_transform( return self.fit(X, y).transform(X) -class LabelTransformer(BaseEstimator): +class LabelTransformer(BaseTransformer): """A BigQuery DataFrames Label Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index cd233589d6..a1f63f5f28 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -26,21 +26,11 @@ import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -import bigframes from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd -_PREPROCESSING_TYPES = Union[ - preprocessing.OneHotEncoder, - preprocessing.StandardScaler, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, -] - _BQML_TRANSFROM_TYPE_MAPPING = types.MappingProxyType( { "ML.STANDARD_SCALER": preprocessing.StandardScaler, @@ -67,7 +57,7 @@ def __init__( transformers: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, Union[str, List[str]], ] ], @@ -82,12 +72,12 @@ def __init__( @property def transformers_( self, - ) -> List[Tuple[str, _PREPROCESSING_TYPES, str,]]: + ) -> List[Tuple[str, preprocessing.PreprocessingType, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, str, ] ] = [] @@ -105,15 +95,6 @@ def transformers_( return result - @classmethod - def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model - ) -> ColumnTransformer: - col_transformer = cls._extract_from_bq_model(model) - col_transformer._bqml_model = core.BqmlModel(session, model) - - return col_transformer - @classmethod def _extract_from_bq_model( cls, @@ -125,7 +106,7 @@ def _extract_from_bq_model( transformers: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, Union[str, List[str]], ] ] = [] @@ -164,15 +145,7 @@ def camel_to_snake(name): def _merge( self, bq_model: bigquery.Model - ) -> Union[ - ColumnTransformer, - preprocessing.StandardScaler, - preprocessing.OneHotEncoder, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, - ]: + ) -> Union[ColumnTransformer, preprocessing.PreprocessingType,]: """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" transformers = self.transformers_ @@ -249,20 +222,3 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) - - def to_gbq(self, model_name: str, replace: bool = False) -> ColumnTransformer: - """Save the transformer as a BigQuery model. - - Args: - model_name (str): - the name of the model. - replace (bool, default False): - whether to replace if the model already exists. Default to False. - - Returns: - ColumnTransformer: saved model.""" - if not self._bqml_model: - raise RuntimeError("A transformer must be fitted before it can be saved") - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 508003a98d..39148a6412 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -24,6 +24,7 @@ from bigframes.ml import ( cluster, compose, + core, decomposition, ensemble, forecasting, @@ -31,6 +32,7 @@ linear_model, llm, pipeline, + preprocessing, utils, ) @@ -81,6 +83,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, + preprocessing.PreprocessingType, ]: """Load a BQML model to BigQuery DataFrames ML. @@ -107,8 +110,11 @@ def from_bq( def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): - # TODO(garrettwu): add other transformers - return compose.ColumnTransformer._from_bq(session, bq_model) + transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model) + transformer = transformer._merge(bq_model) + transformer._bqml_model = core.BqmlModel(session, bq_model) + + return transformer def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 23eab42978..fd7d44f731 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -639,3 +639,13 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) + + +PreprocessingType = Union[ + OneHotEncoder, + StandardScaler, + MaxAbsScaler, + MinMaxScaler, + KBinsDiscretizer, + LabelEncoder, +] diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index d7c49ca95a..72e016f4bb 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -151,3 +151,4 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ] assert reloaded_transformer.transformers_ == expected + assert reloaded_transformer._bqml_model is not None diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c460efa75f..c165b1e030 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -222,7 +222,7 @@ def test_pipeline_logistic_regression_fit_score_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" pl = pipeline.Pipeline( @@ -297,7 +297,7 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index ): @@ -445,7 +445,7 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_standard_scaler_kmeans_fit_score_predict( session, penguins_pandas_df_default_index ): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 02030cd31e..c505057d7b 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -333,7 +333,7 @@ def test_remote_model_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_model_generate_text( bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df ): diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 2e135bef7b..e526d54362 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -49,7 +49,7 @@ def test_create_text_generator_32k_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -76,7 +76,7 @@ def test_create_text_generator_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_32k_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -103,7 +103,7 @@ def test_create_text_generator_32k_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_connection( llm_text_pandas_df, bigquery_client ): @@ -131,7 +131,7 @@ def test_create_text_generator_model_default_connection( # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -142,7 +142,7 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -153,7 +153,7 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df ): @@ -165,7 +165,7 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df ): @@ -255,7 +255,7 @@ def test_create_text_embedding_generator_multilingual_model_defaults(bq_connecti assert model._bqml_model is not None -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): @@ -267,7 +267,7 @@ def test_embedding_generator_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_multilingual_predict_success( palm2_embedding_generator_multilingual_model, llm_text_df ): @@ -279,7 +279,7 @@ def test_embedding_generator_multilingual_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): @@ -306,7 +306,7 @@ def test_create_gemini_text_generator_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( gemini_text_generator_model, llm_text_df ): @@ -317,7 +317,7 @@ def test_gemini_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_with_params_success( gemini_text_generator_model, llm_text_df ): diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 990795da3b..040111f38a 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -18,7 +18,7 @@ import pyarrow as pa import bigframes.features -import bigframes.ml.preprocessing +from bigframes.ml import preprocessing ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) @@ -29,7 +29,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -68,7 +68,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -97,7 +97,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -128,9 +128,22 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_standard_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.StandardScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.StandardScaler) + assert reloaded_transformer._bqml_model is not None + + def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -168,7 +181,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -192,7 +205,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -219,8 +232,21 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MaxAbsScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler) + assert reloaded_transformer._bqml_model is not None + + def test_min_max_scaler_normalized_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -244,7 +270,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df): def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -274,7 +300,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -312,8 +338,21 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_min_max_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MinMaxScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler) + assert reloaded_transformer._bqml_model is not None + + def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") result = discretizer.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -339,7 +378,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit(penguins_df_default_index["culmen_length_mm"]) result = discretizer.transform( @@ -365,7 +404,7 @@ def test_k_bins_discretizer_series_normalizes( def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -402,9 +441,7 @@ def test_k_bins_discretizer_normalizes_different_params( penguins_df_default_index, new_penguins_df ): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer( - n_bins=6, strategy="uniform" - ) + discretizer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -437,8 +474,23 @@ def test_k_bins_discretizer_normalizes_different_params( pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer) + assert reloaded_transformer.n_bins == transformer.n_bins + assert reloaded_transformer.strategy == transformer.strategy + assert reloaded_transformer._bqml_model is not None + + def test_one_hot_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -469,7 +521,7 @@ def test_one_hot_encoder_default_params(new_penguins_df): def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() @@ -499,7 +551,7 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): def test_one_hot_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() @@ -525,7 +577,7 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): def test_one_hot_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2) + encoder = preprocessing.OneHotEncoder("most_frequent", 100, 2) encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -556,7 +608,7 @@ def test_one_hot_encoder_params(new_penguins_df): def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(penguins_df_default_index[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -586,8 +638,21 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ pd.testing.assert_frame_equal(result, expected) +def test_one_hot_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.OneHotEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species", "sex"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.OneHotEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + def test_label_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df["species"]).to_pandas() @@ -613,7 +678,7 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() @@ -638,7 +703,7 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): def test_label_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() @@ -664,7 +729,7 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) + encoder = preprocessing.LabelEncoder(100, 2) encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -690,7 +755,7 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -715,4 +780,17 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df pd.testing.assert_frame_equal(result, expected) +def test_label_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.LabelEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.LabelEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. From 0f777884b5fcbd5f270bad31e139f1ee2c06a449 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 29 Mar 2024 23:54:26 +0000 Subject: [PATCH 2/2] fix mypy --- bigframes/ml/loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 39148a6412..c6e38e6534 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -110,8 +110,9 @@ def from_bq( def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): - transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model) - transformer = transformer._merge(bq_model) + transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model)._merge( + bq_model + ) transformer._bqml_model = core.BqmlModel(session, bq_model) return transformer