From 39fe47451d24a8cf55d7dbb15c6d3b176d25ab18 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 4 Apr 2024 10:19:03 -0700 Subject: [PATCH] fix: reloaded transformer .transform error (#569) * fix: reloaded transformer .transform error * fix mypy --- bigframes/ml/compose.py | 13 +- tests/system/large/ml/test_compose.py | 23 ++++ tests/system/small/ml/test_preprocessing.py | 129 ++++++++++++++++++-- 3 files changed, 149 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 8638f4d182..89969f23e7 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -115,14 +115,17 @@ def camel_to_snake(name): name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + output_names = [] for transform_col in bq_model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) # pass the columns that are not transformed - if "transformSql" not in transform_col: + if "transformSql" not in transform_col_dict: continue - transform_sql: str = cast(dict, transform_col)["transformSql"] + transform_sql: str = transform_col_dict["transformSql"] if not transform_sql.startswith("ML."): continue + output_names.append(transform_col_dict["name"]) found_transformer = False for prefix in _BQML_TRANSFROM_TYPE_MAPPING: if transform_sql.startswith(prefix): @@ -141,7 +144,10 @@ def camel_to_snake(name): f"Unsupported transformer type. {constants.FEEDBACK_LINK}" ) - return cls(transformers=transformers) + transformer = cls(transformers=transformers) + transformer._output_names = output_names + + return transformer def _merge( self, bq_model: bigquery.Model @@ -164,6 +170,7 @@ def _merge( for feature_column in bq_model.feature_columns ] ) == sorted(columns): + transformer_0._output_names = self._output_names return transformer_0 return self diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0107d371cb..7513b78b29 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -142,3 +142,26 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ] assert reloaded_transformer.transformers_ == expected assert reloaded_transformer._bqml_model is not None + + result = transformer.fit_transform( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pandas.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "standard_scaled_culmen_length_mm": [ + 1.313249, + -0.20198, + -1.111118, + ], + "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], + }, + index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + ) + + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 22c3c84959..faa0cd7bbd 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): @@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_save_load(new_penguins_df, dataset_id): @@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.StandardScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. @@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): @@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): @@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], + "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_min_max_scaler_normalized_fit_transform(new_penguins_df): scaler = preprocessing.MinMaxScaler() @@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): @@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_save_load(new_penguins_df, dataset_id): @@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], + "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") @@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_series_normalizes( @@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): @@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes_different_params( @@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): @@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.strategy == transformer.strategy assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"], + "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_one_hot_encoder_default_params(new_penguins_df): encoder = preprocessing.OneHotEncoder() @@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.max_categories == transformer.max_categories assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + new_penguins_df[["species", "sex"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + ], + }, + dtype=ONE_HOT_ENCODED_DTYPE, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + def test_label_encoder_default_params(new_penguins_df): encoder = preprocessing.LabelEncoder() @@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.max_categories == transformer.max_categories assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform(new_penguins_df).to_pandas() + + expected = pd.DataFrame( + { + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.