Skip to content

Commit

Permalink
fix: reloaded transformer .transform error (#569)
Browse files Browse the repository at this point in the history
* fix: reloaded transformer .transform error

* fix mypy
  • Loading branch information
GarrettWu authored Apr 4, 2024
1 parent 098d444 commit 39fe474
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 16 deletions.
13 changes: 10 additions & 3 deletions bigframes/ml/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,17 @@ def camel_to_snake(name):
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()

output_names = []
for transform_col in bq_model._properties["transformColumns"]:
transform_col_dict = cast(dict, transform_col)
# skip the columns that are not transformed (they have no "transformSql" entry)
if "transformSql" not in transform_col:
if "transformSql" not in transform_col_dict:
continue
transform_sql: str = cast(dict, transform_col)["transformSql"]
transform_sql: str = transform_col_dict["transformSql"]
if not transform_sql.startswith("ML."):
continue

output_names.append(transform_col_dict["name"])
found_transformer = False
for prefix in _BQML_TRANSFROM_TYPE_MAPPING:
if transform_sql.startswith(prefix):
Expand All @@ -141,7 +144,10 @@ def camel_to_snake(name):
f"Unsupported transformer type. {constants.FEEDBACK_LINK}"
)

return cls(transformers=transformers)
transformer = cls(transformers=transformers)
transformer._output_names = output_names

return transformer

def _merge(
self, bq_model: bigquery.Model
Expand All @@ -164,6 +170,7 @@ def _merge(
for feature_column in bq_model.feature_columns
]
) == sorted(columns):
transformer_0._output_names = self._output_names
return transformer_0

return self
Expand Down
23 changes: 23 additions & 0 deletions tests/system/large/ml/test_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,26 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id):
]
assert reloaded_transformer.transformers_ == expected
assert reloaded_transformer._bqml_model is not None

result = transformer.fit_transform(
new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]]
).to_pandas()

expected = pandas.DataFrame(
{
"onehotencoded_species": [
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 2, "value": 1.0}],
],
"standard_scaled_culmen_length_mm": [
1.313249,
-0.20198,
-1.111118,
],
"standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338],
},
index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"),
)

pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False)
129 changes: 116 additions & 13 deletions tests/system/small/ml/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
Expand All @@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.StandardScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118],
"standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848],
"standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
# TODO(http://b/292431644): add a second test that compares the output to sklearn.preprocessing.MaxAbsScaler once BQML's change is in prod.
Expand Down Expand Up @@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
Expand All @@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand All @@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494],
"max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766],
"max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
scaler = preprocessing.MinMaxScaler()
Expand All @@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
Expand All @@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
Expand All @@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler)
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0],
"min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625],
"min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667],
},
dtype="Float64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df):
discretizer = preprocessing.KBinsDiscretizer(strategy="uniform")
Expand All @@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_series_normalizes(
Expand All @@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes(
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):
Expand Down Expand Up @@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_normalizes_different_params(
Expand Down Expand Up @@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params(
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
Expand All @@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.strategy == transformer.strategy
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
).to_pandas()

expected = pd.DataFrame(
{
"kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"],
"kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"],
"kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"],
},
dtype="string[pyarrow]",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_one_hot_encoder_default_params(new_penguins_df):
encoder = preprocessing.OneHotEncoder()
Expand Down Expand Up @@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.max_categories == transformer.max_categories
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.fit_transform(
new_penguins_df[["species", "sex"]]
).to_pandas()

expected = pd.DataFrame(
{
"onehotencoded_species": [
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 2, "value": 1.0}],
],
"onehotencoded_sex": [
[{"index": 2, "value": 1.0}],
[{"index": 1, "value": 1.0}],
[{"index": 1, "value": 1.0}],
],
},
dtype=ONE_HOT_ENCODED_DTYPE,
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected)


def test_label_encoder_default_params(new_penguins_df):
encoder = preprocessing.LabelEncoder()
Expand Down Expand Up @@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
assert reloaded_transformer.max_categories == transformer.max_categories
assert reloaded_transformer._bqml_model is not None

result = reloaded_transformer.transform(new_penguins_df).to_pandas()

expected = pd.DataFrame(
{
"labelencoded_species": [
1,
1,
2,
],
},
dtype="Int64",
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
)

pd.testing.assert_frame_equal(result, expected)


# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.

0 comments on commit 39fe474

Please sign in to comment.