From a1bce4db5810f775613982fb786f7a1dbf9b89e0 Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Wed, 20 Mar 2024 16:31:57 +0000 Subject: [PATCH 01/11] feat: support Series.dt.normalize --- bigframes/core/compile/scalar_op_compiler.py | 5 ++++ bigframes/operations/__init__.py | 1 + bigframes/operations/datetimes.py | 3 ++ .../system/small/operations/test_datetimes.py | 16 +++++++++++ .../pandas/core/arrays/datetimelike.py | 28 +++++++++++++++++++ 5 files changed, 53 insertions(+) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index c95d1ca45e..4e43e526d0 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -632,6 +632,11 @@ def year_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) +@scalar_op_compiler.register_unary_op(ops.normalize_op) +def normalize_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).truncate("D") + + # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index b122f1fe7c..71a188aed6 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -206,6 +206,7 @@ def create_ternary_op( second_op = create_unary_op(name="second", type_rule=op_typing.INTEGER) time_op = create_unary_op(name="time", type_rule=op_typing.INTEGER) year_op = create_unary_op(name="year", type_rule=op_typing.INTEGER) +normalize_op = create_unary_op(name="normalize") ## Trigonometry Ops sin_op = create_unary_op(name="sin", type_rule=op_typing.REAL_NUMERIC) cos_op = create_unary_op(name="cos", type_rule=op_typing.REAL_NUMERIC) diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index eb91bc0b20..1b4a2fe0e6 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -94,3 +94,6 @@ def unit(self) -> str: def strftime(self, date_format: str) -> series.Series: return self._apply_unary_op(ops.StrftimeOp(date_format=date_format)) + + def normalize(self) -> series.Series: + return self._apply_unary_op(ops.normalize_op) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 854672585d..303290f89e 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -266,3 +266,19 @@ def test_dt_strftime_time(): bf_result, expected_result, check_index_type=False, check_dtype=False ) assert bf_result.dtype == "string[pyarrow]" + + +@pytest.mark.parametrize( + ("col_name",), + DATETIME_COL_NAMES, +) +def test_dt_normalize(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.dt.normalize().to_pandas() + pd_result = scalars_pandas_df[col_name].dt.normalize() + + assert_series_equal( + pd_result.astype(pd.Int64Dtype()), + bf_result, + ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 4f7e33909e..5d6ebb7fe7 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -36,3 +36,31 @@ def strftime(self, date_format: str): 
bigframes.series.Series of formatted strings. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def normalize(self): + """ + Convert times to midnight. + + The time component of the date-time is converted to midnight i.e. + 00:00:00. This is useful in cases, when the time does not matter. + Length is unaltered. The timezones are unaffected. + + This method is available on Series with datetime values under the + .dt accessor. + + **Examples:** + + >>> import pandas as pd + >>> impor bigframes.pandas as bpd + >>> s = bpd.Series(pd.date_range( + start='2014-08-01 10:00', + freq='h', + periods=3, + tz='Asia/Calcutta') + ) + >>> s.dt.normalize() + + Returns: + bigframes.series.Series of the same dtype as the input. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 0046d59852ec544bb4c574877896a18ef7856bab Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Wed, 20 Mar 2024 18:51:11 +0000 Subject: [PATCH 02/11] finish normalize code --- bigframes/core/compile/scalar_op_compiler.py | 4 +++- bigframes/operations/datetimes.py | 4 +++- tests/system/small/operations/test_datetimes.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 4e43e526d0..7c6503a94f 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -634,7 +634,9 @@ def year_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.normalize_op) def normalize_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).truncate("D") + result_type = x.type() + result = x.truncate("D") + return result.cast(result_type) # Parameterized ops diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 1b4a2fe0e6..3944a79ebb 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -17,6 +17,8 @@ import datetime as dt from typing import Optional +import pandas as pd + import bigframes_vendored.pandas.core.arrays.datetimelike as vendored_pandas_datetimelike import bigframes_vendored.pandas.core.indexes.accessor as vendordt @@ -96,4 +98,4 @@ def strftime(self, date_format: str) -> series.Series: return self._apply_unary_op(ops.StrftimeOp(date_format=date_format)) def normalize(self) -> series.Series: - return self._apply_unary_op(ops.normalize_op) + return self._apply_unary_op(ops.normalize_op) \ No newline at end of file diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 303290f89e..b5541d9ea9 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -279,6 +279,6 @@ def test_dt_normalize(scalars_dfs, col_name): pd_result = scalars_pandas_df[col_name].dt.normalize() assert_series_equal( - pd_result.astype(pd.Int64Dtype()), + pd_result.astype(bf_series.dtype), # normalize preserves type bf_result, ) From 057e8d3fc840d7271d5699307fe6e6c7c9322adc Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Wed, 20 Mar 2024 18:51:32 +0000 Subject: [PATCH 03/11] fix documentation --- .../pandas/core/arrays/datetimelike.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 5d6ebb7fe7..deebb3fd82 100644 --- 
a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -42,25 +42,29 @@ def normalize(self): Convert times to midnight. The time component of the date-time is converted to midnight i.e. - 00:00:00. This is useful in cases, when the time does not matter. - Length is unaltered. The timezones are unaffected. + 00:00:00. This is useful in cases when the time does not matter. + The return dtype will match the source series. This method is available on Series with datetime values under the .dt accessor. **Examples:** - >>> import pandas as pd - >>> impor bigframes.pandas as bpd - >>> s = bpd.Series(pd.date_range( - start='2014-08-01 10:00', - freq='h', - periods=3, - tz='Asia/Calcutta') - ) - >>> s.dt.normalize() + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(pd.date_range( + start='2014-08-01 10:00', + freq='h', + periods=3, + tz='Asia/Calcutta') + ) # note timezones will be converted to UTC here + >>> s.dt.normalize() + 0 2014-08-01 00:00:00+00:00 + 1 2014-08-01 00:00:00+00:00 + 2 2014-08-01 00:00:00+00:00 + dtype: timestamp[us, tz=UTC][pyarrow] Returns: - bigframes.series.Series of the same dtype as the input. + bigframes.series.Series of the same dtype as the data. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 28c7fb02bc01ce637f10dc22c2d7c59c08e342ca Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Wed, 20 Mar 2024 20:35:20 +0000 Subject: [PATCH 04/11] formatting --- bigframes/operations/datetimes.py | 4 +--- tests/system/small/operations/test_datetimes.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 3944a79ebb..1b4a2fe0e6 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -17,8 +17,6 @@ import datetime as dt from typing import Optional -import pandas as pd - import bigframes_vendored.pandas.core.arrays.datetimelike as vendored_pandas_datetimelike import bigframes_vendored.pandas.core.indexes.accessor as vendordt @@ -98,4 +96,4 @@ def strftime(self, date_format: str) -> series.Series: return self._apply_unary_op(ops.StrftimeOp(date_format=date_format)) def normalize(self) -> series.Series: - return self._apply_unary_op(ops.normalize_op) \ No newline at end of file + return self._apply_unary_op(ops.normalize_op) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index b5541d9ea9..cfd103884b 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -279,6 +279,6 @@ def test_dt_normalize(scalars_dfs, col_name): pd_result = scalars_pandas_df[col_name].dt.normalize() assert_series_equal( - pd_result.astype(bf_series.dtype), # normalize preserves type + pd_result.astype(bf_series.dtype), # normalize preserves type bf_result, ) From aec2651a03fe5499f90a1b2147fac2fc20bed54e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:04:15 -0700 Subject: [PATCH 05/11] feat: add params for LinearRegression model (#464) * feat: add params for LinearRegression model * fix tests * update docs --- bigframes/ml/linear_model.py | 32 +++++++++++--- bigframes/ml/sql.py | 4 +- tests/system/large/ml/test_linear_model.py | 42 ++++++++++++------- tests/unit/ml/test_golden_sql.py | 4 +- .../sklearn/linear_model/_base.py | 10 ++++- 5 files changed, 67 
insertions(+), 25 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 68d1e12676..62855bae90 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -61,19 +61,25 @@ def __init__( "auto_strategy", "batch_gradient_descent", "normal_equation" ] = "normal_equation", fit_intercept: bool = True, + l1_reg: Optional[float] = None, l2_reg: float = 0.0, max_iterations: int = 20, + warm_start: bool = False, + learn_rate: Optional[float] = None, learn_rate_strategy: Literal["line_search", "constant"] = "line_search", early_stop: bool = True, min_rel_progress: float = 0.01, - ls_init_learn_rate: float = 0.1, + ls_init_learn_rate: Optional[float] = None, calculate_p_values: bool = False, enable_global_explain: bool = False, ): self.optimize_strategy = optimize_strategy self.fit_intercept = fit_intercept + self.l1_reg = l1_reg self.l2_reg = l2_reg self.max_iterations = max_iterations + self.warm_start = warm_start + self.learn_rate = learn_rate self.learn_rate_strategy = learn_rate_strategy self.early_stop = early_stop self.min_rel_progress = min_rel_progress @@ -99,17 +105,21 @@ def _from_bq( for bf_param, bf_value in dummy_linear.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + # Convert types + kwargs[bf_param] = ( + float(last_fitting[bqml_param]) + if bf_param in ["l1_reg", "learn_rate", "ls_init_learn_rate"] + else type(bf_value)(last_fitting[bqml_param]) + ) new_linear_regression = cls(**kwargs) new_linear_regression._bqml_model = core.BqmlModel(session, model) return new_linear_regression @property - def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: + def _bqml_options(self) -> dict: """The model options as they will be set for BQML""" - # TODO: Support l1_reg, warm_start, and learn_rate with error catching. - return { + options = { "model_type": "LINEAR_REG", "data_split_method": "NO_SPLIT", "optimize_strategy": self.optimize_strategy, @@ -119,10 +129,20 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "learn_rate_strategy": self.learn_rate_strategy, "early_stop": self.early_stop, "min_rel_progress": self.min_rel_progress, - "ls_init_learn_rate": self.ls_init_learn_rate, "calculate_p_values": self.calculate_p_values, "enable_global_explain": self.enable_global_explain, } + if self.l1_reg is not None: + options["l1_reg"] = self.l1_reg + if self.learn_rate is not None: + options["learn_rate"] = self.learn_rate + if self.ls_init_learn_rate is not None: + options["ls_init_learn_rate"] = self.ls_init_learn_rate + # Even presenting warm_start returns error for NORMAL_EQUATION optimizer + if self.warm_start is True: + options["warm_start"] = self.warm_start + + return options def _fit( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index fa74458e77..807fadc06a 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -38,7 +38,9 @@ def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: inner = ", ".join([self.encode_value(x) for x in v]) return f"[{inner}]" else: - raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") + raise ValueError( + f"Unexpected value type {type(v)}. 
{constants.FEEDBACK_LINK}" + ) def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index a0f4182e6f..03fed00770 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -60,9 +60,11 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learn_rate is None assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate == 0.1 + assert reloaded_model.ls_init_learn_rate is None assert reloaded_model.max_iterations == 20 assert reloaded_model.min_rel_progress == 0.01 @@ -71,7 +73,14 @@ def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): model = bigframes.ml.linear_model.LinearRegression( - fit_intercept=False, l2_reg=0.1, min_rel_progress=0.01 + fit_intercept=False, + l2_reg=0.2, + min_rel_progress=0.02, + l1_reg=0.2, + max_iterations=30, + optimize_strategy="batch_gradient_descent", + learn_rate_strategy="constant", + learn_rate=0.2, ) df = penguins_df_default_index.dropna() @@ -92,12 +101,12 @@ def test_linear_regression_customized_params_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "mean_absolute_error": [226.108411], - "mean_squared_error": [80459.668456], - "mean_squared_log_error": [0.00497], - "median_absolute_error": [171.618872], - "r2_score": [0.875415], - "explained_variance": [0.875417], + "mean_absolute_error": [240], + "mean_squared_error": [91197], + "mean_squared_log_error": [0.00573], + "median_absolute_error": [197], + "r2_score": [0.858], + "explained_variance": [0.8588], }, dtype="Float64", ) @@ -109,16 +118,21 @@ def test_linear_regression_customized_params_fit_score( assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" assert reloaded_model.fit_intercept is False assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False - assert reloaded_model.l2_reg == 0.1 - assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate == 0.1 - assert reloaded_model.max_iterations == 20 - assert reloaded_model.min_rel_progress == 0.01 + assert reloaded_model.l1_reg == 0.2 + assert reloaded_model.l2_reg == 0.2 + assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.max_iterations == 30 + assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.learn_rate_strategy == "CONSTANT" + assert reloaded_model.learn_rate == 0.2 + + +# TODO(garrettwu): add tests for param warm_start. Requires a trained model. 
def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index d63bc7aaa1..8996a9c77f 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -105,7 +105,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -115,7 +115,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index ad2c872468..39012cbe08 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -71,18 +71,24 @@ class LinearRegression(RegressorMixin, LinearModel): Default ``True``. Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). + l1_reg (float or None, default None): + The amount of L1 regularization applied. Default to None. Can't be set in "normal_equation" mode. If unset, value 0 is used. l2_reg (float, default 0.0): The amount of L2 regularization applied. Default to 0. max_iterations (int, default 20): The maximum number of training iterations or steps. Default to 20. + warm_start (bool, default False): + Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run. 
Default to False. + learn_rate (float or None, default None): + The learn rate for gradient descent when learn_rate_strategy='constant'. If unset, value 0.1 is used. If learn_rate_strategy='line_search', an error is returned. learn_rate_strategy (str, default "line_search"): The strategy for specifying the learning rate during training. Default to "line_search". early_stop (bool, default True): Whether training should stop after the first iteration in which the relative loss improvement is less than the value specified for min_rel_progress. Default to True. min_rel_progress (float, default 0.01): The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. - ls_init_learn_rate (float, default 0.1): - Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. Default to 0.1. + ls_init_learn_rate (float or None, default None): + Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. calculate_p_values (bool, default False): Specifies whether to compute p-values and standard errors during training. Default to False. enable_global_explain (bool, default False): From fca79f6b2eefb9a93a2ed8eda2f7a7233a12d34f Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Wed, 20 Mar 2024 20:43:45 +0000 Subject: [PATCH 06/11] revert accidental changes --- bigframes/ml/linear_model.py | 32 +++----------- bigframes/ml/sql.py | 4 +- tests/system/large/ml/test_linear_model.py | 42 +++++++------------ tests/unit/ml/test_golden_sql.py | 4 +- .../sklearn/linear_model/_base.py | 10 +---- 5 files changed, 25 insertions(+), 67 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 62855bae90..68d1e12676 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -61,25 +61,19 @@ def __init__( "auto_strategy", "batch_gradient_descent", "normal_equation" ] = "normal_equation", fit_intercept: bool = True, - l1_reg: Optional[float] = None, l2_reg: float = 0.0, max_iterations: int = 20, - warm_start: bool = False, - learn_rate: Optional[float] = None, learn_rate_strategy: Literal["line_search", "constant"] = "line_search", early_stop: bool = True, min_rel_progress: float = 0.01, - ls_init_learn_rate: Optional[float] = None, + ls_init_learn_rate: float = 0.1, calculate_p_values: bool = False, enable_global_explain: bool = False, ): self.optimize_strategy = optimize_strategy self.fit_intercept = fit_intercept - self.l1_reg = l1_reg self.l2_reg = l2_reg self.max_iterations = max_iterations - self.warm_start = warm_start - self.learn_rate = learn_rate self.learn_rate_strategy = learn_rate_strategy self.early_stop = early_stop self.min_rel_progress = min_rel_progress @@ -105,21 +99,17 @@ def _from_bq( for bf_param, bf_value in dummy_linear.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learn_rate", "ls_init_learn_rate"] - else type(bf_value)(last_fitting[bqml_param]) - ) + kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) new_linear_regression = cls(**kwargs) new_linear_regression._bqml_model = core.BqmlModel(session, model) return new_linear_regression 
@property - def _bqml_options(self) -> dict: + def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: """The model options as they will be set for BQML""" - options = { + # TODO: Support l1_reg, warm_start, and learn_rate with error catching. + return { "model_type": "LINEAR_REG", "data_split_method": "NO_SPLIT", "optimize_strategy": self.optimize_strategy, @@ -129,20 +119,10 @@ def _bqml_options(self) -> dict: "learn_rate_strategy": self.learn_rate_strategy, "early_stop": self.early_stop, "min_rel_progress": self.min_rel_progress, + "ls_init_learn_rate": self.ls_init_learn_rate, "calculate_p_values": self.calculate_p_values, "enable_global_explain": self.enable_global_explain, } - if self.l1_reg is not None: - options["l1_reg"] = self.l1_reg - if self.learn_rate is not None: - options["learn_rate"] = self.learn_rate - if self.ls_init_learn_rate is not None: - options["ls_init_learn_rate"] = self.ls_init_learn_rate - # Even presenting warm_start returns error for NORMAL_EQUATION optimizer - if self.warm_start is True: - options["warm_start"] = self.warm_start - - return options def _fit( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 807fadc06a..fa74458e77 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -38,9 +38,7 @@ def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: inner = ", ".join([self.encode_value(x) for x in v]) return f"[{inner}]" else: - raise ValueError( - f"Unexpected value type {type(v)}. {constants.FEEDBACK_LINK}" - ) + raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 03fed00770..a0f4182e6f 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -60,11 +60,9 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False - assert reloaded_model.l1_reg is None assert reloaded_model.l2_reg == 0.0 - assert reloaded_model.learn_rate is None assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.ls_init_learn_rate == 0.1 assert reloaded_model.max_iterations == 20 assert reloaded_model.min_rel_progress == 0.01 @@ -73,14 +71,7 @@ def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): model = bigframes.ml.linear_model.LinearRegression( - fit_intercept=False, - l2_reg=0.2, - min_rel_progress=0.02, - l1_reg=0.2, - max_iterations=30, - optimize_strategy="batch_gradient_descent", - learn_rate_strategy="constant", - learn_rate=0.2, + fit_intercept=False, l2_reg=0.1, min_rel_progress=0.01 ) df = penguins_df_default_index.dropna() @@ -101,12 +92,12 @@ def test_linear_regression_customized_params_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "mean_absolute_error": [240], - "mean_squared_error": [91197], - "mean_squared_log_error": [0.00573], - "median_absolute_error": [197], - "r2_score": [0.858], - "explained_variance": [0.8588], + "mean_absolute_error": [226.108411], + "mean_squared_error": [80459.668456], + "mean_squared_log_error": [0.00497], + 
"median_absolute_error": [171.618872], + "r2_score": [0.875415], + "explained_variance": [0.875417], }, dtype="Float64", ) @@ -118,21 +109,16 @@ def test_linear_regression_customized_params_fit_score( assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" assert reloaded_model.fit_intercept is False assert reloaded_model.calculate_p_values is False assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False - assert reloaded_model.l1_reg == 0.2 - assert reloaded_model.l2_reg == 0.2 - assert reloaded_model.ls_init_learn_rate is None - assert reloaded_model.max_iterations == 30 - assert reloaded_model.min_rel_progress == 0.02 - assert reloaded_model.learn_rate_strategy == "CONSTANT" - assert reloaded_model.learn_rate == 0.2 - - -# TODO(garrettwu): add tests for param warm_start. Requires a trained model. + assert reloaded_model.l2_reg == 0.1 + assert reloaded_model.learn_rate_strategy == "line_search" + assert reloaded_model.ls_init_learn_rate == 0.1 + assert reloaded_model.max_iterations == 20 + assert reloaded_model.min_rel_progress == 0.01 def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 8996a9c77f..d63bc7aaa1 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -105,7 +105,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -115,7 +115,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n 
calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 39012cbe08..ad2c872468 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -71,24 +71,18 @@ class LinearRegression(RegressorMixin, LinearModel): Default ``True``. Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). - l1_reg (float or None, default None): - The amount of L1 regularization applied. Default to None. Can't be set in "normal_equation" mode. If unset, value 0 is used. l2_reg (float, default 0.0): The amount of L2 regularization applied. Default to 0. max_iterations (int, default 20): The maximum number of training iterations or steps. Default to 20. - warm_start (bool, default False): - Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run. Default to False. - learn_rate (float or None, default None): - The learn rate for gradient descent when learn_rate_strategy='constant'. If unset, value 0.1 is used. If learn_rate_strategy='line_search', an error is returned. learn_rate_strategy (str, default "line_search"): The strategy for specifying the learning rate during training. Default to "line_search". early_stop (bool, default True): Whether training should stop after the first iteration in which the relative loss improvement is less than the value specified for min_rel_progress. Default to True. min_rel_progress (float, default 0.01): The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. - ls_init_learn_rate (float or None, default None): - Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. + ls_init_learn_rate (float, default 0.1): + Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. Default to 0.1. calculate_p_values (bool, default False): Specifies whether to compute p-values and standard errors during training. Default to False. 
enable_global_explain (bool, default False): From f3371480627b78ff64cd0beba8fa707757440d5e Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Wed, 20 Mar 2024 21:21:46 +0000 Subject: [PATCH 07/11] skip op old pandas doesn't support --- tests/system/small/operations/test_datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index cfd103884b..ec8c23c4f7 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -272,6 +272,7 @@ def test_dt_strftime_time(): ("col_name",), DATETIME_COL_NAMES, ) +@skip_legacy_pandas def test_dt_normalize(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] From a158d536e8d85176f5879ea3f0dbcf87c1d45682 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Wed, 20 Mar 2024 23:28:11 +0000 Subject: [PATCH 08/11] format fix --- .../bigframes_vendored/pandas/core/arrays/datetimelike.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index deebb3fd82..586fa5ec4c 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -56,8 +56,7 @@ def normalize(self): start='2014-08-01 10:00', freq='h', periods=3, - tz='Asia/Calcutta') - ) # note timezones will be converted to UTC here + tz='Asia/Calcutta')) # note timezones will be converted to UTC here >>> s.dt.normalize() 0 2014-08-01 00:00:00+00:00 1 2014-08-01 00:00:00+00:00 From ef1ea8e7da8f1193f438c633b96a0f3d9b29696d Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 21 Mar 2024 00:56:28 +0000 Subject: [PATCH 09/11] docstring fix --- .../bigframes_vendored/pandas/core/arrays/datetimelike.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index 586fa5ec4c..ce65a9178d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -52,11 +52,7 @@ def normalize(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - >>> s = bpd.Series(pd.date_range( - start='2014-08-01 10:00', - freq='h', - periods=3, - tz='Asia/Calcutta')) # note timezones will be converted to UTC here + >>> s = bpd.Series(pd.date_range(start='2014-08-01 10:00', freq='h', periods=3, tz='Asia/Calcutta')) # note timezones will be converted to UTC here >>> s.dt.normalize() 0 2014-08-01 00:00:00+00:00 1 2014-08-01 00:00:00+00:00 From 40197beebe7090f6a565f18d28dc41f3b713be58 Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 21 Mar 2024 01:00:16 +0000 Subject: [PATCH 10/11] fix docstring --- .../bigframes_vendored/pandas/core/arrays/datetimelike.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index ce65a9178d..60ac19b818 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -52,7 +52,11 @@ def normalize(self): >>> import pandas as pd >>> import bigframes.pandas as bpd - 
>>> s = bpd.Series(pd.date_range(start='2014-08-01 10:00', freq='h', periods=3, tz='Asia/Calcutta')) # note timezones will be converted to UTC here + >>> s = bpd.Series(pd.date_range( + ... start='2014-08-01 10:00', + ... freq='h', + ... periods=3, + ... tz='Asia/Calcutta')) # note timezones will be converted to UTC here >>> s.dt.normalize() 0 2014-08-01 00:00:00+00:00 1 2014-08-01 00:00:00+00:00 From 04d21fe6ea415484e51dc72dbcdbc6ada6469bfa Mon Sep 17 00:00:00 2001 From: milkshakeiii Date: Thu, 21 Mar 2024 05:25:30 +0000 Subject: [PATCH 11/11] remove line --- tests/system/small/operations/test_datetimes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index ec8c23c4f7..d5100e7dc2 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -275,11 +275,10 @@ def test_dt_strftime_time(): @skip_legacy_pandas def test_dt_normalize(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs - bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.normalize().to_pandas() + bf_result = scalars_df[col_name].dt.normalize().to_pandas() pd_result = scalars_pandas_df[col_name].dt.normalize() assert_series_equal( - pd_result.astype(bf_series.dtype), # normalize preserves type + pd_result.astype(scalars_df[col_name].dtype), # normalize preserves type bf_result, )
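
Reviewer note (appended; not part of any patch above): a usage sketch of the Series.dt.normalize() accessor this series adds. The call pattern and expected output are copied from the docstring finalized in PATCH 09-11; they have not been re-run against BigQuery, and executing the snippet assumes an authenticated BigQuery session configured for bigframes.

    # Usage sketch for the new Series.dt.normalize() accessor.
    # Expected output is taken from the vendored docstring added in this series.
    import pandas as pd
    import bigframes.pandas as bpd

    s = bpd.Series(
        pd.date_range(
            start="2014-08-01 10:00",
            freq="h",
            periods=3,
            tz="Asia/Calcutta",
        )
    )  # per the docstring, timezone-aware values come back as UTC timestamps

    normalized = s.dt.normalize()  # each element's time-of-day becomes 00:00:00
    print(normalized)
    # 0    2014-08-01 00:00:00+00:00
    # 1    2014-08-01 00:00:00+00:00
    # 2    2014-08-01 00:00:00+00:00
    # dtype: timestamp[us, tz=UTC][pyarrow]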
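
A second note, on PATCH 02's normalize_op_impl: the compiler records x.type(), truncates to day precision, and casts the result back so the output column keeps the input column's type (the test comment "# normalize preserves type" relies on exactly this). The standalone sketch below mirrors that truncate-then-cast pattern in plain ibis; the table and column names are hypothetical, and it assumes an ibis-framework version whose truncate()/type()/cast() APIs behave as in the version bigframes pins.

    # Truncate-then-cast pattern mirrored from normalize_op_impl (PATCH 02).
    import ibis
    import ibis.expr.types as ibis_types


    def normalize(x: ibis_types.Value) -> ibis_types.Value:
        # Truncate to the start of the day, then cast back to the original
        # type so the column's dtype is preserved end to end.
        result_type = x.type()
        return x.truncate("D").cast(result_type)


    events = ibis.table({"created_at": "timestamp"}, name="events")  # hypothetical table
    expr = events.select(created_day=normalize(events.created_at))
    print(expr)  # prints the unbound expression tree; no warehouse connection needed
    # print(ibis.to_sql(expr, dialect="bigquery"))  # SQL, if the BigQuery extras are installed

Presumably, without the final cast the truncated expression's type could differ from the input's; the explicit cast makes the dtype guarantee unconditional either way.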
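
Last note, on the LinearRegression changes that ride along in PATCH 05 and are backed out in PATCH 06 (they belong to the separately merged #464): the pattern worth keeping in mind there is that _bqml_options always emits the required options but adds l1_reg, learn_rate, ls_init_learn_rate, and warm_start only when explicitly set, since (per the patch's own comment) even presenting warm_start errors under the NORMAL_EQUATION optimizer. A minimal standalone sketch of that pattern follows, using an illustrative subset of option names rather than BQML's full set.

    # "Optional options only when set" pattern from _bqml_options (PATCH 05).
    # The option names below are a subset chosen for illustration.
    from typing import Optional


    def build_options(
        fit_intercept: bool = True,
        l2_reg: float = 0.0,
        l1_reg: Optional[float] = None,
        learn_rate: Optional[float] = None,
        warm_start: bool = False,
    ) -> dict:
        options: dict = {
            "model_type": "LINEAR_REG",
            "fit_intercept": fit_intercept,
            "l2_reg": l2_reg,
        }
        # Forward optional options only when the caller set them, so
        # unsupported combinations are never even presented to BQML.
        if l1_reg is not None:
            options["l1_reg"] = l1_reg
        if learn_rate is not None:
            options["learn_rate"] = learn_rate
        if warm_start:
            options["warm_start"] = warm_start
        return options


    print(build_options())                             # required options only
    print(build_options(l1_reg=0.2, warm_start=True))  # optional keys included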