From ca6f778aaf2b81664af16d92344f4696c0e62b27 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Sun, 21 Jan 2024 19:05:22 +0000
Subject: [PATCH 1/5] feat: add ARIMA_EVALUATE options in forecasting models

---
 bigframes/ml/core.py                      |  8 ++++
 bigframes/ml/forecasting.py               | 25 ++++++++++-
 bigframes/ml/sql.py                       |  6 +++
 tests/system/large/ml/test_forecasting.py | 41 +++++++++++++++++-
 tests/system/small/ml/test_forecasting.py | 51 +++++++++++++++++++++++
 tests/unit/ml/test_sql.py                 | 13 ++++++
 6 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 1e2224c9bc..f8f97ed301 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -136,6 +136,14 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
         return self._session.read_gbq(sql)
 
+    def arima_evaluate(self, show_all_candidate_models: bool = False):
+        # TODO: validate input data schema
+        sql = self._model_manipulation_sql_generator.ml_arima_evaluate(
+            show_all_candidate_models
+        )
+
+        return self._session.read_gbq(sql)
+
     def centroids(self) -> bpd.DataFrame:
         assert self._model.model_type == "KMEANS"
 
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
index 03b9857cc5..5d6de82d1a 100644
--- a/bigframes/ml/forecasting.py
+++ b/bigframes/ml/forecasting.py
@@ -123,15 +123,21 @@ def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
         y: Union[bpd.DataFrame, bpd.Series],
+        verbose: bool = False,
+        show_all_candidate_models: bool = False,
     ) -> bpd.DataFrame:
         """Calculate evaluation metrics of the model.
 
         .. note::
 
-            Output matches that of the BigQuery ML.EVALUTE function.
+            If `verbose = False`. Output matches that of the BigQuery ML.EVALUTE function.
             See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models
             for the outputs relevant to this model type.
 
+            If `verbose = True`. Output matches that of the BigQuery ML.ARIMA_EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-arima-evaluate
+            for the outputs relevant to this model type.
+
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame only contains 1 column as
@@ -140,16 +146,31 @@ def score(
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame only contains 1 column as
                 evaluation numeric values.
+            verbose (bool, default to False):
+                Whether to report the metrics (log_likelihood, AIC, variance...)
+                for ARIMA candidate models characterized by different (p, d,
+                q, has_drift) tuples. Default to False.
+            show_all_candidate_models (bool, default to False):
+                Whether to show evaluation metrics or an error message for either
+                all candidate models or for only the best model with the lowest
+                AIC. It is only valid when verbose is set to True. Default to False.
 
         Returns:
             bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
         """
+        if verbose is False and show_all_candidate_models is True:
+            raise ValueError(
+                "show_all_candidate_models variable is only valid when verbose is True."
+            )
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
         X, y = utils.convert_to_dataframe(X, y)
 
         input_data = X.join(y, how="outer")
-        return self._bqml_model.evaluate(input_data)
+        if verbose is False:
+            return self._bqml_model.evaluate(input_data)
+        else:
+            return self._bqml_model.arima_evaluate(show_all_candidate_models)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus:
         """Save the model to BigQuery.
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 25caaf1ac6..152f881ec0 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -260,6 +260,12 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str:
             return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
   ({source_sql}))"""
 
+    # ML evaluation TVFs
+    def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str:
+        """Encode ML.ARIMA_EVALUATE for BQML"""
+        return f"""SELECT * FROM ML.ARIMA_EVALUATE(MODEL `{self._model_name}`,
+            STRUCT({show_all_candidate_models} AS show_all_candidate_models))"""
+
     def ml_centroids(self) -> str:
         """Encode ML.CENTROIDS for BQML"""
         return f"""SELECT * FROM ML.CENTROIDS(MODEL `{self._model_name}`)"""
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
index 33b835e852..b35029baa9 100644
--- a/tests/system/large/ml/test_forecasting.py
+++ b/tests/system/large/ml/test_forecasting.py
@@ -16,6 +16,20 @@
 
 from bigframes.ml import forecasting
 
+ARIMA_EVALUATE_OUTPUT_COL = [
+    "non_seasonal_p",
+    "non_seasonal_d",
+    "non_seasonal_q",
+    "log_likelihood",
+    "AIC",
+    "variance",
+    "seasonal_periods",
+    "has_holiday_effect",
+    "has_spikes_and_dips",
+    "has_step_changes",
+    "error_message",
+]
+
 
 def test_arima_plus_model_fit_score(
     time_series_df_default_index, dataset_id, new_time_series_df
@@ -42,7 +56,30 @@ def test_arima_plus_model_fit_score(
     pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1)
 
     # save, load to ensure configuration was kept
-    reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True)
+    reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True)
+    assert (
+        f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name
+    )
+
+
+def test_arima_plus_model_fit_score_verbose(
+    time_series_df_default_index, dataset_id, new_time_series_df
+):
+    model = forecasting.ARIMAPlus()
+    X_train = time_series_df_default_index[["parsed_date"]]
+    y_train = time_series_df_default_index[["total_visits"]]
+    model.fit(X_train, y_train)
+
+    result = model.score(
+        new_time_series_df[["parsed_date"]],
+        new_time_series_df[["total_visits"]],
+        verbose=True,
+    )
+    assert result.shape == (1, 12)
+    assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+
+    # save, load to ensure configuration was kept
+    reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True)
     assert (
-        f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name
+        f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name
     )
diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
index be8d9c2bac..618cf5a639 100644
--- a/tests/system/small/ml/test_forecasting.py
+++ b/tests/system/small/ml/test_forecasting.py
@@ -20,6 +20,20 @@
 
 from bigframes.ml import forecasting
 
+ARIMA_EVALUATE_OUTPUT_COL = [
+    "non_seasonal_p",
+    "non_seasonal_d",
+    "non_seasonal_q",
+    "log_likelihood",
+    "AIC",
+    "variance",
+    "seasonal_periods",
+    "has_holiday_effect",
+    "has_spikes_and_dips",
+    "has_step_changes",
+    "error_message",
+]
+
 
 def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPlus):
     utc = pytz.utc
@@ -104,6 +118,31 @@ def test_model_score(
     )
 
 
+def test_model_score_verbose(
+    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+):
+    result = time_series_arima_plus_model.score(
+        new_time_series_df[["parsed_date"]],
+        new_time_series_df[["total_visits"]],
+        verbose=True,
+    )
+    assert result.shape == (1, 12)
+    assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+
+
+def test_model_score_verbose_show_all_candidates(
+    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+):
+    result = time_series_arima_plus_model.score(
+        new_time_series_df[["parsed_date"]],
+        new_time_series_df[["total_visits"]],
+        verbose=True,
+        show_all_candidate_models=True,
+    )
+    assert result.shape[0] > 1
+    assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+
+
 def test_model_score_series(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
@@ -126,3 +165,15 @@ def test_model_score_series(
         rtol=0.1,
         check_index_type=False,
     )
+
+
+def test_model_score_series_verbose(
+    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+):
+    result = time_series_arima_plus_model.score(
+        new_time_series_df["parsed_date"],
+        new_time_series_df["total_visits"],
+        verbose=True,
+    )
+    assert result.shape == (1, 12)
+    assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 73d19cc0bb..37cc33d33e 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -273,6 +273,19 @@ def test_ml_evaluate_produces_correct_sql(
     )
 
 
+def test_ml_arima_evaluate_produces_correct_sql(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+):
+    sql = model_manipulation_sql_generator.ml_arima_evaluate(
+        show_all_candidate_models=True
+    )
+    assert (
+        sql
+        == """SELECT * FROM ML.ARIMA_EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`,
+            STRUCT(True AS show_all_candidate_models))"""
+    )
+
+
 def test_ml_evaluate_no_source_produces_correct_sql(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):

From 191ffea395b6cfcfc14631aaa8121dcc9f8d54f9 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Wed, 24 Jan 2024 03:40:35 +0000
Subject: [PATCH 2/5] feat: add summary method

---
 bigframes/ml/forecasting.py               | 44 ++++++++++++++---------
 tests/system/large/ml/test_forecasting.py | 10 ++----
 tests/system/small/ml/test_forecasting.py | 15 +++-----
 3 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
index 5d6de82d1a..e762cbc627 100644
--- a/bigframes/ml/forecasting.py
+++ b/bigframes/ml/forecasting.py
@@ -130,14 +130,10 @@ def score(
 
         .. note::
 
-            If `verbose = False`. Output matches that of the BigQuery ML.EVALUTE function.
+            Output matches that of the BigQuery ML.EVALUATE function.
             See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models
             for the outputs relevant to this model type.
 
-            If `verbose = True`. Output matches that of the BigQuery ML.ARIMA_EVALUATE function.
-            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-arima-evaluate
-            for the outputs relevant to this model type.
-
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame only contains 1 column as
@@ -146,14 +142,6 @@ def score(
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame only contains 1 column as
                 evaluation numeric values.
-            verbose (bool, default to False):
-                Whether to report the metrics (log_likelihood, AIC, variance...)
-                for ARIMA candidate models characterized by different (p, d,
-                q, has_drift) tuples. Default to False.
-            show_all_candidate_models (bool, default to False):
-                Whether to show evaluation metrics or an error message for either
-                all candidate models or for only the best model with the lowest
-                AIC. It is only valid when verbose is set to True. Default to False.
 
         Returns:
             bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
@@ -167,10 +155,32 @@ def score(
         X, y = utils.convert_to_dataframe(X, y)
 
         input_data = X.join(y, how="outer")
-        if verbose is False:
-            return self._bqml_model.evaluate(input_data)
-        else:
-            return self._bqml_model.arima_evaluate(show_all_candidate_models)
+        return self._bqml_model.evaluate(input_data)
+
+    def summary(
+        self,
+        show_all_candidate_models: bool = False,
+    ) -> bpd.DataFrame:
+        """Summary of the evaluation metrics of the time series model.
+
+        .. note::
+
+            Output matches that of the BigQuery ML.ARIMA_EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-arima-evaluate
+            for the outputs relevant to this model type.
+
+        Args:
+            show_all_candidate_models (bool, default to False):
+                Whether to show evaluation metrics or an error message for either
+                all candidate models or for only the best model with the lowest
+                AIC. Default to False.
+
+        Returns:
+            bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+        return self._bqml_model.arima_evaluate(show_all_candidate_models)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus:
         """Save the model to BigQuery.
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
index b35029baa9..2bb136b0f2 100644
--- a/tests/system/large/ml/test_forecasting.py
+++ b/tests/system/large/ml/test_forecasting.py
@@ -62,19 +62,13 @@ def test_arima_plus_model_fit_score(
     )
 
 
-def test_arima_plus_model_fit_score_verbose(
-    time_series_df_default_index, dataset_id, new_time_series_df
-):
+def test_arima_plus_model_fit_summary(time_series_df_default_index, dataset_id):
     model = forecasting.ARIMAPlus()
     X_train = time_series_df_default_index[["parsed_date"]]
     y_train = time_series_df_default_index[["total_visits"]]
     model.fit(X_train, y_train)
 
-    result = model.score(
-        new_time_series_df[["parsed_date"]],
-        new_time_series_df[["total_visits"]],
-        verbose=True,
-    )
+    result = model.summary()
     assert result.shape == (1, 12)
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
 
diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
index 618cf5a639..9d80826aa4 100644
--- a/tests/system/small/ml/test_forecasting.py
+++ b/tests/system/small/ml/test_forecasting.py
@@ -118,25 +118,18 @@ def test_model_score(
     )
 
 
-def test_model_score_verbose(
+def test_model_summary(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
-    result = time_series_arima_plus_model.score(
-        new_time_series_df[["parsed_date"]],
-        new_time_series_df[["total_visits"]],
-        verbose=True,
-    )
+    result = time_series_arima_plus_model.summary()
     assert result.shape == (1, 12)
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
 
 
-def test_model_score_verbose_show_all_candidates(
+def test_model_summary_show_all_candidates(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
-    result = time_series_arima_plus_model.score(
-        new_time_series_df[["parsed_date"]],
-        new_time_series_df[["total_visits"]],
-        verbose=True,
+    result = time_series_arima_plus_model.summary(
         show_all_candidate_models=True,
     )
     assert result.shape[0] > 1

From 3bd6c6023b26dbc925a18421d8b0484db0d8155e Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Wed, 24 Jan 2024 03:43:33 +0000
Subject: [PATCH 3/5] fix minor errors

---
 bigframes/ml/forecasting.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
index e762cbc627..8d448fbace 100644
--- a/bigframes/ml/forecasting.py
+++ b/bigframes/ml/forecasting.py
@@ -123,8 +123,6 @@ def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
         y: Union[bpd.DataFrame, bpd.Series],
-        verbose: bool = False,
-        show_all_candidate_models: bool = False,
     ) -> bpd.DataFrame:
         """Calculate evaluation metrics of the model.
 
@@ -146,10 +144,6 @@ def score(
         Returns:
             bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
         """
-        if verbose is False and show_all_candidate_models is True:
-            raise ValueError(
-                "show_all_candidate_models variable is only valid when verbose is True."
-            )
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
         X, y = utils.convert_to_dataframe(X, y)

From e06787c6909a143f0ac67766a5528ebf05a45589 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Wed, 24 Jan 2024 03:52:17 +0000
Subject: [PATCH 4/5] fix failed tests

---
 tests/system/small/ml/test_forecasting.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
index 9d80826aa4..4726d5ab21 100644
--- a/tests/system/small/ml/test_forecasting.py
+++ b/tests/system/small/ml/test_forecasting.py
@@ -160,13 +160,9 @@ def test_model_score_series(
     )
 
 
-def test_model_score_series_verbose(
+def test_model_summary_series(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
-    result = time_series_arima_plus_model.score(
-        new_time_series_df["parsed_date"],
-        new_time_series_df["total_visits"],
-        verbose=True,
-    )
+    result = time_series_arima_plus_model.summary()
     assert result.shape == (1, 12)
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)

From c7ff0b379e3960f12b50c9ec3b201788a62ae7e6 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Wed, 24 Jan 2024 22:03:30 +0000
Subject: [PATCH 5/5] address comments

---
 bigframes/ml/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index f8f97ed301..7c156b4cb7 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -137,7 +137,6 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
         return self._session.read_gbq(sql)
 
     def arima_evaluate(self, show_all_candidate_models: bool = False):
-        # TODO: validate input data schema
         sql = self._model_manipulation_sql_generator.ml_arima_evaluate(
             show_all_candidate_models
         )