googleapis · GarrettWu · Sep 26, 2024 · Sep 25, 2024 · Sep 26, 2024 · Genesis929
@@ -18,9 +18,11 @@
 
 
 import inspect
-from typing import cast, Generator, List, Union
+import time
+from typing import cast, Generator, List, Optional, Union
 
 import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split
+import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation
 
 from bigframes.core import log_adapter
 from bigframes.ml import utils
@@ -147,3 +149,37 @@ def split(
             yield utils.convert_to_types(
                 [X_train, X_test, y_train, y_test], [X, X, y, y]
             )
+
+
+def cross_validate(
+    estimator,
+    X: Union[bpd.DataFrame, bpd.Series],
+    y: Union[bpd.DataFrame, bpd.Series, None] = None,
+    *,
+    cv: Optional[Union[int, KFold]] = None,
+) -> dict[str, list]:
+    if cv is None:
+        cv = KFold(n_splits=5)
+    elif isinstance(cv, int):
+        cv = KFold(n_splits=cv)
+
+    result: dict[str, list] = {"test_score": [], "fit_time": [], "score_time": []}
+    for X_train, X_test, y_train, y_test in cv.split(X, y):  # type: ignore
+        fit_start_time = time.time()
+        estimator.fit(X_train, y_train)
+        fit_time = time.time() - fit_start_time
+
+        score_start_time = time.time()
+        score = estimator.score(X_test, y_test)
+        score_time = time.time() - score_start_time
+
+        result["test_score"].append(score)
+        result["fit_time"].append(fit_time)
+        result["score_time"].append(score_time)
+
+    return result
+
+
+# Take the user-facing docstring for `cross_validate` from the vendored
+# `_validation.cross_validate` stub instead of duplicating it in this module.
+cross_validate.__doc__ = inspect.getdoc(
+    vendored_model_selection_validation.cross_validate
+)
@@ -0,0 +1,64 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from bigframes.ml import linear_model, model_selection
+from tests.system import utils
+
+
+@pytest.mark.parametrize(
+    ("cv", "n_fold"),
+    (
+        pytest.param(
+            None,
+            5,
+        ),
+        pytest.param(
+            4,
+            4,
+        ),
+        pytest.param(
+            model_selection.KFold(3),
+            3,
+        ),
+    ),
+)
+def test_cross_validate(penguins_df_default_index, cv, n_fold):
+    model = linear_model.LinearRegression()
+    df = penguins_df_default_index.dropna()
+    X = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+        ]
+    ]
+    y = df["body_mass_g"]
+
+    cv_results = model_selection.cross_validate(model, X, y, cv=cv)
+
+    assert "test_score" in cv_results
+    assert "fit_time" in cv_results
+    assert "score_time" in cv_results
+
+    assert len(cv_results["test_score"]) == n_fold
+    assert len(cv_results["fit_time"]) == n_fold
+    assert len(cv_results["score_time"]) == n_fold
+
+    utils.check_pandas_df_schema_and_index(
+        cv_results["test_score"][0].to_pandas(),
+        columns=utils.ML_REGRESSION_METRICS,
+        index=1,
+    )
@@ -0,0 +1,46 @@
+"""
+The :mod:`sklearn.model_selection._validation` module includes classes and
+functions to validate the model.
+"""
+
+# Author: Alexandre Gramfort <[email protected]>
+#         Gael Varoquaux <[email protected]>
+#         Olivier Grisel <[email protected]>
+#         Raghav RV <[email protected]>
+#         Michal Karbownik <[email protected]>
+# License: BSD 3 clause
+
+
+def cross_validate(estimator, X, y=None, *, cv=None):
+    """Evaluate metric(s) by cross-validation and also record fit/score times.
+
+    Args:
+        estimator:
+            bigframes.ml model that implements fit().
+            The object to use to fit the data.
+
+        X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+            The data to fit.
+
+        y (bigframes.dataframe.DataFrame, bigframes.series.Series or None):
+            The target variable to try to predict in the case of supervised learning. Defaults to None.
+
+        cv (int, bigframes.ml.model_selection.KFold or None):
+            Determines the cross-validation splitting strategy.
+            Possible inputs for cv are:
+
+            - None, to use the default 5-fold cross validation,
+            - int, to specify the number of folds in a `KFold`,
+            - bigframes.ml.model_selection.KFold instance.
+
+    Returns:
+        Dict[str, List]: A dict of arrays containing the score/time arrays for each scorer is returned. The keys for this ``dict`` are:
+
+            ``test_score``
+                The score array for test scores on each cv split.
+            ``fit_time``
+                The time for fitting the estimator on the train
+                set for each cv split.
+            ``score_time``
+                The time for scoring the estimator on the test set for each
+                cv split."""