make timegapsplit dataframe-agnostic

koaning · May 12, 2024 · 4eda210 · 4eda210
1 parent 28c102b
commit 4eda210
Show file tree

Hide file tree

Showing 2 changed files with 155 additions and 44 deletions.
diff --git a/sklego/model_selection.py b/sklego/model_selection.py
@@ -3,6 +3,7 @@
 from itertools import combinations
 from warnings import warn
 
+import narwhals as nw
 import numpy as np
 import pandas as pd
 from sklearn.exceptions import NotFittedError
@@ -82,11 +83,7 @@ def __init__(
         if (train_duration is not None) and (train_duration <= gap_duration):
             raise ValueError("gap_duration is longer than train_duration, it should be shorter.")
 
-        if not date_serie.index.is_unique:
-            raise ValueError("date_serie doesn't have a unique index")
-
-        self.date_serie = date_serie.copy()
-        self.date_serie = self.date_serie.rename("__date__")
+        self.date_serie = nw.from_native(date_serie, series_only=True).alias("__date__")
         self.train_duration = train_duration
         self.valid_duration = valid_duration
         self.gap_duration = gap_duration
@@ -103,8 +100,7 @@ def _join_date_and_x(self, X):
         X : pd.DataFrame
             Dataframe with the data to split
         """
-        X_index_df = pd.DataFrame(range(len(X)), columns=["np_index"], index=X.index)
-        X_index_df = X_index_df.join(self.date_serie)
+        X_index_df = nw.maybe_align_index(self.date_serie, X).to_frame().with_row_index("np_index")
 
         return X_index_df
 
@@ -125,9 +121,9 @@ def split(self, X, y=None, groups=None):
         tuple[np.ndarray, np.ndarray]
             Train and test indices of the same fold.
         """
-
+        X = nw.from_native(X, eager_only=True)
         X_index_df = self._join_date_and_x(X)
-        X_index_df = X_index_df.sort_values("__date__", ascending=True)
+        X_index_df = X_index_df.sort("__date__", descending=False)
 
         if len(X) != len(X_index_df):
             raise AssertionError(
@@ -167,23 +163,20 @@ def split(self, X, y=None, groups=None):
             if current_date + self.train_duration + time_shift + self.gap_duration > date_max:
                 break
 
-            X_train_df = X_index_df[
-                (X_index_df["__date__"] >= start_date) & (X_index_df["__date__"] < current_date + self.train_duration)
-            ]
-            X_valid_df = X_index_df[
-                (X_index_df["__date__"] >= current_date + self.train_duration + self.gap_duration)
-                & (
-                    X_index_df["__date__"]
-                    < current_date + self.train_duration + self.valid_duration + self.gap_duration
-                )
-            ]
+            X_train_df = X_index_df.filter(
+                (nw.col("__date__") >= start_date) & (nw.col("__date__") < current_date + self.train_duration)
+            )
+            X_valid_df = X_index_df.filter(
+                (nw.col("__date__") >= current_date + self.train_duration + self.gap_duration)
+                & (nw.col("__date__") < current_date + self.train_duration + self.valid_duration + self.gap_duration)
+            )
 
             current_date = current_date + time_shift
             if self.window == "rolling":
                 start_date = current_date
             yield (
-                X_train_df["np_index"].values,
-                X_valid_df["np_index"].values,
+                X_train_df["np_index"].to_numpy(),
+                X_valid_df["np_index"].to_numpy(),
             )
 
     def get_n_splits(self, X=None, y=None, groups=None):
@@ -210,42 +203,52 @@ def summary(self, X):
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             Dataframe with the data to split.
 
         Returns
         -------
-        pd.DataFrame
+        DataFrame
             Summary of all folds.
         """
         summary = []
+        X = nw.from_native(X, eager_only=True)
         X_index_df = self._join_date_and_x(X)
 
-        def get_split_info(X, indices, j, part, summary):
-            dates = X_index_df.iloc[indices]["__date__"]
+        summary = {
+            "Start date": [],
+            "End date": [],
+            "Period": [],
+            "Unique days": [],
+            "nbr samples": [],
+            "part": [],
+            "fold": [],
+        }
+        native_namespace = nw.get_native_namespace(X)
+
+        def update_split_info(indices, j, part, summary):
+            dates = X_index_df["__date__"][indices]
             mindate = dates.min()
             maxdate = dates.max()
+            n_unique = dates.n_unique()
 
-            s = pd.Series(
-                {
-                    "Start date": mindate,
-                    "End date": maxdate,
-                    "Period": pd.to_datetime(maxdate, format="%Y%m%d") - pd.to_datetime(mindate, format="%Y%m%d"),
-                    "Unique days": len(dates.unique()),
-                    "nbr samples": len(indices),
-                },
-                name=(j, part),
-            )
-            summary.append(s)
-            return summary
+            summary["Start date"].append(mindate)
+            summary["End date"].append(maxdate)
+            summary["Period"].append(maxdate - mindate)
+            summary["Unique days"].append(n_unique)
+            summary["nbr samples"].append(len(indices))
+            summary["part"].append(part)
+            summary["fold"].append(j)
 
         j = 0
-        for i in self.split(X):
-            summary = get_split_info(X, i[0], j, "train", summary)
-            summary = get_split_info(X, i[1], j, "valid", summary)
-            j = j + 1
-
-        return pd.DataFrame(summary)
+        for i in self.split(nw.to_native(X)):
+            update_split_info(native_namespace.Series(i[0]), j, "train", summary)
+            update_split_info(native_namespace.Series(i[1]), j, "valid", summary)
+            j += 1
+
+        result = nw.from_native(native_namespace.DataFrame(summary))
+        result = nw.maybe_set_index(result, ["fold", "part"])
+        return nw.to_native(result)
 
 
 def KlusterFoldValidation(**kwargs):

diff --git a/tests/test_model_selection/test_timegapsplit.py b/tests/test_model_selection/test_timegapsplit.py
@@ -3,7 +3,10 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 import pytest
+from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
+from polars.testing import assert_frame_equal as polars_assert_frame_equal
 from sklearn.linear_model import Lasso
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
@@ -44,6 +47,33 @@ def test_timegapsplit():
     assert valid_maxdate == datetime.datetime.strptime("2018-01-23", "%Y-%m-%d")
 
 
+def test_timegapsplit_polars():
+    # Polars doesn't have an index, so this class behaves a bit differently for
+    # index-less objects. We need to first ensure that `date_serie`, `X_train`,
+    # and `y_train` all have the same length.
+    date_serie = df["date"].loc[X_train.index]
+    cv = TimeGapSplit(
+        date_serie=pl.from_pandas(date_serie),
+        train_duration=timedelta(days=5),
+        valid_duration=timedelta(days=3),
+        gap_duration=timedelta(days=0),
+    )
+
+    for i, indices in enumerate(cv.split(pl.from_pandas(X_train), pl.from_pandas(y_train))):
+        train_mindate = df.loc[X_train.iloc[indices[0]].index]["date"].min()
+        train_maxdate = df.loc[X_train.iloc[indices[0]].index]["date"].max()
+        valid_mindate = df.loc[X_train.iloc[indices[1]].index]["date"].min()
+        valid_maxdate = df.loc[X_train.iloc[indices[1]].index]["date"].max()
+
+        assert train_mindate <= train_maxdate <= valid_mindate <= valid_maxdate
+
+    # regression test: check that the output of the last fold has not changed
+    assert train_mindate == datetime.datetime.strptime("2018-01-16", "%Y-%m-%d")
+    assert train_maxdate == datetime.datetime.strptime("2018-01-20", "%Y-%m-%d")
+    assert valid_mindate == datetime.datetime.strptime("2018-01-21", "%Y-%m-%d")
+    assert valid_maxdate == datetime.datetime.strptime("2018-01-23", "%Y-%m-%d")
+
+
 def test_timegapsplit_too_big_gap():
     try:
         TimeGapSplit(
@@ -151,5 +181,83 @@ def test_timegapsplit_summary():
     )
 
     summary = cv.summary(X_train)
-
     assert summary.shape == (12, 5)
+
+    expected_data = {
+        "Start date": [
+            datetime.datetime(2018, 1, 1, 0, 0),
+            datetime.datetime(2018, 1, 6, 0, 0),
+            datetime.datetime(2018, 1, 4, 0, 0),
+            datetime.datetime(2018, 1, 9, 0, 0),
+            datetime.datetime(2018, 1, 7, 0, 0),
+            datetime.datetime(2018, 1, 12, 0, 0),
+            datetime.datetime(2018, 1, 10, 0, 0),
+            datetime.datetime(2018, 1, 15, 0, 0),
+            datetime.datetime(2018, 1, 13, 0, 0),
+            datetime.datetime(2018, 1, 18, 0, 0),
+            datetime.datetime(2018, 1, 16, 0, 0),
+            datetime.datetime(2018, 1, 21, 0, 0),
+        ],
+        "End date": [
+            datetime.datetime(2018, 1, 5, 0, 0),
+            datetime.datetime(2018, 1, 8, 0, 0),
+            datetime.datetime(2018, 1, 8, 0, 0),
+            datetime.datetime(2018, 1, 11, 0, 0),
+            datetime.datetime(2018, 1, 11, 0, 0),
+            datetime.datetime(2018, 1, 14, 0, 0),
+            datetime.datetime(2018, 1, 14, 0, 0),
+            datetime.datetime(2018, 1, 17, 0, 0),
+            datetime.datetime(2018, 1, 17, 0, 0),
+            datetime.datetime(2018, 1, 20, 0, 0),
+            datetime.datetime(2018, 1, 20, 0, 0),
+            datetime.datetime(2018, 1, 23, 0, 0),
+        ],
+        "Period": [
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+            datetime.timedelta(days=4),
+            datetime.timedelta(days=2),
+        ],
+        "Unique days": [5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3],
+        "nbr samples": [5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3],
+        "part": [
+            "train",
+            "valid",
+            "train",
+            "valid",
+            "train",
+            "valid",
+            "train",
+            "valid",
+            "train",
+            "valid",
+            "train",
+            "valid",
+        ],
+        "fold": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
+    }
+    expected = pd.DataFrame(expected_data).set_index(["fold", "part"])
+    pandas_assert_frame_equal(summary, expected)
+
+    # Polars doesn't have an index, so this class behaves a bit differently for
+    # index-less objects. We need to ensure that `date_serie` and `X_train` have
+    # the same length.
+    date_serie = df["date"].loc[X_train.index]
+    cv = TimeGapSplit(
+        date_serie=pl.from_pandas(date_serie),
+        train_duration=timedelta(days=5),
+        valid_duration=timedelta(days=3),
+        gap_duration=timedelta(days=0),
+    )
+    summary = cv.summary(pl.from_pandas(X_train))
+
+    expected = pl.DataFrame(expected_data)
+    polars_assert_frame_equal(summary, expected)