Skip to content

Commit

Permalink
make timegapsplit dataframe-agnostic
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed May 12, 2024
1 parent 28c102b commit 4eda210
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 44 deletions.
89 changes: 46 additions & 43 deletions sklego/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from itertools import combinations
from warnings import warn

import narwhals as nw
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
Expand Down Expand Up @@ -82,11 +83,7 @@ def __init__(
if (train_duration is not None) and (train_duration <= gap_duration):
raise ValueError("gap_duration is longer than train_duration, it should be shorter.")

if not date_serie.index.is_unique:
raise ValueError("date_serie doesn't have a unique index")

self.date_serie = date_serie.copy()
self.date_serie = self.date_serie.rename("__date__")
self.date_serie = nw.from_native(date_serie, series_only=True).alias("__date__")
self.train_duration = train_duration
self.valid_duration = valid_duration
self.gap_duration = gap_duration
Expand All @@ -103,8 +100,7 @@ def _join_date_and_x(self, X):
X : pd.DataFrame
Dataframe with the data to split
"""
X_index_df = pd.DataFrame(range(len(X)), columns=["np_index"], index=X.index)
X_index_df = X_index_df.join(self.date_serie)
X_index_df = nw.maybe_align_index(self.date_serie, X).to_frame().with_row_index("np_index")

return X_index_df

Expand All @@ -125,9 +121,9 @@ def split(self, X, y=None, groups=None):
tuple[np.ndarray, np.ndarray]
Train and test indices of the same fold.
"""

X = nw.from_native(X, eager_only=True)
X_index_df = self._join_date_and_x(X)
X_index_df = X_index_df.sort_values("__date__", ascending=True)
X_index_df = X_index_df.sort("__date__", descending=False)

if len(X) != len(X_index_df):
raise AssertionError(
Expand Down Expand Up @@ -167,23 +163,20 @@ def split(self, X, y=None, groups=None):
if current_date + self.train_duration + time_shift + self.gap_duration > date_max:
break

X_train_df = X_index_df[
(X_index_df["__date__"] >= start_date) & (X_index_df["__date__"] < current_date + self.train_duration)
]
X_valid_df = X_index_df[
(X_index_df["__date__"] >= current_date + self.train_duration + self.gap_duration)
& (
X_index_df["__date__"]
< current_date + self.train_duration + self.valid_duration + self.gap_duration
)
]
X_train_df = X_index_df.filter(
(nw.col("__date__") >= start_date) & (nw.col("__date__") < current_date + self.train_duration)
)
X_valid_df = X_index_df.filter(
(nw.col("__date__") >= current_date + self.train_duration + self.gap_duration)
& (nw.col("__date__") < current_date + self.train_duration + self.valid_duration + self.gap_duration)
)

current_date = current_date + time_shift
if self.window == "rolling":
start_date = current_date
yield (
X_train_df["np_index"].values,
X_valid_df["np_index"].values,
X_train_df["np_index"].to_numpy(),
X_valid_df["np_index"].to_numpy(),
)

def get_n_splits(self, X=None, y=None, groups=None):
Expand All @@ -210,42 +203,52 @@ def summary(self, X):
Parameters
----------
X : pd.DataFrame
X : DataFrame
Dataframe with the data to split.
Returns
-------
pd.DataFrame
DataFrame
Summary of all folds.
"""
summary = []
X = nw.from_native(X, eager_only=True)
X_index_df = self._join_date_and_x(X)

def get_split_info(X, indices, j, part, summary):
dates = X_index_df.iloc[indices]["__date__"]
summary = {
"Start date": [],
"End date": [],
"Period": [],
"Unique days": [],
"nbr samples": [],
"part": [],
"fold": [],
}
native_namespace = nw.get_native_namespace(X)

def update_split_info(indices, j, part, summary):
dates = X_index_df["__date__"][indices]
mindate = dates.min()
maxdate = dates.max()
n_unique = dates.n_unique()

s = pd.Series(
{
"Start date": mindate,
"End date": maxdate,
"Period": pd.to_datetime(maxdate, format="%Y%m%d") - pd.to_datetime(mindate, format="%Y%m%d"),
"Unique days": len(dates.unique()),
"nbr samples": len(indices),
},
name=(j, part),
)
summary.append(s)
return summary
summary["Start date"].append(mindate)
summary["End date"].append(maxdate)
summary["Period"].append(maxdate - mindate)
summary["Unique days"].append(n_unique)
summary["nbr samples"].append(len(indices))
summary["part"].append(part)
summary["fold"].append(j)

j = 0
for i in self.split(X):
summary = get_split_info(X, i[0], j, "train", summary)
summary = get_split_info(X, i[1], j, "valid", summary)
j = j + 1

return pd.DataFrame(summary)
for i in self.split(nw.to_native(X)):
update_split_info(native_namespace.Series(i[0]), j, "train", summary)
update_split_info(native_namespace.Series(i[1]), j, "valid", summary)
j += 1

result = nw.from_native(native_namespace.DataFrame(summary))
result = nw.maybe_set_index(result, ["fold", "part"])
return nw.to_native(result)


def KlusterFoldValidation(**kwargs):
Expand Down
110 changes: 109 additions & 1 deletion tests/test_model_selection/test_timegapsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pandas_assert_frame_equal
from polars.testing import assert_frame_equal as polars_assert_frame_equal
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -44,6 +47,33 @@ def test_timegapsplit():
assert valid_maxdate == datetime.datetime.strptime("2018-01-23", "%Y-%m-%d")


def test_timegapsplit_polars():
# Exercises TimeGapSplit.split when `date_serie`, `X` and `y` are all passed
# as Polars objects (each one is converted with `pl.from_pandas`).
#
# Polars doesn't have an index, so this class behaves a bit differently for
# index-less objects. We need to first ensure that `date_serie`, `X_train`,
# and `y_train` all have the same length.
# Restrict the date column to the index labels of `X_train`.
date_serie = df["date"].loc[X_train.index]
cv = TimeGapSplit(
date_serie=pl.from_pandas(date_serie),
train_duration=timedelta(days=5),
valid_duration=timedelta(days=3),
gap_duration=timedelta(days=0),
)

# indices[0] / indices[1] are used below as positional (iloc) train / valid
# indices into the pandas `X_train`; the matching dates are looked up in `df`
# through the pandas index labels of the selected rows.
# NOTE(review): the `enumerate` counter `i` is never read in this function.
for i, indices in enumerate(cv.split(pl.from_pandas(X_train), pl.from_pandas(y_train))):
train_mindate = df.loc[X_train.iloc[indices[0]].index]["date"].min()
train_maxdate = df.loc[X_train.iloc[indices[0]].index]["date"].max()
valid_mindate = df.loc[X_train.iloc[indices[1]].index]["date"].min()
valid_maxdate = df.loc[X_train.iloc[indices[1]].index]["date"].max()

# Train dates must not come after validation dates.
assert train_mindate <= train_maxdate <= valid_mindate <= valid_maxdate

# regression testing, check if output changes of the last fold
assert train_mindate == datetime.datetime.strptime("2018-01-16", "%Y-%m-%d")
assert train_maxdate == datetime.datetime.strptime("2018-01-20", "%Y-%m-%d")
assert valid_mindate == datetime.datetime.strptime("2018-01-21", "%Y-%m-%d")
assert valid_maxdate == datetime.datetime.strptime("2018-01-23", "%Y-%m-%d")


def test_timegapsplit_too_big_gap():
try:
TimeGapSplit(
Expand Down Expand Up @@ -151,5 +181,83 @@ def test_timegapsplit_summary():
)

summary = cv.summary(X_train)

assert summary.shape == (12, 5)

expected_data = {
"Start date": [
datetime.datetime(2018, 1, 1, 0, 0),
datetime.datetime(2018, 1, 6, 0, 0),
datetime.datetime(2018, 1, 4, 0, 0),
datetime.datetime(2018, 1, 9, 0, 0),
datetime.datetime(2018, 1, 7, 0, 0),
datetime.datetime(2018, 1, 12, 0, 0),
datetime.datetime(2018, 1, 10, 0, 0),
datetime.datetime(2018, 1, 15, 0, 0),
datetime.datetime(2018, 1, 13, 0, 0),
datetime.datetime(2018, 1, 18, 0, 0),
datetime.datetime(2018, 1, 16, 0, 0),
datetime.datetime(2018, 1, 21, 0, 0),
],
"End date": [
datetime.datetime(2018, 1, 5, 0, 0),
datetime.datetime(2018, 1, 8, 0, 0),
datetime.datetime(2018, 1, 8, 0, 0),
datetime.datetime(2018, 1, 11, 0, 0),
datetime.datetime(2018, 1, 11, 0, 0),
datetime.datetime(2018, 1, 14, 0, 0),
datetime.datetime(2018, 1, 14, 0, 0),
datetime.datetime(2018, 1, 17, 0, 0),
datetime.datetime(2018, 1, 17, 0, 0),
datetime.datetime(2018, 1, 20, 0, 0),
datetime.datetime(2018, 1, 20, 0, 0),
datetime.datetime(2018, 1, 23, 0, 0),
],
"Period": [
datetime.timedelta(days=4),
datetime.timedelta(days=2),
datetime.timedelta(days=4),
datetime.timedelta(days=2),
datetime.timedelta(days=4),
datetime.timedelta(days=2),
datetime.timedelta(days=4),
datetime.timedelta(days=2),
datetime.timedelta(days=4),
datetime.timedelta(days=2),
datetime.timedelta(days=4),
datetime.timedelta(days=2),
],
"Unique days": [5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3],
"nbr samples": [5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3],
"part": [
"train",
"valid",
"train",
"valid",
"train",
"valid",
"train",
"valid",
"train",
"valid",
"train",
"valid",
],
"fold": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
}
expected = pd.DataFrame(expected_data).set_index(["fold", "part"])
pandas_assert_frame_equal(summary, expected)

# Polars doesn't have an index, so this class behaves a bit differently for
# index-less objects. We need to ensure that `date_serie` and `X_train` have
# the same length.
date_serie = df["date"].loc[X_train.index]
cv = TimeGapSplit(
date_serie=pl.from_pandas(date_serie),
train_duration=timedelta(days=5),
valid_duration=timedelta(days=3),
gap_duration=timedelta(days=0),
)
summary = cv.summary(pl.from_pandas(X_train))

expected = pl.DataFrame(expected_data)
polars_assert_frame_equal(summary, expected)

0 comments on commit 4eda210

Please sign in to comment.