######################################## Monotonic Spline #######################################
# Generates the three figures used by the "Monotonic Spline Transformer" section of the user guide.
# The `# --8<-- [start:...]` / `[end:...]` markers delimit the snippets that the docs include verbatim,
# so anything between a pair of markers is shown to readers as example code.
# NOTE(review): `_static_path` is not defined in this section; it is used by the preceding section too,
# so it is presumably defined earlier in this script — confirm.

# --8<-- [start:monotonic-spline]
import matplotlib.pylab as plt
import numpy as np

N = 1000

# Seed before drawing so the figures are reproducible; the order of the two random draws matters.
np.random.seed(42)
X = np.random.uniform(0, 1, size=N)
# Quadratic trend plus a sine wiggle and Gaussian noise.
y = X ** 2 + .1 * np.sin(20 * X) + 1 + .1 * np.random.randn(N)

plt.figure(figsize=(10, 4))
plt.scatter(X, y)
# --8<-- [end:monotonic-spline]

plt.savefig(_static_path / "monotonic-spline.png")
plt.clf()

# --8<-- [start:monotonic-spline-transform]
from sklego.preprocessing import MonotonicSplineTransformer

# Sort the inputs so that the row order of the transformed output follows increasing X.
X_plt = np.sort(X)

plt.figure(figsize=(10, 4))
tfm = MonotonicSplineTransformer(n_knots=10)
X_out = tfm.fit_transform(X_plt.reshape(-1, 1))
# One line is drawn per generated feature column.
plt.plot(X_out);
# --8<-- [end:monotonic-spline-transform]

plt.savefig(_static_path / "monotonic-spline-transform.png")
plt.clf()

# --8<-- [start:monotonic-spline-regr]
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.isotonic import IsotonicRegression
from sklego.preprocessing import MonotonicSplineTransformer

# Monotonic features followed by a Ridge restricted to non-negative coefficients (positive=True).
pipe = make_pipeline(
    MonotonicSplineTransformer(n_knots=10),
    Ridge(positive=True),
)
pipe.fit(X.reshape(-1, 1), y)

# Baseline to compare against; out_of_bounds="clip" lets it predict outside the training range.
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(X, y)

# The evaluation grid is wider than the [0, 1] range the training data was drawn from.
X_test = np.linspace(-0.2, 1.2, 100)[:, None]

plt.figure(figsize=(10, 4))
plt.plot(X_test[:, 0], pipe.predict(X_test), color="orange", linewidth=2, label="MonotonicSpline")
plt.plot(
    X_test[:, 0], iso.predict(X_test), color="green", linewidth=2, label="Isotonic"
)
plt.scatter(X, y, alpha=0.3, label="Data")
plt.legend()
# --8<-- [end:monotonic-spline-regr]

plt.savefig(_static_path / "monotonic-spline-regr.png")
plt.clf()
b/docs/_static/preprocessing/monotonic-spline-transform.png new file mode 100644 index 000000000..c2b40a300 Binary files /dev/null and b/docs/_static/preprocessing/monotonic-spline-transform.png differ diff --git a/docs/_static/preprocessing/monotonic-spline.png b/docs/_static/preprocessing/monotonic-spline.png new file mode 100644 index 000000000..1d8549564 Binary files /dev/null and b/docs/_static/preprocessing/monotonic-spline.png differ diff --git a/docs/_static/preprocessing/rbf-data.png b/docs/_static/preprocessing/rbf-data.png index 3a2288c1b..a79b577af 100644 Binary files a/docs/_static/preprocessing/rbf-data.png and b/docs/_static/preprocessing/rbf-data.png differ diff --git a/docs/_static/preprocessing/rbf-plot.png b/docs/_static/preprocessing/rbf-plot.png index b5aacf160..cc6b7e213 100644 Binary files a/docs/_static/preprocessing/rbf-plot.png and b/docs/_static/preprocessing/rbf-plot.png differ diff --git a/docs/_static/preprocessing/rbf-regr.png b/docs/_static/preprocessing/rbf-regr.png index 2acb5129d..4547c5d07 100644 Binary files a/docs/_static/preprocessing/rbf-regr.png and b/docs/_static/preprocessing/rbf-regr.png differ diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md index b27c971cb..3c67ced65 100644 --- a/docs/api/preprocessing.md +++ b/docs/api/preprocessing.md @@ -40,6 +40,11 @@ show_root_full_path: true show_root_heading: true +:::sklego.preprocessing.monotonicspline.MonotonicSplineTransformer + options: + show_root_full_path: true + show_root_heading: true + :::sklego.preprocessing.projections.OrthogonalTransformer options: show_root_full_path: true diff --git a/docs/user-guide/preprocessing.md b/docs/user-guide/preprocessing.md index b84906f75..f9695debd 100644 --- a/docs/user-guide/preprocessing.md +++ b/docs/user-guide/preprocessing.md @@ -199,6 +199,40 @@ Note that you can make this approach even more powerful for timeseries by choosi To explore this idea we've also implemented a `DecayEstimator`. 
For more information see the [section on meta estimators][decay-section] for this. +## Monotonic Spline Transformer + +The `MonotonicSplineTransformer` is a transformer that fits a monotonic spline to the input data. This can be useful when you want to capture non-linear relationships between features and the target variable, while ensuring that the relationship is monotonic. The technique is based on [_Fitting monotonic curves using splines_ blogpost by Mate Kadlicsko](https://matekadlicsko.github.io/posts/monotonic-splines/). + +To demonstrate how this works let's first generate some data. + +```py +--8<-- "docs/_scripts/preprocessing.py:monotonic-spline" +``` + + +![monotonic-spline](../_static/preprocessing/monotonic-spline.png) + +Next, let's show what the transformed data looks like. + +```py +--8<-- "docs/_scripts/preprocessing.py:monotonic-spline-transform" +``` + +![monotonic-spline-transform](../_static/preprocessing/monotonic-spline-transform.png) + +Finally, let's show how these features might compare with an isotonic regression. + +```py +--8<-- "docs/_scripts/preprocessing.py:monotonic-spline-regr" +``` + +![monotonic-spline-regr](../_static/preprocessing/monotonic-spline-regr.png) + +While the `IsotonicRegression` gives a similar result, there are a few reasons why the monotonic spline might be preferred: + +1. The monotonic model can result in a smoother model when followed up by a linear model. The linear model can still guarantee monotonicity, but the `IsotonicRegression` might result in a spiky output. +2. When datasets get big, especially when there are many features involved, the monotonic spline might be faster to compute. This is because the `IsotonicRegression` demands a more complex solver that might not scale as well as a linear model. + ## Interval Encoders Sometimes a linear regression doesn't entirely do what you'd like. 
Take this pattern; diff --git a/pyproject.toml b/pyproject.toml index e01370de0..ce7e06e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,7 +102,7 @@ line-length = 120 exclude = ["docs"] [tool.ruff.lint] -extend-select = ["I"] +extend-select = ["I", "T201"] ignore = [ "E731", # do not assign a `lambda` expression, use a `def` ] diff --git a/readme.md b/readme.md index 21bfe1eed..824d2108e 100644 --- a/readme.md +++ b/readme.md @@ -125,6 +125,7 @@ Here's a list of features that this library currently offers: - `sklego.preprocessing.RepeatingBasisFunction` repeating feature engineering, useful for timeseries - `sklego.preprocessing.DictMapper` assign numeric values on categorical columns - `sklego.preprocessing.OutlierRemover` experimental method to remove outliers during training +- `sklego.preprocessing.MonotonicSplineTransformer` re-uses `SplineTransformer` in an attempt to make monotonic features - `sklego.model_selection.GroupTimeSeriesSplit` timeseries Kfold for groups with different amount of observations per group - `sklego.model_selection.KlusterFoldValidation` experimental feature that does K folds based on clustering - `sklego.model_selection.TimeGapSplit` timeseries Kfold with a gap between train/test diff --git a/sklego/preprocessing/__init__.py b/sklego/preprocessing/__init__.py index bd068a397..3dbcacc6c 100644 --- a/sklego/preprocessing/__init__.py +++ b/sklego/preprocessing/__init__.py @@ -13,6 +13,7 @@ "TypeSelector", "RandomAdder", "RepeatingBasisFunction", + "MonotonicSplineTransformer", ] from sklego.preprocessing.columncapper import ColumnCapper @@ -20,6 +21,7 @@ from sklego.preprocessing.formulaictransformer import FormulaicTransformer from sklego.preprocessing.identitytransformer import IdentityTransformer from sklego.preprocessing.intervalencoder import IntervalEncoder +from sklego.preprocessing.monotonicspline import MonotonicSplineTransformer from sklego.preprocessing.outlier_remover import OutlierRemover from 
class MonotonicSplineTransformer(TransformerMixin, BaseEstimator):
    """The `MonotonicSplineTransformer` integrates the output of the `SplineTransformer` in an attempt to make
    monotonic features.

    This estimator is heavily inspired by [this blogpost](https://matekadlicsko.github.io/posts/monotonic-splines/)
    by Mate Kadlicsko.

    Parameters
    ----------
    n_knots : int, default=3
        The number of knots to use in the spline transformation.
    degree : int, default=3
        The polynomial degree to use in the spline transformation.
    knots : Literal['uniform', 'quantile'], default="uniform"
        Knots argument of spline transformer.

    Attributes
    ----------
    spline_transformer_ : dict[int, SplineTransformer]
        One fitted `SplineTransformer` per input column, keyed by the column index.
    n_features_in_ : int
        The number of features seen in the training data.
    """

    def __init__(self, n_knots=3, degree=3, knots="uniform"):
        self.n_knots = n_knots
        self.degree = degree
        self.knots = knots

    def fit(self, X, y=None):
        """Fit the `MonotonicSplineTransformer` by fitting one `SplineTransformer` per column of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to fit the per-column spline transformers.
        y : array-like of shape (n_samples,), default=None
            Ignored, present for compatibility.

        Returns
        -------
        self : MonotonicSplineTransformer
            The fitted transformer.

        Raises
        ------
        ValueError
            If `X` contains non-numeric columns or non-finite values (NaN / inf).
        """
        # Non-finite values are rejected here, up front, instead of being passed on to the inner
        # SplineTransformer. X is only read, so no defensive copy is needed.
        X = check_array(X, dtype=FLOAT_DTYPES, estimator=self)

        # Every column gets its own SplineTransformer, fitted on that column alone.
        self.spline_transformer_ = {
            col: SplineTransformer(n_knots=self.n_knots, degree=self.degree, knots=self.knots).fit(X[:, [col]])
            for col in range(X.shape[1])
        }
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Performs the Ispline transformation on `X`.

        Each column of `X` is passed through its own fitted `SplineTransformer`; the resulting basis columns are
        reversed and cumulatively summed, and the per-column results are concatenated side by side.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_out)
            Transformed `X` values.

        Raises
        ------
        ValueError
            If the number of columns from `X` differs from the number of columns when fitting, or if `X` contains
            non-numeric columns or non-finite values (NaN / inf).
        """
        check_is_fitted(self, "spline_transformer_")
        X = check_array(X, dtype=FLOAT_DTYPES, estimator=self)
        if X.shape[1] != self.n_features_in_:
            raise ValueError("Number of features going into .transform() do not match number going into .fit().")

        # Reverse the spline basis columns, then accumulate along the feature axis: output column k is the sum of
        # the last k + 1 basis functions of that input column.
        out = [
            np.cumsum(self.spline_transformer_[col].transform(X[:, [col]])[:, ::-1], axis=1)
            for col in range(X.shape[1])
        ]
        return np.concatenate(out, axis=1)
@parametrize_with_checks([MonotonicSplineTransformer()])
def test_sklearn_compatible_estimator(estimator, check):
    """Run scikit-learn's estimator checks against a default-constructed transformer."""
    check(estimator)


@pytest.mark.parametrize("n_knots", [3, 5])
@pytest.mark.parametrize("degree", [3, 5])
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
def test_monotonic_spline_transformer(n_knots, degree, knots):
    """Check output shape, [0, 1] bounds and monotonicity of the transformed features."""
    # Seeded generator so that a failing parametrization can be reproduced exactly.
    rng = np.random.default_rng(42)
    X = rng.uniform(size=(100, 10))

    transformer = MonotonicSplineTransformer(n_knots=n_knots, degree=degree, knots=knots)
    transformer_sk = SplineTransformer(n_knots=n_knots, degree=degree, knots=knots)
    transformer.fit(X)
    transformer_sk.fit(X)
    out = transformer.transform(X)
    out_sk = transformer_sk.transform(X)

    # Both should have the same shape
    assert out.shape == out_sk.shape

    n_splines_per_feature = n_knots + degree - 1
    assert out.shape[1] == X.shape[1] * n_splines_per_feature

    # I splines should be bounded by 0 and 1
    assert np.logical_or(out >= 0, np.isclose(out, 0)).all()
    assert np.logical_or(out <= 1, np.isclose(out, 1)).all()

    # The features should be monotonically increasing
    for i in range(X.shape[1]):
        feature = X[:, i]
        # Rows ordered by the input feature, restricted to the output columns generated from that feature.
        sorted_out = out[np.argsort(feature), i * n_splines_per_feature : (i + 1) * n_splines_per_feature]
        differences = np.diff(sorted_out, axis=0)

        # All differences should be greater or equal to zero up to floating point errors
        assert np.logical_or(np.greater_equal(differences, 0), np.isclose(differences, 0)).all()