MonotonicSplineTransformer #709

Merged: 21 commits, Nov 12, 2024
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -13,7 +13,6 @@ jobs:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: [
-         "3.8",
          "3.9",
          "3.10",
          "3.11",
63 changes: 63 additions & 0 deletions docs/_scripts/preprocessing.py
@@ -325,3 +325,66 @@ def generate_dataset(start, n=600):

plt.savefig(_static_path / "monotonic-3.png")
plt.clf()


######################################## Monotonic Spline #######################################

# --8<-- [start:monotonic-spline]
import matplotlib.pyplot as plt
import numpy as np

N = 1000

np.random.seed(42)
X = np.random.uniform(0, 1, size=N)
y = X ** 2 + .1 * np.sin(20 * X) + 1 + .1 * np.random.randn(N)

plt.figure(figsize=(10, 4))
plt.scatter(X, y)
# --8<-- [end:monotonic-spline]

plt.savefig(_static_path / "monotonic-spline.png")
plt.clf()

# --8<-- [start:monotonic-spline-transform]
from sklego.preprocessing import MonotonicSplineTransformer

X_plt = np.sort(X)

plt.figure(figsize=(10, 4))
tfm = MonotonicSplineTransformer(n_knots=10)
X_out = tfm.fit_transform(X_plt.reshape(-1, 1))
plt.plot(X_out);
# --8<-- [end:monotonic-spline-transform]

plt.savefig(_static_path / "monotonic-spline-transform.png")
plt.clf()

# --8<-- [start:monotonic-spline-regr]
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.isotonic import IsotonicRegression
from sklego.preprocessing import MonotonicSplineTransformer

pipe = make_pipeline(
    MonotonicSplineTransformer(n_knots=10),
    Ridge(positive=True),
)
pipe.fit(X.reshape(-1, 1), y)

iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(X, y)

X_test = np.linspace(-0.2, 1.2, 100)[:, None]

plt.figure(figsize=(10, 4))
plt.plot(X_test[:, 0], pipe.predict(X_test), color="orange", linewidth=2, label="MonotonicSpline")
plt.plot(
    X_test[:, 0], iso.predict(X_test), color="green", linewidth=2, label="Isotonic"
)
plt.scatter(X, y, alpha=0.3, label="Data")
plt.legend()
# --8<-- [end:monotonic-spline-regr]

plt.savefig(_static_path / "monotonic-spline-regr.png")
plt.clf()
Binary file modified docs/_static/preprocessing/interval-encoder-1.png
Binary file modified docs/_static/preprocessing/interval-encoder-2.png
Binary file modified docs/_static/preprocessing/interval-encoder-3.png
Binary file modified docs/_static/preprocessing/monotonic-2.png
Binary file modified docs/_static/preprocessing/monotonic-3.png
Binary file added docs/_static/preprocessing/monotonic-spline-regr.png
Binary file added docs/_static/preprocessing/monotonic-spline-transform.png
Binary file added docs/_static/preprocessing/monotonic-spline.png
Binary file modified docs/_static/preprocessing/rbf-data.png
Binary file modified docs/_static/preprocessing/rbf-plot.png
Binary file modified docs/_static/preprocessing/rbf-regr.png
5 changes: 5 additions & 0 deletions docs/api/preprocessing.md
@@ -40,6 +40,11 @@
        show_root_full_path: true
        show_root_heading: true

:::sklego.preprocessing.monotonicspline.MonotonicSplineTransformer
    options:
        show_root_full_path: true
        show_root_heading: true

:::sklego.preprocessing.projections.OrthogonalTransformer
    options:
        show_root_full_path: true
34 changes: 34 additions & 0 deletions docs/user-guide/preprocessing.md
@@ -199,6 +199,40 @@ Note that you can make this approach even more powerful for timeseries by choosi

To explore this idea we've also implemented a `DecayEstimator`. For more information, see the [section on meta estimators][decay-section].

## Monotonic Spline Transformer

The `MonotonicSplineTransformer` transforms each input feature into a set of monotonically increasing spline basis features (so-called I-splines). This can be useful when you want to capture non-linear relationships between features and the target variable while ensuring that the relationship stays monotonic. The technique is based on the blogpost [_Fitting monotonic curves using splines_](https://matekadlicsko.github.io/posts/monotonic-splines/) by Mate Kadlicsko.

To demonstrate how this works, let's first generate some data.

```py
--8<-- "docs/_scripts/preprocessing.py:monotonic-spline"
```


![monotonic-spline](../_static/preprocessing/monotonic-spline.png)

Next, let's show what the transformed data looks like.

```py
--8<-- "docs/_scripts/preprocessing.py:monotonic-spline-transform"
```

![monotonic-spline-transform](../_static/preprocessing/monotonic-spline-transform.png)
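Under the hood, the transformer reuses scikit-learn's `SplineTransformer` and takes cumulative sums over the reversed B-spline basis columns, which is what makes every output feature non-decreasing and bounded between 0 and 1. Here is a minimal sketch of that equivalence, reusing the `X_plt` and `X_out` arrays from the snippet above and matching its settings (`n_knots=10`, with the defaults `degree=3` and `knots="uniform"`):

```py
import numpy as np
from sklearn.preprocessing import SplineTransformer

# Plain B-spline basis with the same settings as the transformer above.
basis = SplineTransformer(n_knots=10, degree=3, knots="uniform").fit_transform(X_plt.reshape(-1, 1))

# Reversed cumulative sums of the B-spline columns give the I-spline features.
manual = np.cumsum(basis[:, ::-1], axis=1)
print(np.allclose(manual, X_out))  # expected: True
```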

Finally, let's show how these features might compare with an isotonic regression.

```py
--8<-- "docs/_scripts/preprocessing.py:monotonic-spline-regr"
```

![monotonic-spline-regr](../_static/preprocessing/monotonic-spline-regr.png)

While `IsotonicRegression` gives a similar result, there are a few reasons why the monotonic spline features might be preferred:

1. The spline features can result in a smoother model when followed up by a linear model. With non-negative coefficients (here `Ridge(positive=True)`) the pipeline still guarantees a monotonic prediction, as the quick check after this list illustrates, whereas `IsotonicRegression` can produce a spiky, step-like output.
2. When datasets get big, especially when many features are involved, the monotonic spline features might be faster to compute. This is because `IsotonicRegression` demands a more complex solver that might not scale as well as a linear model.
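
As a quick sanity check of the first point, the predictions of the fitted pipeline can be verified to be non-decreasing on a sorted grid. A minimal sketch, reusing the `pipe` object from above (the grid and tolerance are illustrative choices):

```py
import numpy as np

# Every I-spline feature is non-decreasing in the input and the Ridge
# coefficients are constrained to be non-negative, so the prediction
# itself must be non-decreasing.
grid = np.linspace(0, 1, 500)[:, None]
preds = pipe.predict(grid)
assert (np.diff(preds) >= -1e-9).all()
```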

## Interval Encoders

Sometimes a linear regression doesn't entirely do what you'd like. Take this pattern:
1 change: 1 addition & 0 deletions readme.md
@@ -125,6 +125,7 @@ Here's a list of features that this library currently offers:
- `sklego.preprocessing.RepeatingBasisFunction` repeating feature engineering, useful for timeseries
- `sklego.preprocessing.DictMapper` assign numeric values on categorical columns
- `sklego.preprocessing.OutlierRemover` experimental method to remove outliers during training
- `sklego.preprocessing.MonotonicSplineTransformer` re-uses `SplineTransformer` in an attempt to make monotonic features
- `sklego.model_selection.GroupTimeSeriesSplit` timeseries Kfold for groups with different amounts of observations per group
- `sklego.model_selection.KlusterFoldValidation` experimental feature that does K folds based on clustering
- `sklego.model_selection.TimeGapSplit` timeseries Kfold with a gap between train/test
2 changes: 2 additions & 0 deletions sklego/preprocessing/__init__.py
@@ -13,13 +13,15 @@
    "TypeSelector",
    "RandomAdder",
    "RepeatingBasisFunction",
    "MonotonicSplineTransformer",
]

from sklego.preprocessing.columncapper import ColumnCapper
from sklego.preprocessing.dictmapper import DictMapper
from sklego.preprocessing.formulaictransformer import FormulaicTransformer
from sklego.preprocessing.identitytransformer import IdentityTransformer
from sklego.preprocessing.intervalencoder import IntervalEncoder
from sklego.preprocessing.monotonicspline import MonotonicSplineTransformer
from sklego.preprocessing.outlier_remover import OutlierRemover
from sklego.preprocessing.pandastransformers import ColumnDropper, ColumnSelector, PandasTypeSelector, TypeSelector
from sklego.preprocessing.projections import InformationFilter, OrthogonalTransformer
100 changes: 100 additions & 0 deletions sklego/preprocessing/monotonicspline.py
@@ -0,0 +1,100 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import SplineTransformer
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted


class MonotonicSplineTransformer(TransformerMixin, BaseEstimator):
    """The `MonotonicSplineTransformer` integrates the output of the `SplineTransformer` in an attempt to make monotonic features.

    This estimator is heavily inspired by [this blogpost](https://matekadlicsko.github.io/posts/monotonic-splines/) by Mate Kadlicsko.

    Parameters
    ----------
    n_knots : int, default=3
        The number of knots to use in the spline transformation.
    degree : int, default=3
        The polynomial degree of the spline basis.
    knots : str, default="uniform"
        The knot placement strategy, passed on to `SplineTransformer`, e.g. "uniform" or "quantile".

    Attributes
    ----------
    spline_transformer_ : dict
        A fitted `SplineTransformer` per input column.
    n_features_in_ : int
        The number of features seen in the training data.
    """

    def __init__(self, n_knots=3, degree=3, knots="uniform"):
        self.n_knots = n_knots
        self.degree = degree
        self.knots = knots

    def fit(self, X, y=None):
        """Fit the `MonotonicSplineTransformer` transformer by computing the spline transformation of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.
        y : array-like of shape (n_samples,), default=None
            Ignored, present for compatibility.

        Returns
        -------
        self : MonotonicSplineTransformer
            The fitted transformer.

        Raises
        ------
        ValueError
            If `X` contains non-numeric columns.
        """
        X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self)

        # Fit one SplineTransformer per column so that each feature gets its own knots.
        self.spline_transformer_ = {
            col: SplineTransformer(n_knots=self.n_knots, degree=self.degree, knots=self.knots).fit(
                X[:, col].reshape(-1, 1)
            )
            for col in range(X.shape[1])
        }
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Performs the I-spline transformation on `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        X : np.ndarray of shape (n_samples, n_out)
            Transformed `X` values.

        Raises
        ------
        ValueError
            If the number of columns from `X` differs from the number of columns when fitting.
        """
        check_is_fitted(self, "spline_transformer_")
        X = check_array(
            X,
            force_all_finite=False,
            dtype=FLOAT_DTYPES,
            estimator=self,
        )
        if X.shape[1] != self.n_features_in_:
            raise ValueError("Number of features going into .transform() do not match number going into .fit().")

        out = []
        for col in range(X.shape[1]):
            out.append(
                np.cumsum(
                    self.spline_transformer_[col].transform(X[:, [col]])[:, ::-1],
                    axis=1,
                )
            )
        return np.concatenate(out, axis=1)

Review discussion on the `[:, ::-1]` inversion above:

> **Collaborator:** I may need to do a bit more reading on why there is an inversion here.
>
> **Contributor:** The direction in which you perform the cumsum on the columns will determine whether the output splines will be increasing or decreasing. However, now that you mention it, I'm pretty sure that `1 - np.cumsum(...)` (without the inversion) would achieve the same effect.
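A small standalone sketch of that remark (illustrative only; the data, seed, and settings below are not part of the PR): both constructions yield non-decreasing columns, and inside the training range they agree up to column order and one constant column.

```py
import numpy as np
from sklearn.preprocessing import SplineTransformer

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(0, 1, size=200)).reshape(-1, 1)
basis = SplineTransformer(n_knots=10, degree=3).fit_transform(x)

# Construction used in this PR: cumulative sum over reversed columns.
ispline_rev = np.cumsum(basis[:, ::-1], axis=1)
# Alternative from the comment: 1 - cumulative sum, without reversing.
ispline_alt = 1 - np.cumsum(basis, axis=1)

# Both are non-decreasing column-wise (x is sorted; tolerance covers float noise).
assert (np.diff(ispline_rev, axis=0) >= -1e-12).all()
assert (np.diff(ispline_alt, axis=0) >= -1e-12).all()

# Inside the training range they agree up to column order and one constant
# column (an all-ones column in the first, an all-zeros column in the second).
print(np.allclose(ispline_rev[:, ::-1][:, 1:], ispline_alt[:, :-1]))  # expected: True
```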
44 changes: 44 additions & 0 deletions tests/test_preprocessing/test_monospline.py
@@ -0,0 +1,44 @@
import numpy as np
import pytest
from sklearn.preprocessing import SplineTransformer
from sklearn.utils.estimator_checks import parametrize_with_checks

from sklego.preprocessing import MonotonicSplineTransformer


@parametrize_with_checks([MonotonicSplineTransformer()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)


@pytest.mark.parametrize("n_knots", [3, 5])
@pytest.mark.parametrize("degree", [3, 5])
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
def test_monotonic_spline_transformer(n_knots, degree, knots):
    X = np.random.uniform(size=(100, 10))
    transformer = MonotonicSplineTransformer(n_knots=n_knots, degree=degree, knots=knots)
    transformer_sk = SplineTransformer(n_knots=n_knots, degree=degree, knots=knots)
    transformer.fit(X)
    transformer_sk.fit(X)
    out = transformer.transform(X)
    out_sk = transformer_sk.transform(X)

    # Both should have the same shape
    assert out.shape == out_sk.shape

    n_splines_per_feature = n_knots + degree - 1
    assert out.shape[1] == X.shape[1] * n_splines_per_feature

    # I-splines should be bounded by 0 and 1
    assert np.logical_or(out >= 0, np.isclose(out, 0)).all()
    assert np.logical_or(out <= 1, np.isclose(out, 1)).all()

    # The features should be monotonically increasing
    for i in range(X.shape[1]):
        feature = X[:, i]
        sorted_out = out[np.argsort(feature), i * n_splines_per_feature : (i + 1) * n_splines_per_feature]
        differences = np.diff(sorted_out, axis=0)

        # All differences should be greater than or equal to zero, up to floating point errors
        assert np.logical_or(np.greater_equal(differences, 0), np.isclose(differences, 0)).all()