Skip to content

Commit

Permalink
Make RegressionOutlier dataframe-agnostic (#665)
Browse files (browse the repository at this point in the history)
* make regression outlier df-agnostic

* need to use eager-only for this one

* pass native to check_array

* remove cudf, link to check_X_y
  • Loading branch information
MarcoGorelli authored May 11, 2024
1 parent 28c102b commit 94cf506
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 9 deletions.
29 changes: 24 additions & 5 deletions sklego/meta/regression_outlier_detector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import narwhals as nw
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.utils.validation import check_array, check_is_fitted

Expand All @@ -11,8 +11,11 @@ class RegressionOutlierDetector(BaseEstimator, OutlierMixin):
----------
model : scikit-learn compatible regression model
A regression model that will be used for prediction.
column : int
The index of the target column to predict in the input data.
column : int | str
This should be:
- The index of the target column to predict in the input data, when the input is an array.
- The name of the target column to predict in the input data, when the input is a dataframe.
lower : float, default=2.0
Lower threshold for outlier detection. The method used for detection depends on the `method` parameter.
upper : float, default=2.0
Expand All @@ -32,6 +35,21 @@ class RegressionOutlierDetector(BaseEstimator, OutlierMixin):
The standard deviation of the differences between true and predicted values.
idx_ : int
The index of the target column in the input data.
Notes
-----
Native cross-dataframe support is achieved using
[Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
Supported dataframes are:
- pandas
- Polars (eager)
- Modin
See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list
(and to learn how you can add your dataframe library to it!), though note that only those
supported by [sklearn.utils.check_X_y](https://scikit-learn.org/stable/modules/generated/sklearn.utils.check_X_y.html)
will work with this class.
"""

def __init__(self, model, column, lower=2, upper=2, method="sd"):
Expand Down Expand Up @@ -112,8 +130,9 @@ def fit(self, X, y=None):
ValueError
If the `model` is not a regression estimator.
"""
self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, pd.DataFrame) else self.column
X = check_array(X, estimator=self)
X = nw.from_native(X, eager_only=True, strict=False)
self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, nw.DataFrame) else self.column
X = check_array(nw.to_native(X, strict=False), estimator=self)
if not self._is_regression_model():
raise ValueError("Passed model must be regression!")
X, y = self.to_x_y(X)
Expand Down
11 changes: 7 additions & 4 deletions tests/test_meta/test_regression_outlier.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
import polars as pl
import pytest
from sklearn.linear_model import LinearRegression, LogisticRegression

Expand Down Expand Up @@ -42,14 +43,15 @@ def test_obvious_example():
assert preds[i] == -1


def test_obvious_example_pandas():
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_obvious_example_dataframe(frame_func):
# generate random data for illustrative example
np.random.seed(42)
x = np.random.normal(0, 1, 100)
y = 1 + x + np.random.normal(0, 0.2, 100)
for i in [20, 25, 50, 80]:
y[i] += 2
X = pd.DataFrame({"x": x, "y": y})
X = frame_func({"x": x, "y": y})

# fit and plot
mod = RegressionOutlierDetector(LinearRegression(), column="y")
Expand All @@ -58,14 +60,15 @@ def test_obvious_example_pandas():
assert preds[i] == -1


def test_raises_error():
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_raises_error(frame_func):
# generate random data for illustrative example
np.random.seed(42)
x = np.random.normal(0, 1, 100)
y = 1 + x + np.random.normal(0, 0.2, 100)
for i in [20, 25, 50, 80]:
y[i] += 2
X = pd.DataFrame({"x": x, "y": y})
X = frame_func({"x": x, "y": y})

with pytest.raises(ValueError):
mod = RegressionOutlierDetector(LogisticRegression(), column="y")
Expand Down

0 comments on commit 94cf506

Please sign in to comment.