Skip to content

Commit

Permalink
feat: ZeroInflatedRegressor.score_samples(...) (#680)
Browse files Browse the repository at this point in the history
* feat: zir score_samples

* no predict proba test

* warning on calibration
  • Loading branch information
FBruzzesi authored Jul 8, 2024
1 parent 439c9c0 commit d8a165e
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 19 deletions.
18 changes: 13 additions & 5 deletions docs/_scripts/meta-models.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,23 +400,31 @@ def false_negatives(mod, x, y):

# --8<-- [start:zero-inflated]
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

from sklego.meta import ZeroInflatedRegressor

np.random.seed(0)
X = np.random.randn(10000, 4)
# Target is zero unless both of the first two features are positive,
# so roughly 75% of the samples have y == 0 (the "zero-inflated" setting).
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)

zir = ZeroInflatedRegressor(
    classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
    regressor=ExtraTreesRegressor(random_state=0)
)

# Compare the meta-model against a plain regressor on the same data.
print("ZIR (RFC+RFR) r²:", cross_val_score(zir, X, y).mean())
print("RFR r²:", cross_val_score(ExtraTreesRegressor(random_state=0), X, y).mean())
# --8<-- [end:zero-inflated]


# --8<-- [start:zero-inflated-score-samples]
_ = zir.fit(X, y)
print(f"Predict={zir.predict(X[:5]).round(2)}")
print(f"Scores={zir.score_samples(X[:5]).round(2)}")
# --8<-- [end:zero-inflated-score-samples]

# --8<-- [start:outlier-classifier]
import numpy as np
from sklego.meta.outlier_classifier import OutlierClassifier
Expand Down
26 changes: 22 additions & 4 deletions docs/user-guide/meta-models.md
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,8 @@ Sure, you can get regions where you are close to zero, but modelling an output o

What we can do to circumvent these problems is the following:

1. Train a classifier to tell us whether the target is zero, or not.
2. Train a regressor on all samples with a non-zero target.
1. Train a **classifier** to tell us whether the target is zero, or not.
2. Train a **regressor** on all samples with a non-zero target.

By putting these two together in an obvious way, we get the [`ZeroInflatedRegressor`][zero-inflated-api]. You can use it like this:

Expand All @@ -384,8 +384,26 @@ By putting these two together in an obvious way, we get the [`ZeroInflatedRegres
```

```console
ZIR (RFC+RFR) r²: 0.8992404366385873
RFR r²: 0.8516522752031502
ZIR (RFC+RFR) r²: 0.8579468997736154
RFR r²: 0.7691291933110612
```

If the underlying classifier is able to predict the _probability_ of a sample to be zero (i.e. it implements a `predict_proba` method), then the `ZeroInflatedRegressor` can be used to predict the probability of a sample being non-zero _times_ the expected value of such sample.

This quantity is sometimes called _risk estimate_ or _expected impact_, however, to adhere to scikit-learn convention, we made it accessible via the `score_samples` method.

!!! warning "About `predict_proba`"
The `predict_proba` method of the classifier does not always return actual probabilities.

For this reason, if you want to use the `score_samples` method, it is recommended to wrap the classifier with the [`CalibratedClassifierCV`][calibrated-classifier-api] class from scikit-learn to calibrate the probabilities.

```py title="score_samples"
--8<-- "docs/_scripts/meta-models.py:zero-inflated-score-samples"
```

```console
Predict=[4.91 0. 0. 0.05 0. ]
Scores=[3.73 0. 0.11 0.03 0.06]
```

## Outlier Classifier
Expand Down
56 changes: 46 additions & 10 deletions sklego/meta/zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone, is_classifier, is_regressor
from sklearn.exceptions import NotFittedError
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y


Expand All @@ -12,9 +13,9 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):
`ZeroInflatedRegressor` consists of a classifier and a regressor.
- The classifier's task is to find of if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the
there should be a non-zero prediction.
- The classifier's task is to find of if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the
there should be a non-zero prediction.
The regressor is only trained on examples where the target is non-zero, which makes it easier for it to focus.
Expand Down Expand Up @@ -46,17 +47,17 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):
np.random.seed(0)
X = np.random.randn(10000, 4)
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)
model = ZeroInflatedRegressor(
classifier=ExtraTreesClassifier(random_state=0),
classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
regressor=ExtraTreesRegressor(random_state=0)
)
model.fit(X, y)
# ZeroInflatedRegressor(classifier=ExtraTreesClassifier(random_state=0),
# regressor=ExtraTreesRegressor(random_state=0))
).fit(X, y)
model.predict(X)[:5]
model.predict(X[:5])
# array([4.91483294, 0. , 0. , 0.04941909, 0. ])
model.score_samples(X[:5]).round(2)
# array([3.73, 0. , 0.11, 0.03, 0.06])
```
"""

Expand Down Expand Up @@ -165,3 +166,38 @@ def predict(self, X):
output[non_zero_indices] = self.regressor_.predict(X[non_zero_indices])

return output

@available_if(lambda self: hasattr(self.classifier_, "predict_proba"))
def score_samples(self, X):
    r"""Predict risk estimate of `X` as the probability of `X` to not be zero times the expected value of `X`:

    $$\text{score_sample(X)} = (1-P(X=0)) \cdot E[X]$$

    where:

    - $P(X=0)$ is calculated using the `.predict_proba()` method of the underlying classifier.
    - $E[X]$ is the regressor prediction on `X`.

    !!! info
        This method requires the underlying classifier to implement `.predict_proba()` method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted risk.
    """
    check_is_fitted(self)
    X = check_array(X)
    # Validate the feature count against what was seen during fit.
    # `reset=False` raises on a mismatch; `reset=True` would silently
    # overwrite `n_features_in_` and is only appropriate inside `fit`.
    self._check_n_features(X, reset=False)

    # Probability of the positive (non-zero) class — column 1 of predict_proba;
    # holds because the classifier is fitted on a boolean "is non-zero" target.
    non_zero_proba = self.classifier_.predict_proba(X)[:, 1]
    # Regressor's expected value for every sample.
    expected_impact = self.regressor_.predict(X)

    return non_zero_proba * expected_impact
45 changes: 45 additions & 0 deletions tests/test_meta/test_zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.utils.estimator_checks import parametrize_with_checks

Expand Down Expand Up @@ -81,3 +82,47 @@ def test_wrong_estimators_exceptions():
with pytest.raises(ValueError, match="`regressor` has to be a regressor."):
zir = ZeroInflatedRegressor(ExtraTreesClassifier(), ExtraTreesClassifier())
zir.fit(X, y)


def approx_lte(x, y):
    """Return True when every element of `x` is <= `y`, allowing float round-off."""
    within_tolerance = np.isclose(x, y)
    return np.all((x <= y) | within_tolerance)


def approx_gte(x, y):
    """Return True when every element of `x` is >= `y`, allowing float round-off."""
    within_tolerance = np.isclose(x, y)
    return np.all((x >= y) | within_tolerance)


def test_score_samples():
    """`score_samples` must be bounded by `predict` on both classifier branches."""
    np.random.seed(0)
    X = np.random.randn(1_000, 4)
    y = ((X[:, 0] > 0) & (X[:, 1] > 0)) * np.abs(X[:, 2] * X[:, 3] ** 2)

    model = ZeroInflatedRegressor(
        classifier=ExtraTreesClassifier(max_depth=20, random_state=0, n_jobs=-1),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    )
    model.fit(X, y)

    scores = model.score_samples(X)
    preds = model.predict(X)
    is_pred_non_zero = model.classifier_.predict(X)

    # Non-zero branch: score = P(non-zero) * prediction, so it never exceeds the prediction.
    assert approx_lte(scores[is_pred_non_zero], preds[is_pred_non_zero])
    # Zero branch: the prediction is 0 while the score can still be positive.
    assert approx_gte(scores[~is_pred_non_zero], preds[~is_pred_non_zero])

def test_no_predict_proba():
    """`score_samples` is unavailable when the classifier lacks `predict_proba`."""
    np.random.seed(0)
    X = np.random.randn(1_000, 4)
    y = ((X[:, 0] > 0) & (X[:, 1] > 0)) * np.abs(X[:, 2] * X[:, 3] ** 2)

    model = ZeroInflatedRegressor(
        classifier=RidgeClassifier(),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    )
    model.fit(X, y)

    expected_msg = "This 'ZeroInflatedRegressor' has no attribute 'score_samples'"
    with pytest.raises(AttributeError, match=expected_msg):
        model.score_samples(X)

0 comments on commit d8a165e

Please sign in to comment.