From d8a165eb523a0a05b2b6e24cfd6eca28bcd4caac Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:00:23 +0200 Subject: [PATCH] feat: `ZeroInflatedRegressor.score_samples(...)` (#680) * feat: zir score_samples * no predict proba test * warning on calibration --- docs/_scripts/meta-models.py | 18 ++++-- docs/user-guide/meta-models.md | 26 +++++++-- sklego/meta/zero_inflated_regressor.py | 56 +++++++++++++++---- .../test_meta/test_zero_inflated_regressor.py | 45 +++++++++++++++ 4 files changed, 126 insertions(+), 19 deletions(-) diff --git a/docs/_scripts/meta-models.py b/docs/_scripts/meta-models.py index da473ab9c..b7134ebea 100644 --- a/docs/_scripts/meta-models.py +++ b/docs/_scripts/meta-models.py @@ -400,23 +400,31 @@ def false_negatives(mod, x, y): # --8<-- [start:zero-inflated] import numpy as np -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor from sklearn.model_selection import cross_val_score + from sklego.meta import ZeroInflatedRegressor np.random.seed(0) X = np.random.randn(10000, 4) -y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2) # many zeroes here, in about 75% of the cases. 
+y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2) zir = ZeroInflatedRegressor( - classifier=RandomForestClassifier(random_state=0), - regressor=RandomForestRegressor(random_state=0) + classifier=ExtraTreesClassifier(random_state=0, max_depth=10), + regressor=ExtraTreesRegressor(random_state=0) ) print("ZIR (RFC+RFR) r²:", cross_val_score(zir, X, y).mean()) -print("RFR r²:", cross_val_score(RandomForestRegressor(random_state=0), X, y).mean()) +print("RFR r²:", cross_val_score(ExtraTreesRegressor(random_state=0), X, y).mean()) # --8<-- [end:zero-inflated] + +# --8<-- [start:zero-inflated-score-samples] +_ = zir.fit(X, y) +print(f"Predict={zir.predict(X[:5]).round(2)}") +print(f"Scores={zir.score_samples(X[:5]).round(2)}") +# --8<-- [end:zero-inflated-score-samples] + # --8<-- [start:outlier-classifier] import numpy as np from sklego.meta.outlier_classifier import OutlierClassifier diff --git a/docs/user-guide/meta-models.md b/docs/user-guide/meta-models.md index 08d4c032c..9965c0bdd 100644 --- a/docs/user-guide/meta-models.md +++ b/docs/user-guide/meta-models.md @@ -374,8 +374,8 @@ Sure, you can get regions where you are close to zero, but modelling an output o What we can do circumvent these problems is the following: -1. Train a classifier to tell us whether the target is zero, or not. -2. Train a regressor on all samples with a non-zero target. +1. Train a **classifier** to tell us whether the target is zero, or not. +2. Train a **regressor** on all samples with a non-zero target. By putting these two together in an obvious way, we get the [`ZeroInflatedRegressor`][zero-inflated-api]. 
You can use it like this: @@ -384,8 +384,26 @@ By putting these two together in an obvious way, we get the [`ZeroInflatedRegres ``` ```console -ZIR (RFC+RFR) r²: 0.8992404366385873 -RFR r²: 0.8516522752031502 +ZIR (RFC+RFR) r²: 0.8579468997736154 +RFR r²: 0.7691291933110612 +``` + +If the underlying classifier is able to predict the _probability_ of a sample to be zero (i.e. it implements a `predict_proba` method), then the `ZeroInflatedRegressor` can be used to predict the probability of a sample being non-zero _times_ the expected value of such sample. + +This quantity is sometimes called _risk estimate_ or _expected impact_, however, to adhere to scikit-learn convention, we made it accessible via the `score_samples` method. + +!!! warning "About `predict_proba`" + The `predict_proba` method of the classifier does not always return actual probabilities. + + For this reason if you want to use the `score_samples` method, it is recommended to train with a classifier wrapped by the [`CalibratedClassifierCV`][calibrated-classifier-api] class from scikit-learn to calibrate the probabilities. + +```py title="score_samples" +--8<-- "docs/_scripts/meta-models.py:zero-inflated-score-samples" +``` + +```console +Predict=[4.91 0. 0. 0.05 0. ] +Scores=[3.73 0. 
0.11 0.03 0.06] ``` ## Outlier Classifier diff --git a/sklego/meta/zero_inflated_regressor.py b/sklego/meta/zero_inflated_regressor.py index e78abb305..9a15edb02 100644 --- a/sklego/meta/zero_inflated_regressor.py +++ b/sklego/meta/zero_inflated_regressor.py @@ -4,6 +4,7 @@ import numpy as np from sklearn.base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone, is_classifier, is_regressor from sklearn.exceptions import NotFittedError +from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y @@ -12,9 +13,9 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin): `ZeroInflatedRegressor` consists of a classifier and a regressor. - - The classifier's task is to find of if the target is zero or not. - - The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the - there should be a non-zero prediction. + - The classifier's task is to find of if the target is zero or not. + - The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the + there should be a non-zero prediction. The regressor is only trained on examples where the target is non-zero, which makes it easier for it to focus. @@ -46,17 +47,17 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin): np.random.seed(0) X = np.random.randn(10000, 4) y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2) + model = ZeroInflatedRegressor( - classifier=ExtraTreesClassifier(random_state=0), + classifier=ExtraTreesClassifier(random_state=0, max_depth=10), regressor=ExtraTreesRegressor(random_state=0) - ) - - model.fit(X, y) - # ZeroInflatedRegressor(classifier=ExtraTreesClassifier(random_state=0), - # regressor=ExtraTreesRegressor(random_state=0)) + ).fit(X, y) - model.predict(X)[:5] + model.predict(X[:5]) # array([4.91483294, 0. , 0. , 0.04941909, 0. 
]) + + model.score_samples(X[:5]).round(2) + # array([3.73, 0. , 0.11, 0.03, 0.06]) ``` """ @@ -165,3 +166,38 @@ def predict(self, X): output[non_zero_indices] = self.regressor_.predict(X[non_zero_indices]) return output + + @available_if(lambda self: hasattr(self.classifier_, "predict_proba")) + def score_samples(self, X): + r"""Predict risk estimate of `X` as the probability of `X` to not be zero times the expected value of `X`: + + $$\text{score_sample(X)} = (1-P(X=0)) \cdot E[X]$$ + + where: + + - $P(X=0)$ is calculated using the `.predict_proba()` method of the underlying classifier. + - $E[X]$ is the regressor prediction on `X`. + + !!! info + + This method requires the underlying classifier to implement `.predict_proba()` method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to predict. + + Returns + ------- + array-like of shape (n_samples,) + The predicted risk. + """ + + check_is_fitted(self) + X = check_array(X) + self._check_n_features(X, reset=True) + + non_zero_proba = self.classifier_.predict_proba(X)[:, 1] + expected_impact = self.regressor_.predict(X) + + return non_zero_proba * expected_impact diff --git a/tests/test_meta/test_zero_inflated_regressor.py b/tests/test_meta/test_zero_inflated_regressor.py index 3792fe9ac..14b9b7a65 100644 --- a/tests/test_meta/test_zero_inflated_regressor.py +++ b/tests/test_meta/test_zero_inflated_regressor.py @@ -3,6 +3,7 @@ import numpy as np import pytest from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor +from sklearn.linear_model import RidgeClassifier from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.utils.estimator_checks import parametrize_with_checks @@ -81,3 +82,47 @@ def test_wrong_estimators_exceptions(): with pytest.raises(ValueError, match="`regressor` has to be a regressor."): zir = ZeroInflatedRegressor(ExtraTreesClassifier(), ExtraTreesClassifier()) zir.fit(X, y) + + +def approx_lte(x, y): + return ((x 
<= y) | np.isclose(x, y)).all() + + +def approx_gte(x, y): + return ((x >= y) | np.isclose(x, y)).all() + + +def test_score_samples(): + np.random.seed(0) + X = np.random.randn(1_000, 4) + y = ((X[:, 0] > 0) & (X[:, 1] > 0)) * np.abs(X[:, 2] * X[:, 3] ** 2) + + zir = ZeroInflatedRegressor( + classifier=ExtraTreesClassifier(max_depth=20, random_state=0, n_jobs=-1), + regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1), + ).fit(X, y) + + scores = zir.score_samples(X) + preds = zir.predict(X) + + pred_is_non_zero = zir.classifier_.predict(X) + + # Where the classifier prediction is non-zero, then the value is multiplied by something less than 1. + assert approx_lte(scores[pred_is_non_zero], preds[pred_is_non_zero]) + # Where the classifier prediction is zero, then the score is greater than or equal to the (zero) prediction. + assert approx_gte(scores[~pred_is_non_zero], preds[~pred_is_non_zero]) + +def test_no_predict_proba(): + + np.random.seed(0) + X = np.random.randn(1_000, 4) + y = ((X[:, 0] > 0) & (X[:, 1] > 0)) * np.abs(X[:, 2] * X[:, 3] ** 2) + + zir = ZeroInflatedRegressor( + classifier=RidgeClassifier(), + regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1), + ).fit(X, y) + + with pytest.raises(AttributeError, match="This 'ZeroInflatedRegressor' has no attribute 'score_samples'"): + zir.score_samples(X) +