diff --git a/docs/_scripts/meta-models.py b/docs/_scripts/meta-models.py index b7134ebea..9069c8ed5 100644 --- a/docs/_scripts/meta-models.py +++ b/docs/_scripts/meta-models.py @@ -132,7 +132,7 @@ def plot_model(model): ("datagrab", FeatureUnion([ ("discrete", Pipeline([ ("grab", ColumnSelector("diet")), - ("encode", OneHotEncoder(categories="auto", sparse=False)) + ("encode", OneHotEncoder(categories="auto")) ])), ("continuous", Pipeline([ ("grab", ColumnSelector("time")), @@ -265,8 +265,15 @@ def plot_model(model): mod1 = (GroupedPredictor(DummyRegressor(), groups=["m"]) .fit(df[["m"]], df["yt"])) -mod2 = (GroupedPredictor(DecayEstimator(DummyRegressor(), decay_func="exponential", decay_rate=0.9), groups=["m"]) - .fit(df[["index", "m"]], df["yt"])) +mod2 = (GroupedPredictor( + estimator=DecayEstimator( + model=DummyRegressor(), + decay_func="exponential", + decay_kwargs={"decay_rate": 0.9} + ), + groups=["m"] + ).fit(df[["index", "m"]], df["yt"]) +) plt.figure(figsize=(12, 3)) plt.plot(df["yt"], alpha=0.5); @@ -494,12 +501,16 @@ def false_negatives(mod, x, y): from sklearn.linear_model import LogisticRegression from sklego.meta import OrdinalClassifier -ord_clf = OrdinalClassifier(LogisticRegression(), n_jobs=-1, use_calibration=False) -_ = ord_clf.fit(X, y) -ord_clf.predict_proba(X[0]) +ord_clf = OrdinalClassifier( + LogisticRegression(), + n_jobs=-1, + use_calibration=False, + ).fit(X, y) + +ord_clf.predict_proba(X[:1]) # --8<-- [end:ordinal-classifier] -print(ord_clf.predict_proba(X[0])) +print(ord_clf.predict_proba(X[:1])) # --8<-- [start:ordinal-classifier-with-calibration] from sklearn.calibration import CalibratedClassifierCV diff --git a/docs/_static/meta-models/baseline-model.png b/docs/_static/meta-models/baseline-model.png index 4da4b554b..555f933ca 100644 Binary files a/docs/_static/meta-models/baseline-model.png and b/docs/_static/meta-models/baseline-model.png differ diff --git a/docs/_static/meta-models/confusion-balanced-grid.html b/docs/_static/meta-models/confusion-balanced-grid.html index f79165111..9b4eb4727 100644 --- a/docs/_static/meta-models/confusion-balanced-grid.html +++ b/docs/_static/meta-models/confusion-balanced-grid.html @@ -1,4 +1,408 @@ -
GridSearchCV(cv=5,
+
GridSearchCV(cv=5,
              estimator=ConfusionBalancer(alpha=1.0,
                                          estimator=LogisticRegression(max_iter=1000)),
              n_jobs=-1,
@@ -10,9 +414,9 @@
         2.33333333,  2.46666667,  2.6       ,  2.73333333,  2.86666667,
         3.        ])},
              refit='negatives', return_train_score=True,
-             scoring={'accuracy': make_scorer(accuracy_score),
-                      'negatives': <function false_negatives at 0x7f33dfb60c10>,
-                      'positives': <function false_positives at 0x7f33dfb61fc0>})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ConfusionBalancer(alpha=1.0, estimator=LogisticRegression(max_iter=1000))
LogisticRegression(max_iter=1000)
LogisticRegression(max_iter=1000)
\ No newline at end of file diff --git a/docs/_static/meta-models/confusion-balancer-results.png b/docs/_static/meta-models/confusion-balancer-results.png index 90375303d..5f9d3b0f7 100644 Binary files a/docs/_static/meta-models/confusion-balancer-results.png and b/docs/_static/meta-models/confusion-balancer-results.png differ diff --git a/docs/_static/meta-models/decay-functions.png b/docs/_static/meta-models/decay-functions.png index 383f23818..cdb1da1b4 100644 Binary files a/docs/_static/meta-models/decay-functions.png and b/docs/_static/meta-models/decay-functions.png differ diff --git a/docs/_static/meta-models/decay-model.png b/docs/_static/meta-models/decay-model.png index 0e52f9172..b3b97b3cc 100644 Binary files a/docs/_static/meta-models/decay-model.png and b/docs/_static/meta-models/decay-model.png differ diff --git a/docs/_static/meta-models/grouped-dummy-model.png b/docs/_static/meta-models/grouped-dummy-model.png index 67ccbfd13..cfba93eb1 100644 Binary files a/docs/_static/meta-models/grouped-dummy-model.png and b/docs/_static/meta-models/grouped-dummy-model.png differ diff --git a/docs/_static/meta-models/grouped-model.png b/docs/_static/meta-models/grouped-model.png index 12e82fefc..085977c20 100644 Binary files a/docs/_static/meta-models/grouped-model.png and b/docs/_static/meta-models/grouped-model.png differ diff --git a/docs/_static/meta-models/grouped-transform.png b/docs/_static/meta-models/grouped-transform.png index 2fe836570..51a89213d 100644 Binary files a/docs/_static/meta-models/grouped-transform.png and b/docs/_static/meta-models/grouped-transform.png differ diff --git a/docs/_static/meta-models/make-blobs.png b/docs/_static/meta-models/make-blobs.png index 6df9e2a0f..347c40dea 100644 Binary files a/docs/_static/meta-models/make-blobs.png and b/docs/_static/meta-models/make-blobs.png differ diff --git a/docs/_static/meta-models/ordinal_data.md b/docs/_static/meta-models/ordinal_data.md index 105883bbd..c79613f27 100644 --- a/docs/_static/meta-models/ordinal_data.md +++ b/docs/_static/meta-models/ordinal_data.md @@ -4,4 +4,4 @@ | somewhat likely | 1 | 0 | 3.21 | 1 | | unlikely | 1 | 1 | 3.94 | 0 | | somewhat likely | 0 | 0 | 2.81 | 1 | -| somewhat likely | 0 | 0 | 2.53 | 1 | +| somewhat likely | 0 | 0 | 2.53 | 1 | \ No newline at end of file diff --git a/docs/_static/meta-models/outlier-classifier-stacking.html b/docs/_static/meta-models/outlier-classifier-stacking.html index e12b15b8c..a0995ffa7 100644 --- a/docs/_static/meta-models/outlier-classifier-stacking.html +++ b/docs/_static/meta-models/outlier-classifier-stacking.html @@ -1,7 +1,411 @@ -
StackingClassifier(estimators=[('anomaly',
+
StackingClassifier(estimators=[('anomaly',
                                 OutlierClassifier(model=IsolationForest())),
                                ('classifier', RandomForestClassifier())],
-                   passthrough=True, stack_method='predict_proba')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
IsolationForest()
IsolationForest()
RandomForestClassifier()
LogisticRegression()
\ No newline at end of file diff --git a/docs/_static/meta-models/outlier-classifier.html b/docs/_static/meta-models/outlier-classifier.html index 263609f1e..d9284d520 100644 --- a/docs/_static/meta-models/outlier-classifier.html +++ b/docs/_static/meta-models/outlier-classifier.html @@ -1,3 +1,407 @@ -
OutlierClassifier(model=IsolationForest(contamination=0.01, n_estimators=1000,
-                                        random_state=0))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
\ No newline at end of file +
OutlierClassifier(model=IsolationForest(contamination=0.01, n_estimators=1000,
+                                        random_state=0))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
\ No newline at end of file diff --git a/docs/_static/meta-models/skewed-data.png b/docs/_static/meta-models/skewed-data.png index 9219ec041..5531c9f80 100644 Binary files a/docs/_static/meta-models/skewed-data.png and b/docs/_static/meta-models/skewed-data.png differ diff --git a/docs/_static/meta-models/threshold-chart.png b/docs/_static/meta-models/threshold-chart.png index 966ced736..316ab49ff 100644 Binary files a/docs/_static/meta-models/threshold-chart.png and b/docs/_static/meta-models/threshold-chart.png differ diff --git a/docs/_static/meta-models/ts-data.png b/docs/_static/meta-models/ts-data.png index dab2adc9a..36a9e87ed 100644 Binary files a/docs/_static/meta-models/ts-data.png and b/docs/_static/meta-models/ts-data.png differ diff --git a/sklego/meta/_shrinkage_utils.py b/sklego/meta/_shrinkage_utils.py index 741299f60..757eeee46 100644 --- a/sklego/meta/_shrinkage_utils.py +++ b/sklego/meta/_shrinkage_utils.py @@ -212,7 +212,7 @@ def _fit_shrinkage_factors(self, frame, groups, most_granular_only=False): } shrinkage_factors = { - grp_value: self.shrinkage_function_(counts_, **self.shrinkage_kwargs) + grp_value: self.shrinkage_function_(counts_, **(self.shrinkage_kwargs or {})) for grp_value, counts_ in hierarchical_counts.items() } diff --git a/sklego/meta/decay_estimator.py b/sklego/meta/decay_estimator.py index eeb55bb27..b454c1327 100644 --- a/sklego/meta/decay_estimator.py +++ b/sklego/meta/decay_estimator.py @@ -87,7 +87,7 @@ class DecayEstimator(BaseEstimator, MetaEstimatorMixin): _required_parameters = ["model"] - def __init__(self, model, decay_func="exponential", check_input=False, **decay_kwargs): + def __init__(self, model, decay_func="exponential", check_input=False, decay_kwargs=None): self.model = model self.decay_func = decay_func self.check_input = check_input @@ -128,7 +128,7 @@ def fit(self, X, y): else: raise ValueError(f"`decay_func` should be one of {self._ALLOWED_DECAYS.keys()} or a callable") - self.weights_ = self.decay_func_(X, y, **self.decay_kwargs) + self.weights_ = self.decay_func_(X, y, **(self.decay_kwargs or {})) self.estimator_ = clone(self.model) try: diff --git a/sklego/meta/grouped_predictor.py b/sklego/meta/grouped_predictor.py index 1a9d1b13d..a61f331fd 100644 --- a/sklego/meta/grouped_predictor.py +++ b/sklego/meta/grouped_predictor.py @@ -99,7 +99,7 @@ def __init__( shrinkage=None, use_global_model=True, check_X=True, - **shrinkage_kwargs, + shrinkage_kwargs=None, ): self.estimator = estimator self.groups = groups diff --git a/sklego/meta/hierarchical_predictor.py b/sklego/meta/hierarchical_predictor.py index 57a656139..f7a53274a 100644 --- a/sklego/meta/hierarchical_predictor.py +++ b/sklego/meta/hierarchical_predictor.py @@ -135,7 +135,7 @@ class HierarchicalPredictor(ShrinkageMixin, MetaEstimatorMixin, BaseEstimator): check_X : bool, default=True Whether to validate `X` to be non-empty 2D array of finite values and attempt to cast `X` to float. If disabled, the model/pipeline is expected to handle e.g. missing, non-numeric, or non-finite values. - **shrinkage_kwargs : dict + shrinkage_kwargs : dict Keyword arguments to the shrinkage function Attributes @@ -210,7 +210,7 @@ def __init__( fallback_method="parent", n_jobs=None, check_X=True, - **shrinkage_kwargs, + shrinkage_kwargs=None, ): self.estimator = estimator self.groups = groups diff --git a/sklego/meta/ordinal_classification.py b/sklego/meta/ordinal_classification.py index 4299b86b8..a08a4e924 100644 --- a/sklego/meta/ordinal_classification.py +++ b/sklego/meta/ordinal_classification.py @@ -95,7 +95,7 @@ class OrdinalClassifier(MultiOutputMixin, ClassifierMixin, MetaEstimatorMixin, B is_multiclass = True - def __init__(self, estimator, *, n_jobs=None, use_calibration=False, **calibration_kwargs): + def __init__(self, estimator, *, n_jobs=None, use_calibration=False, calibration_kwargs=None): self.estimator = estimator self.n_jobs = n_jobs self.use_calibration = use_calibration @@ -218,7 +218,9 @@ def _fit_binary_estimator(self, X, y, y_label): """ y_bin = (y <= y_label).astype(int) if self.use_calibration: - return CalibratedClassifierCV(estimator=clone(self.estimator), **self.calibration_kwargs).fit(X, y_bin) + return CalibratedClassifierCV(estimator=clone(self.estimator), **(self.calibration_kwargs or {})).fit( + X, y_bin + ) else: return clone(self.estimator).fit(X, y_bin) diff --git a/tests/test_meta/test_decay_estimator.py b/tests/test_meta/test_decay_estimator.py index fabf80611..a0d6fc5e8 100644 --- a/tests/test_meta/test_decay_estimator.py +++ b/tests/test_meta/test_decay_estimator.py @@ -53,7 +53,7 @@ def test_decay_weight(mod, is_clf, decay_func, decay_kwargs): if is_clf: y = (y < 0).astype(int) - mod = DecayEstimator(mod, decay_func=decay_func, **decay_kwargs).fit(X, y) + mod = DecayEstimator(mod, decay_func=decay_func, decay_kwargs=decay_kwargs).fit(X, y) assert np.logical_and(mod.weights_ >= 0, mod.weights_ <= 1).all() assert np.all(mod.weights_[:-1] <= mod.weights_[1:]) diff --git a/tests/test_meta/test_grouped_predictor.py b/tests/test_meta/test_grouped_predictor.py index 5d20a23a5..21606a46c 100644 --- a/tests/test_meta/test_grouped_predictor.py +++ b/tests/test_meta/test_grouped_predictor.py @@ -249,7 +249,7 @@ def test_constant_shrinkage(shrinkage_data): ["Planet", "Country", "City"], shrinkage="constant", use_global_model=False, - alpha=0.1, + shrinkage_kwargs={"alpha": 0.1}, ) shrinkage_factors = np.array([0.01, 0.09, 0.9]) @@ -304,7 +304,7 @@ def test_min_n_obs_shrinkage(shrinkage_data): ["Planet", "Country", "City"], shrinkage="min_n_obs", use_global_model=False, - min_n_obs=2, + shrinkage_kwargs={"min_n_obs": 2}, ) shrink_est.fit(X, y) @@ -327,7 +327,7 @@ def test_min_n_obs_shrinkage_too_little_obs(shrinkage_data): ["Planet", "Country", "City"], shrinkage="min_n_obs", use_global_model=False, - min_n_obs=too_big_n_obs, + shrinkage_kwargs={"min_n_obs": too_big_n_obs}, ) with pytest.raises(ValueError) as e: @@ -459,7 +459,7 @@ def test_global_model_shrinkage(shrinkage_data): ["Planet", "Country", "City"], shrinkage="min_n_obs", use_global_model=False, - min_n_obs=2, + shrinkage_kwargs={"min_n_obs": 2}, ) shrink_est_with_global = GroupedPredictor( @@ -467,7 +467,7 @@ def test_global_model_shrinkage(shrinkage_data): ["Country", "City"], shrinkage="min_n_obs", use_global_model=True, - min_n_obs=2, + shrinkage_kwargs={"min_n_obs": 2}, ) shrink_est_without_global.fit(X, y) @@ -490,7 +490,7 @@ def test_shrinkage_single_group(shrinkage_data): "Country", shrinkage="constant", use_global_model=True, - alpha=0.1, + shrinkage_kwargs={"alpha": 0.1}, ) shrinkage_factors = np.array([0.1, 0.9]) @@ -519,7 +519,7 @@ def test_shrinkage_single_group_no_global(shrinkage_data): "Country", shrinkage="constant", use_global_model=False, - alpha=0.1, + shrinkage_kwargs={"alpha": 0.1}, ) shrink_est.fit(X, y) @@ -548,7 +548,9 @@ def test_unseen_groups_shrinkage(shrinkage_data): X, y = df.drop(columns="Target"), df["Target"] - shrink_est = GroupedPredictor(DummyRegressor(), ["Planet", "Country", "City"], shrinkage="constant", alpha=0.1) + shrink_est = GroupedPredictor( + DummyRegressor(), ["Planet", "Country", "City"], shrinkage="constant", shrinkage_kwargs={"alpha": 0.1} + ) shrink_est.fit(X, y) @@ -569,7 +571,7 @@ def test_predict_missing_group_column(shrinkage_data): ["Planet", "Country", "City"], shrinkage="constant", use_global_model=False, - alpha=0.1, + shrinkage_kwargs={"alpha": 0.1}, ) shrink_est.fit(X, y) @@ -592,7 +594,7 @@ def test_predict_missing_value_column(shrinkage_data): ["Planet", "Country", "City"], shrinkage="constant", use_global_model=False, - alpha=0.1, + shrinkage_kwargs={"alpha": 0.1}, ) shrink_est.fit(X, y) diff --git a/tests/test_meta/test_hierarchical_predictor.py b/tests/test_meta/test_hierarchical_predictor.py index ff5c6cd2a..94bd01610 100644 --- a/tests/test_meta/test_hierarchical_predictor.py +++ b/tests/test_meta/test_hierarchical_predictor.py @@ -116,24 +116,28 @@ def make_hierarchical_dummy(frame_func): ) @pytest.mark.parametrize("fallback_method", ["raise", "parent"]) @pytest.mark.parametrize( - "shrinkage", + ("shrinkage", "kwargs"), [ - {"shrinkage": None}, - {"shrinkage": "equal"}, - {"shrinkage": "relative"}, - {"shrinkage": "min_n_obs", "min_n_obs": 10}, - {"shrinkage": "constant", "alpha": 0.5}, + (None, None), + ("equal", None), + ("relative", None), + ("min_n_obs", {"min_n_obs": 10}), + ("constant", {"alpha": 0.5}), ], ) -def test_fit_predict(meta_cls, base_estimator, task, fallback_method, shrinkage): +def test_fit_predict(meta_cls, base_estimator, task, fallback_method, shrinkage, kwargs): """Tests that the model can be fit and predict with different configurations of fallback and shrinkage methods if X to predict contains same groups as X used to fit. """ X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 1)]) - meta_model = meta_cls(estimator=base_estimator, groups=groups, fallback_method=fallback_method, **shrinkage).fit( - X, y - ) + meta_model = meta_cls( + estimator=base_estimator, + groups=groups, + fallback_method=fallback_method, + shrinkage=shrinkage, + shrinkage_kwargs=kwargs, + ).fit(X, y) assert meta_model.estimators_ is not None assert meta_model.predict(X) is not None @@ -173,23 +177,25 @@ def test_fallback(meta_cls, base_estimator, task, fallback_method, context): ], ) @pytest.mark.parametrize( - "shrinkage", + ("shrinkage", "kwargs"), [ - {"shrinkage": None}, - {"shrinkage": "equal"}, - {"shrinkage": "relative"}, - {"shrinkage": "min_n_obs", "min_n_obs": 10}, - {"shrinkage": "constant", "alpha": 0.5}, + (None, None), + ("equal", None), + ("relative", None), + ("min_n_obs", {"min_n_obs": 10}), + ("constant", {"alpha": 0.5}), ], ) -def test_shrinkage(meta_cls, base_estimator, task, metric, shrinkage): +def test_shrinkage(meta_cls, base_estimator, task, metric, shrinkage, kwargs): """Tests that the model performance is better than the base estimator when predicting with different shrinkage methods. """ X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 1)]) X_ = nw.from_native(X).drop(groups).pipe(nw.to_native) - meta_model = meta_cls(estimator=clone(base_estimator), groups=groups, **shrinkage).fit(X, y) + meta_model = meta_cls( + estimator=clone(base_estimator), groups=groups, shrinkage=shrinkage, shrinkage_kwargs=kwargs + ).fit(X, y) base_model = clone(base_estimator).fit(X_, y) assert metric(y, base_model.predict(X_)) <= metric(y, meta_model.predict(X)) diff --git a/tests/test_pandas_utils/test_pandas_utils.py b/tests/test_pandas_utils/test_pandas_utils.py index bff49ff0b..dd4bbc58d 100644 --- a/tests/test_pandas_utils/test_pandas_utils.py +++ b/tests/test_pandas_utils/test_pandas_utils.py @@ -51,6 +51,7 @@ def test_add_lags_correct_df(data, frame_func): if isinstance(expected, pl.LazyFrame): expected = expected.collect() assert [x for x in ans.columns] == [x for x in expected.columns] + assert (ans.to_numpy() == expected.to_numpy()).all()