diff --git a/sklego/__init__.py b/sklego/__init__.py index 95d13f89..5c6a1f6c 100644 --- a/sklego/__init__.py +++ b/sklego/__init__.py @@ -1,3 +1,4 @@ +import re import sys if sys.version_info >= (3, 8): @@ -5,5 +6,8 @@ else: import importlib_metadata as metadata + __title__ = "sklego" __version__ = metadata.version("scikit-lego") + +SKLEARN_VERSION = tuple(int(re.sub(r"\D", "", str(v))) for v in metadata.version("scikit-learn").split(".")) diff --git a/sklego/common.py b/sklego/common.py index 548faea2..0e46ac9b 100644 --- a/sklego/common.py +++ b/sklego/common.py @@ -4,11 +4,13 @@ import numpy as np import pandas as pd -from sklearn.base import TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_array, check_is_fitted, check_X_y +from sklego import SKLEARN_VERSION -class TrainOnlyTransformerMixin(TransformerMixin): + +class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator): """Mixin class for transformers that can handle training and test data differently. This mixin allows using a separate function for transforming training and test data. @@ -79,9 +81,9 @@ def fit(self, X, y=None): The fitted transformer. """ if y is None: - check_array(X, estimator=self) + validate_data(self, X) else: - check_X_y(X, y, estimator=self, multi_output=True) + validate_data(self, X, y, multi_output=True) self.X_hash_ = self._hash(X) self.n_features_in_ = X.shape[1] return self @@ -145,7 +147,7 @@ def transform(self, X, y=None): If the input dimension does not match the training dimension. """ check_is_fitted(self, ["X_hash_", "n_features_in_"]) - check_array(X, estimator=self) + X = validate_data(self, X, reset=False) if X.shape[1] != self.n_features_in_: raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}") @@ -339,3 +341,32 @@ def sliding_window(sequence, window_size, step_size): ``` """ return (sequence[pos : pos + window_size] for pos in range(0, len(sequence), step_size)) + + +def validate_data( + estimator, + X="no_validation", + y="no_validation", + reset=True, + validate_separately=False, + skip_check_array=False, + **check_params, +): + if SKLEARN_VERSION >= (1, 6): + from sklearn.utils.validation import validate_data + + return validate_data( + estimator, + X=X, + y=y, + reset=reset, + validate_separately=validate_separately, + skip_check_array=skip_check_array, + **check_params, + ) + + else: + if y == "no_validation": + return check_array(arr=X, estimator=estimator, **check_params) + else: + return check_X_y(X=X, y=y, estimator=estimator, **check_params) diff --git a/sklego/decomposition/pca_reconstruction.py b/sklego/decomposition/pca_reconstruction.py index 3dcc51aa..30d5affc 100644 --- a/sklego/decomposition/pca_reconstruction.py +++ b/sklego/decomposition/pca_reconstruction.py @@ -1,10 +1,12 @@ import numpy as np from sklearn.base import BaseEstimator, OutlierMixin from sklearn.decomposition import PCA -from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted +from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted +from sklego.common import validate_data -class PCAOutlierDetection(BaseEstimator, OutlierMixin): + +class PCAOutlierDetection(OutlierMixin, BaseEstimator): """`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA. If the difference between original and reconstructed data is larger than the `threshold`, the point is @@ -94,7 +96,7 @@ def fit(self, X, y=None): ValueError If `threshold` is `None`. """ - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(self, X, dtype=FLOAT_DTYPES) if not self.threshold: raise ValueError("The `threshold` value cannot be `None`.") @@ -157,7 +159,7 @@ def predict(self, X): array-like of shape (n_samples,) The predicted data. 1 for inliers, -1 for outliers. """ - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(self, X, dtype=FLOAT_DTYPES) check_is_fitted(self, ["pca_", "offset_"]) result = np.ones(X.shape[0]) result[self.difference(X) > self.threshold] = -1 diff --git a/sklego/decomposition/umap_reconstruction.py b/sklego/decomposition/umap_reconstruction.py index 330fe8f8..30048c94 100644 --- a/sklego/decomposition/umap_reconstruction.py +++ b/sklego/decomposition/umap_reconstruction.py @@ -8,10 +8,12 @@ import numpy as np from sklearn.base import BaseEstimator, OutlierMixin -from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted +from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted +from sklego.common import validate_data -class UMAPOutlierDetection(BaseEstimator, OutlierMixin): + +class UMAPOutlierDetection(OutlierMixin, BaseEstimator): """`UMAPOutlierDetection` is an outlier detector based on the reconstruction error from UMAP. If the difference between original and reconstructed data is larger than the `threshold`, the point is @@ -100,9 +102,9 @@ def fit(self, X, y=None): - If `n_components` is less than 2. - If `threshold` is `None`. """ - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(self, X, dtype=FLOAT_DTYPES) if y is not None: - y = check_array(y, estimator=self, ensure_2d=False) + y = validate_data(self, y, ensure_2d=False) if not self.threshold: raise ValueError("The `threshold` value cannot be `None`.") @@ -133,6 +135,7 @@ def difference(self, X): The calculated difference. """ check_is_fitted(self, ["umap_", "offset_"]) + reduced = self.umap_.transform(X) diff = np.sum(np.abs(self.umap_.inverse_transform(reduced) - X), axis=1) if self.variant == "relative": @@ -155,7 +158,7 @@ def predict(self, X): array-like of shape (n_samples,) The predicted data. 1 for inliers, -1 for outliers. """ - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(self, X, dtype=FLOAT_DTYPES) check_is_fitted(self, ["umap_", "offset_"]) result = np.ones(X.shape[0]) result[self.difference(X) > self.threshold] = -1 @@ -172,3 +175,13 @@ def score_samples(self, X): def _more_tags(self): return {"non_deterministic": True} + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.non_deterministic = True + return tags + else: + pass diff --git a/sklego/dummy.py b/sklego/dummy.py index 35b4d639..c9748436 100644 --- a/sklego/dummy.py +++ b/sklego/dummy.py @@ -11,7 +11,7 @@ ) -class RandomRegressor(BaseEstimator, RegressorMixin): +class RandomRegressor(RegressorMixin, BaseEstimator): """A `RandomRegressor` makes random predictions only based on the `y` value that is seen. The goal is that such a regressor can be used for benchmarking. It _should be_ easily beatable. @@ -101,7 +101,7 @@ def predict(self, X): X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) if X.shape[1] != self.n_features_in_: - raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}") + raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}") if self.strategy == "normal": return rs.normal(self.mu_, self.sigma_, X.shape[0]) @@ -127,3 +127,14 @@ def allowed_strategies(self): def _more_tags(self): return {"poor_score": True, "non_deterministic": True} + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.non_deterministic = True + tags.regressor_tags.poor_score = True + return tags + else: + pass diff --git a/sklego/feature_selection/mrmr.py b/sklego/feature_selection/mrmr.py index 5670f150..64f436bb 100644 --- a/sklego/feature_selection/mrmr.py +++ b/sklego/feature_selection/mrmr.py @@ -4,7 +4,9 @@ from sklearn.base import BaseEstimator from sklearn.feature_selection import f_classif, f_regression from sklearn.feature_selection._base import SelectorMixin -from sklearn.utils.validation import check_is_fitted, check_X_y +from sklearn.utils.validation import check_is_fitted + +from sklego.common import validate_data def _redundancy_pearson(X, selected, left): @@ -201,7 +203,8 @@ def fit(self, X, y): k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1 """ - X, y = check_X_y(X, y, dtype="numeric", y_numeric=True) + X, y = validate_data(self, X, y, dtype="numeric", y_numeric=True) + self._y_dtype = y.dtype relevance = self._get_relevance diff --git a/sklego/linear_model.py b/sklego/linear_model.py index 7c05262c..9255b5cb 100644 --- a/sklego/linear_model.py +++ b/sklego/linear_model.py @@ -27,7 +27,7 @@ ) -class LowessRegression(BaseEstimator, RegressorMixin): +class LowessRegression(RegressorMixin, BaseEstimator): """`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of [local regression](https://en.wikipedia.org/wiki/Local_regression). @@ -145,7 +145,7 @@ def predict(self, X): return results -class ProbWeightRegression(BaseEstimator, RegressorMixin): +class ProbWeightRegression(RegressorMixin, BaseEstimator): """`ProbWeightRegression` assumes that all input signals in `X` need to be reweighted with weights that sum up to one in order to predict `y`. @@ -266,7 +266,7 @@ def coefs_(self): return self.coef_ -class DeadZoneRegressor(BaseEstimator, RegressorMixin): +class DeadZoneRegressor(RegressorMixin, BaseEstimator): r"""The `DeadZoneRegressor` estimator implements a regression model that incorporates a _dead zone effect_ for improving the robustness of regression predictions. @@ -470,7 +470,7 @@ def allowed_effects(self): return self._ALLOWED_EFFECTS -class _FairClassifier(BaseEstimator, LinearClassifierMixin): +class _FairClassifier(LinearClassifierMixin, BaseEstimator): """Base class for fair classifiers that address sensitive attribute fairness. This base class provides a foundation for fair classifiers that aim to mitigate bias and discrimination by taking @@ -671,8 +671,18 @@ def decision_function(self, X): def _more_tags(self): return {"poor_score": True} + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION -class DemographicParityClassifier(BaseEstimator, LinearClassifierMixin): + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.classifier_tags.poor_score = True + return tags + else: + pass + + +class DemographicParityClassifier(LinearClassifierMixin, BaseEstimator): r"""`DemographicParityClassifier` is a logistic regression classifier which can be constrained on demographic parity (p% score). @@ -790,7 +800,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs): return [] -class EqualOpportunityClassifier(BaseEstimator, LinearClassifierMixin): +class EqualOpportunityClassifier(LinearClassifierMixin, BaseEstimator): r"""`EqualOpportunityClassifier` is a logistic regression classifier which can be constrained on equal opportunity score. @@ -904,7 +914,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs): return [] -class BaseScipyMinimizeRegressor(BaseEstimator, RegressorMixin, ABC): +class BaseScipyMinimizeRegressor(RegressorMixin, BaseEstimator, ABC): """Abstract base class for regressors relying on Scipy's [minimize method](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) to minimize a (custom) loss function. @@ -960,8 +970,6 @@ def __init__( self.fit_intercept = fit_intercept self.copy_X = copy_X self.positive = positive - if method not in ("SLSQP", "TNC", "L-BFGS-B"): - raise ValueError(f'method should be one of "SLSQP", "TNC", "L-BFGS-B", ' f"got {method} instead") self.method = method @abstractmethod @@ -1011,6 +1019,9 @@ def fit(self, X, y, sample_weight=None): self : BaseScipyMinimizeRegressor Fitted linear model. """ + if self.method not in {"SLSQP", "TNC", "L-BFGS-B"}: + msg = f"method should be one of 'SLSQP', 'TNC', 'L-BFGS-B', got {self.method} instead" + raise ValueError(msg) X_, grad_loss, loss = self._prepare_inputs(X, sample_weight, y) d = X_.shape[1] - self.n_features_in_ # This is either zero or one. diff --git a/sklego/meta/confusion_balancer.py b/sklego/meta/confusion_balancer.py index 8821d8b0..26b00fdc 100644 --- a/sklego/meta/confusion_balancer.py +++ b/sklego/meta/confusion_balancer.py @@ -7,7 +7,7 @@ from sklego.base import ProbabilisticClassifier -class ConfusionBalancer(BaseEstimator, MetaEstimatorMixin, ClassifierMixin): +class ConfusionBalancer(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): r"""The `ConfusionBalancer` estimator attempts to give it's child estimator a more balanced output by learning from the confusion matrix during training. diff --git a/sklego/meta/decay_estimator.py b/sklego/meta/decay_estimator.py index b454c132..fcdc9e31 100644 --- a/sklego/meta/decay_estimator.py +++ b/sklego/meta/decay_estimator.py @@ -5,7 +5,7 @@ from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay -class DecayEstimator(BaseEstimator, MetaEstimatorMixin): +class DecayEstimator(MetaEstimatorMixin, BaseEstimator): """Morphs an estimator such that the training weights can be adapted to ensure that points that are far away have less weight. @@ -97,10 +97,16 @@ def _is_classifier(self): """Checks if the wrapped estimator is a classifier.""" return any(["ClassifierMixin" in p.__name__ for p in type(self.model).__bases__]) + def _is_regressor(self): + """Checks if the wrapped estimator is a regressor.""" + return any(["RegressorMixin" in p.__name__ for p in type(self.model).__bases__]) + @property def _estimator_type(self): """Computes `_estimator_type` dynamically from the wrapped model.""" - return self.model._estimator_type + from sklego import SKLEARN_VERSION + + return self.model.__sklearn_tags__().estimator_type if SKLEARN_VERSION >= (1, 6) else self.model._estimator_type def fit(self, X, y): """Fit the underlying estimator on the training data `X` and `y` using the calculated sample weights. @@ -165,3 +171,6 @@ def predict(self, X): def score(self, X, y): """Alias for `.score()` method of the underlying estimator.""" return self.estimator_.score(X, y) + + def __sklearn_tags__(self): + return self.model.__sklearn_tags__() diff --git a/sklego/meta/grouped_predictor.py b/sklego/meta/grouped_predictor.py index 40878201..4dc4378d 100644 --- a/sklego/meta/grouped_predictor.py +++ b/sklego/meta/grouped_predictor.py @@ -401,8 +401,18 @@ def _estimator_type(self): def _more_tags(self): return {"allow_nan": True} + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION -class GroupedRegressor(GroupedPredictor, RegressorMixin): + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + else: + pass + + +class GroupedRegressor(RegressorMixin, GroupedPredictor): """`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data. Its spec is the same as [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] but it is available @@ -439,7 +449,7 @@ def fit(self, X, y): return super().fit(X, y) -class GroupedClassifier(GroupedPredictor, ClassifierMixin): +class GroupedClassifier(ClassifierMixin, GroupedPredictor): """`GroupedClassifier` is a meta-estimator that fits a separate classifier for each group in the input data. Its equivalent to [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] with `shrinkage=None` diff --git a/sklego/meta/grouped_transformer.py b/sklego/meta/grouped_transformer.py index 2dfe18ee..30326d1e 100644 --- a/sklego/meta/grouped_transformer.py +++ b/sklego/meta/grouped_transformer.py @@ -209,6 +209,16 @@ def transform(self, X): def _more_tags(self): return {"allow_nan": True} + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + else: + pass + def get_feature_names_out(self) -> List[str]: "Alias for the `feature_names_out_` attribute defined during fit." return self.feature_names_out_ diff --git a/sklego/meta/hierarchical_predictor.py b/sklego/meta/hierarchical_predictor.py index 5d71cc5c..870a79a3 100644 --- a/sklego/meta/hierarchical_predictor.py +++ b/sklego/meta/hierarchical_predictor.py @@ -423,6 +423,16 @@ def n_levels_(self): def _more_tags(self): return {"allow_nan": True} + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + else: + pass + class HierarchicalRegressor(HierarchicalPredictor, RegressorMixin): """A hierarchical regressor that predicts values using hierarchical grouping. diff --git a/sklego/meta/outlier_classifier.py b/sklego/meta/outlier_classifier.py index 09f6d50d..d965e443 100644 --- a/sklego/meta/outlier_classifier.py +++ b/sklego/meta/outlier_classifier.py @@ -7,7 +7,7 @@ from sklego.base import OutlierModel -class OutlierClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): +class OutlierClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Morphs an outlier detection model into a classifier. When an outlier is detected it will output 1 and 0 otherwise. This way you can use familiar metrics again and this diff --git a/sklego/meta/regression_outlier_detector.py b/sklego/meta/regression_outlier_detector.py index 6ef8a8b2..4c51267a 100644 --- a/sklego/meta/regression_outlier_detector.py +++ b/sklego/meta/regression_outlier_detector.py @@ -5,7 +5,7 @@ from sklearn.utils.validation import check_array, check_is_fitted -class RegressionOutlierDetector(BaseEstimator, OutlierMixin): +class RegressionOutlierDetector(OutlierMixin, BaseEstimator): """Morphs a regression estimator into one that can detect outliers. We will try to predict `column` in X. Parameters diff --git a/sklego/meta/subjective_classifier.py b/sklego/meta/subjective_classifier.py index 60e72463..b396bddc 100644 --- a/sklego/meta/subjective_classifier.py +++ b/sklego/meta/subjective_classifier.py @@ -6,7 +6,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y -class SubjectiveClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): +class SubjectiveClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): """Corrects predictions of the inner classifier by taking into account a (subjective) prior distribution of the classes. diff --git a/sklego/meta/thresholder.py b/sklego/meta/thresholder.py index b08e76b8..85265a2b 100644 --- a/sklego/meta/thresholder.py +++ b/sklego/meta/thresholder.py @@ -5,12 +5,14 @@ from sklearn import clone from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import NotFittedError +from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_X_y +from sklego import SKLEARN_VERSION from sklego.base import ProbabilisticClassifier -class Thresholder(BaseEstimator, ClassifierMixin): +class Thresholder(ClassifierMixin, BaseEstimator): """Takes a binary classifier and moves the threshold. This way you might design the algorithm to only accept a certain class if the probability for it is larger than, say, 90% instead of 50%. @@ -103,8 +105,11 @@ def fit(self, X, y, sample_weight=None): self.n_features_in_ = X.shape[1] self.classes_ = self.estimator_.classes_ - if len(self.classes_) != 2: - raise ValueError("The `Thresholder` meta model only works on models with two classes.") + + extra_args = {"raise_unknown": True} if SKLEARN_VERSION >= (1, 6) else {} + y_type = type_of_target(y, input_name="y", **extra_args) + if y_type != "binary": + raise ValueError("Only binary classification is supported. The type of the target " f"is {y_type}.") return self @@ -139,3 +144,13 @@ def _more_tags(self): return { "binary_only": True, } + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_class = False + return tags + else: + pass diff --git a/sklego/meta/zero_inflated_regressor.py b/sklego/meta/zero_inflated_regressor.py index 9a15edb0..fc9e03f6 100644 --- a/sklego/meta/zero_inflated_regressor.py +++ b/sklego/meta/zero_inflated_regressor.py @@ -8,7 +8,7 @@ from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y -class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin): +class ZeroInflatedRegressor(RegressorMixin, MetaEstimatorMixin, BaseEstimator): """A meta regressor for zero-inflated datasets, i.e. the targets contain a lot of zeroes. `ZeroInflatedRegressor` consists of a classifier and a regressor. @@ -91,7 +91,8 @@ def fit(self, X, y, sample_weight=None): If `classifier` is not a classifier or `regressor` is not a regressor. """ X, y = check_X_y(X, y) - self._check_n_features(X, reset=True) + self.n_features_in_ = X.shape[1] + if not is_classifier(self.classifier): raise ValueError( f"`classifier` has to be a classifier. Received instance of {type(self.classifier)} instead." @@ -155,9 +156,11 @@ def predict(self, X): array-like of shape (n_samples,) The predicted values. """ - check_is_fitted(self) + check_is_fitted(self, ["n_features_in_", "classifier_", "regressor_"]) X = check_array(X) - self._check_n_features(X, reset=False) + if X.shape[1] != self.n_features_in_: + msg = f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}" + raise ValueError(msg) output = np.zeros(len(X)) non_zero_indices = np.where(self.classifier_.predict(X))[0] @@ -195,7 +198,9 @@ def score_samples(self, X): check_is_fitted(self) X = check_array(X) - self._check_n_features(X, reset=True) + if X.shape[1] != self.n_features_in_: + msg = f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}" + raise ValueError(msg) non_zero_proba = self.classifier_.predict_proba(X)[:, 1] expected_impact = self.regressor_.predict(X) diff --git a/sklego/mixture/bayesian_gmm_classifier.py b/sklego/mixture/bayesian_gmm_classifier.py index 66b6b5e0..805420df 100644 --- a/sklego/mixture/bayesian_gmm_classifier.py +++ b/sklego/mixture/bayesian_gmm_classifier.py @@ -7,7 +7,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class BayesianGMMClassifier(BaseEstimator, ClassifierMixin): +class BayesianGMMClassifier(ClassifierMixin, BaseEstimator): """The `BayesianGMMClassifier` trains a Gaussian Mixture Model for each class in `y` on a dataset `X`. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely. diff --git a/sklego/mixture/gmm_classifier.py b/sklego/mixture/gmm_classifier.py index 01044325..9b6705a5 100644 --- a/sklego/mixture/gmm_classifier.py +++ b/sklego/mixture/gmm_classifier.py @@ -7,7 +7,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class GMMClassifier(BaseEstimator, ClassifierMixin): +class GMMClassifier(ClassifierMixin, BaseEstimator): """The `GMMClassifier` trains a Gaussian Mixture Model for each class in `y` on a dataset `X`. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely. diff --git a/sklego/naive_bayes.py b/sklego/naive_bayes.py index a3fab146..05fc9807 100644 --- a/sklego/naive_bayes.py +++ b/sklego/naive_bayes.py @@ -8,7 +8,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class GaussianMixtureNB(BaseEstimator, ClassifierMixin): +class GaussianMixtureNB(ClassifierMixin, BaseEstimator): """The `GaussianMixtureNB` estimator is a naive bayes classifier that uses a mixture of gaussians instead of merely a single one. In particular it trains a `GaussianMixture` model for each class in the target and for each feature in the data, on the subset of `X` where `y == class`. @@ -118,6 +118,9 @@ def predict(self, X): """ check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"]) X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + + if self.n_features_in_ != X.shape[1]: + raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}") return self.classes_[self.predict_proba(X).argmax(axis=1)] def predict_proba(self, X: np.ndarray): @@ -158,7 +161,7 @@ def num_fit_cols_(self): return self.n_features_in_ -class BayesianGaussianMixtureNB(BaseEstimator, ClassifierMixin): +class BayesianGaussianMixtureNB(ClassifierMixin, BaseEstimator): """The `BayesianGaussianMixtureNB` estimator is a naive bayes classifier that uses a bayesian mixture of gaussians instead of merely a single one. In particular it trains a `BayesianGaussianMixture` model for each class in the target and for each feature in the data, on the subset of `X` where `y == class`. @@ -235,6 +238,7 @@ def fit(self, X, y) -> "BayesianGaussianMixtureNB": The fitted estimator. """ X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES) + if X.ndim == 1: X = np.expand_dims(X, 1) @@ -284,6 +288,10 @@ def predict(self, X): """ check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"]) X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + + if self.n_features_in_ != X.shape[1]: + raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}") + return self.classes_[self.predict_proba(X).argmax(axis=1)] def predict_proba(self, X: np.ndarray): diff --git a/sklego/neighbors.py b/sklego/neighbors.py index 55cdbe19..9a35ba0c 100644 --- a/sklego/neighbors.py +++ b/sklego/neighbors.py @@ -6,7 +6,7 @@ from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted -class BayesianKernelDensityClassifier(BaseEstimator, ClassifierMixin): +class BayesianKernelDensityClassifier(ClassifierMixin, BaseEstimator): """The `BayesianKernelDensityClassifier` estimator trains using Kernel Density estimations to generate the joint distribution. diff --git a/sklego/preprocessing/columncapper.py b/sklego/preprocessing/columncapper.py index 1caa6969..b2b1a8f0 100644 --- a/sklego/preprocessing/columncapper.py +++ b/sklego/preprocessing/columncapper.py @@ -96,9 +96,6 @@ def __init__( discard_infs=False, copy=True, ): - self._check_quantile_range(quantile_range) - self._check_interpolation(interpolation) - self.quantile_range = quantile_range self.interpolation = interpolation self.discard_infs = discard_infs @@ -124,6 +121,8 @@ def fit(self, X, y=None): ValueError If `X` contains non-numeric columns. """ + self._check_quantile_range(self.quantile_range) + self._check_interpolation(self.interpolation) X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self) # If X contains infs, we need to replace them by nans before computing quantiles @@ -245,3 +244,13 @@ def n_columns_(self): def _more_tags(self): return {"allow_nan": True} + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + else: + pass diff --git a/sklego/preprocessing/dictmapper.py b/sklego/preprocessing/dictmapper.py index d718430a..33b851ab 100644 --- a/sklego/preprocessing/dictmapper.py +++ b/sklego/preprocessing/dictmapper.py @@ -127,3 +127,15 @@ def dim_(self): def _more_tags(self): return {"preserves_dtype": None, "allow_nan": True, "no_validation": True} + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [] + tags.input_tags.allow_nan = True + tags.no_validation = True + return tags + else: + pass diff --git a/sklego/preprocessing/identitytransformer.py b/sklego/preprocessing/identitytransformer.py index bf291f00..33dda462 100644 --- a/sklego/preprocessing/identitytransformer.py +++ b/sklego/preprocessing/identitytransformer.py @@ -3,7 +3,7 @@ from sklearn.utils.validation import check_is_fitted -class IdentityTransformer(BaseEstimator, TransformerMixin): +class IdentityTransformer(TransformerMixin, BaseEstimator): """The `IdentityTransformer` returns what it is fed. Does not apply any transformation. The reason for having it is because you can build more expressive pipelines. diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 2af07cb3..faccfa98 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -60,7 +60,7 @@ def _nw_select_dtypes(include: str | list[str], exclude: str | list[str], schema return feature_names -class ColumnDropper(BaseEstimator, TransformerMixin): +class ColumnDropper(TransformerMixin, BaseEstimator): """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. @@ -226,7 +226,7 @@ def _check_column_names(self, X): raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame") -class TypeSelector(BaseEstimator, TransformerMixin): +class TypeSelector(TransformerMixin, BaseEstimator): """The `TypeSelector` transformer allows to select columns in a DataFrame based on their type. Can be useful in a sklearn Pipeline. @@ -412,7 +412,7 @@ def __init__(self, include=None, exclude=None): super().__init__(include=include, exclude=exclude) -class ColumnSelector(BaseEstimator, TransformerMixin): +class ColumnSelector(TransformerMixin, BaseEstimator): """The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. diff --git a/sklego/preprocessing/projections.py b/sklego/preprocessing/projections.py index d27e88f3..cfb41a5d 100644 --- a/sklego/preprocessing/projections.py +++ b/sklego/preprocessing/projections.py @@ -7,7 +7,7 @@ from sklego.common import as_list -class OrthogonalTransformer(BaseEstimator, TransformerMixin): +class OrthogonalTransformer(TransformerMixin, BaseEstimator): r"""The `OrthogonalTransformer` transforms the columns of a dataframe or numpy array to orthogonal (or orthonormal if `normalize=True`) matrix. @@ -113,7 +113,7 @@ def vector_projection(vec, unto): return scalar_projection(vec, unto) * unto -class InformationFilter(BaseEstimator, TransformerMixin): +class InformationFilter(TransformerMixin, BaseEstimator): r"""The `InformationFilter` transformer uses a variant of the [Gram-Schmidt process](https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process) to filter information out of the dataset. diff --git a/sklego/preprocessing/randomadder.py b/sklego/preprocessing/randomadder.py index c1a79f39..a3a690ad 100644 --- a/sklego/preprocessing/randomadder.py +++ b/sklego/preprocessing/randomadder.py @@ -1,10 +1,9 @@ from warnings import warn from sklearn.base import BaseEstimator -from sklearn.utils import check_array, check_X_y from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_random_state -from sklego.common import TrainOnlyTransformerMixin +from sklego.common import TrainOnlyTransformerMixin, validate_data class RandomAdder(TrainOnlyTransformerMixin, BaseEstimator): @@ -69,7 +68,7 @@ def fit(self, X, y): The fitted transformer. """ super().fit(X, y) - X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES) + X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES) self.n_features_in_ = X.shape[1] return self @@ -90,7 +89,7 @@ def transform_train(self, X): rs = check_random_state(self.random_state) check_is_fitted(self, ["n_features_in_"]) - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(estimator=self, X=X, dtype=FLOAT_DTYPES) return X + rs.normal(0, self.noise, size=X.shape) @@ -104,3 +103,13 @@ def dim_(self): def _more_tags(self): return {"non_deterministic": True} + + def __sklearn_tags__(self): + from sklego import SKLEARN_VERSION + + if SKLEARN_VERSION >= (1, 6): + tags = super().__sklearn_tags__() + tags.non_deterministic = True + return tags + else: + pass