Skip to content

Commit

Permalink
top level modules
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Dec 15, 2024
1 parent e1b2520 commit d3cb19f
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 55 deletions.
14 changes: 7 additions & 7 deletions sklego/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -79,9 +80,10 @@ def fit(self, X, y=None):
The fitted transformer.
"""
if y is None:
check_array(X, estimator=self)
validate_data(self, X=X, reset=True)
else:
check_X_y(X, y, estimator=self, multi_output=True)
validate_data(self, X=X, y=y, multi_output=True, reset=True)
_check_n_features(self, X, reset=True)
self.X_hash_ = self._hash(X)
self.n_features_in_ = X.shape[1]
return self
Expand Down Expand Up @@ -145,10 +147,8 @@ def transform(self, X, y=None):
If the input dimension does not match the training dimension.
"""
check_is_fitted(self, ["X_hash_", "n_features_in_"])
check_array(X, estimator=self)

if X.shape[1] != self.n_features_in_:
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

if self._hash(X) == self.X_hash_:
return self.transform_train(X)
Expand Down
18 changes: 11 additions & 7 deletions sklego/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import check_X_y
from sklearn.utils.validation import (
FLOAT_DTYPES,
check_array,
check_is_fitted,
check_random_state,
)
from sklearn_compat.utils.validation import _check_n_features, validate_data


class RandomRegressor(RegressorMixin, BaseEstimator):
Expand Down Expand Up @@ -72,8 +71,8 @@ def fit(self, X: np.array, y: np.array) -> "RandomRegressor":
"""
if self.strategy not in self._ALLOWED_STRATEGIES:
raise ValueError(f"strategy {self.strategy} is not in {self._ALLOWED_STRATEGIES}")
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
self.n_features_in_ = X.shape[1]
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

self.min_ = np.min(y)
self.max_ = np.max(y)
Expand All @@ -99,9 +98,8 @@ def predict(self, X):
rs = check_random_state(self.random_state)
check_is_fitted(self, ["n_features_in_", "min_", "max_", "mu_", "sigma_"])

X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
if X.shape[1] != self.n_features_in_:
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}")
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

if self.strategy == "normal":
return rs.normal(self.mu_, self.sigma_, X.shape[0])
Expand All @@ -127,3 +125,9 @@ def allowed_strategies(self):

# Estimator tags as a plain dict: both "poor_score" and "non_deterministic" are set to True.
# NOTE(review): presumably the older scikit-learn tag hook, kept next to `__sklearn_tags__`
# for releases that do not call the newer method - confirm against the supported versions.
def _more_tags(self):
return {"poor_score": True, "non_deterministic": True}

# Object-based counterpart of `_more_tags`: takes the tags object produced by the parent
# class, sets the same two flags on it, and returns it.
def __sklearn_tags__(self):
# Start from the inherited tags so only the two flags below are overridden.
tags = super().__sklearn_tags__()
# `non_deterministic` is set on the top-level tags object, while `poor_score` is set on
# the nested `regressor_tags` attribute.
# NOTE(review): presumably these relax scikit-learn's estimator checks for an estimator
# whose predictions are drawn from a random state - confirm against the sklearn tag docs.
tags.non_deterministic = True
tags.regressor_tags.poor_score = True
return tags
29 changes: 21 additions & 8 deletions sklego/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
check_is_fitted,
column_or_1d,
)
from sklearn_compat.utils.validation import _check_n_features, validate_data


class LowessRegression(RegressorMixin, BaseEstimator):
Expand Down Expand Up @@ -96,7 +97,8 @@ def fit(self, X, y):
- If `span` is not between 0 and 1.
- If `sigma` is negative.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)
if self.span is not None:
if not 0 <= self.span <= 1:
raise ValueError(f"Param `span` must be 0 <= span <= 1, got: {self.span}")
Expand Down Expand Up @@ -138,8 +140,9 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted values.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["X_", "y_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

try:
results = np.stack([np.average(self.y_, weights=self._calc_wts(x_i=x_i)) for x_i in X])
Expand Down Expand Up @@ -233,7 +236,8 @@ def fit(self, X, y):
self : ProbWeightRegression
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

# Construct the problem.
betas = cp.Variable(X.shape[1])
Expand Down Expand Up @@ -263,8 +267,10 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted data.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["coef_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return np.dot(X, self.coef_)

@property
Expand Down Expand Up @@ -381,7 +387,9 @@ def fit(self, X, y):
ValueError
If `effect` is not one of "linear", "quadratic" or "constant".
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

if self.effect not in self._ALLOWED_EFFECTS:
raise ValueError(f"effect {self.effect} must be in {self._ALLOWED_EFFECTS}")

Expand Down Expand Up @@ -458,8 +466,10 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted data.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["coef_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return np.dot(X, self.coef_)

@property
Expand Down Expand Up @@ -1053,7 +1063,9 @@ def _prepare_inputs(self, X, sample_weight, y):
This method is called by `fit` to prepare the inputs for the optimization problem. It adds an intercept column
to `X` if `fit_intercept=True`, and returns the loss function and its gradient.
"""
X, y = check_X_y(X, y, y_numeric=True)
X, y = validate_data(self, X=X, y=y, y_numeric=True, reset=True)
_check_n_features(self, X, reset=True)

sample_weight = _check_sample_weight(sample_weight, X)
self.n_features_in_ = X.shape[1]

Expand Down Expand Up @@ -1083,7 +1095,8 @@ def predict(self, X):
The predicted data.
"""
check_is_fitted(self)
X = check_array(X)
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

return X @ self.coef_ + self.intercept_

Expand Down
5 changes: 3 additions & 2 deletions sklego/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.model_selection._split import _BaseKFold, check_array
from sklearn.model_selection._split import _BaseKFold
from sklearn.utils.validation import indexable
from sklearn_compat.utils.validation import validate_data

from sklego.base import Clusterer
from sklego.common import sliding_window
Expand Down Expand Up @@ -320,7 +321,7 @@ def split(self, X, y=None, groups=None):
Train and test indices of the same fold.
"""

X = check_array(X)
X = validate_data(self, X=X, reset=True)

if not self._method_is_fitted(X):
self.cluster_method.fit(X)
Expand Down
38 changes: 19 additions & 19 deletions sklego/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class GaussianMixtureNB(ClassifierMixin, BaseEstimator):
Expand Down Expand Up @@ -73,10 +73,12 @@ def fit(self, X, y) -> "GaussianMixtureNB":
self : GaussianMixtureNB
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
if X.ndim == 1:
X = np.expand_dims(X, 1)

_check_n_features(self, X, reset=True)

self.gmms_ = {}
self.classes_ = unique_labels(y)
self.n_features_in_ = X.shape[1]
Expand Down Expand Up @@ -117,10 +119,10 @@ def predict(self, X):
The predicted data.
"""
check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)

if self.n_features_in_ != X.shape[1]:
raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)
# if self.n_features_in_ != X.shape[1]:
# raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")

return self.classes_[self.predict_proba(X).argmax(axis=1)]

Expand All @@ -139,10 +141,9 @@ def predict_proba(self, X: np.ndarray):
The predicted probabilities.
"""
check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
if self.n_features_in_ != X.shape[1]:
raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
check_is_fitted(self, ["gmms_", "classes_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X=X, reset=False)

probs = np.zeros((X.shape[0], len(self.classes_)))
for k, v in self.gmms_.items():
class_idx = np.argmax(self.classes_ == k)
Expand Down Expand Up @@ -238,10 +239,11 @@ def fit(self, X, y) -> "BayesianGaussianMixtureNB":
self : BayesianGaussianMixtureNB
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
if X.ndim == 1:
X = np.expand_dims(X, 1)

_check_n_features(self, X, reset=True)
self.gmms_ = {}
self.classes_ = unique_labels(y)
self.n_features_in_ = X.shape[1]
Expand Down Expand Up @@ -287,10 +289,9 @@ def predict(self, X):
The predicted data.
"""
check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)

if self.n_features_in_ != X.shape[1]:
raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
_check_n_features(self, X, reset=False)

return self.classes_[self.predict_proba(X).argmax(axis=1)]

Expand All @@ -309,10 +310,9 @@ def predict_proba(self, X: np.ndarray):
The predicted probabilities.
"""
check_is_fitted(self, ["gmms_", "classes_", "n_features_in_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
if self.n_features_in_ != X.shape[1]:
raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
check_is_fitted(self, ["gmms_", "classes_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

probs = np.zeros((X.shape[0], len(self.classes_)))
for k, v in self.gmms_.items():
class_idx = np.argmax(self.classes_ == k)
Expand Down
17 changes: 10 additions & 7 deletions sklego/neighbors.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class BayesianKernelDensityClassifier(ClassifierMixin, BaseEstimator):
Expand Down Expand Up @@ -62,7 +62,8 @@ def fit(self, X: np.ndarray, y: np.ndarray):
self : BayesianKernelDensityClassifier
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

self.classes_ = unique_labels(y)
self.models_, self.priors_logp_ = {}, {}
Expand Down Expand Up @@ -103,8 +104,9 @@ def predict_proba(self, X):
array-like of shape (n_samples, n_classes)
The predicted probabilities for each class, ordered as in `self.classes_`.
"""
check_is_fitted(self)
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["classes_", "models_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

log_prior = np.array([self.priors_logp_[target_label] for target_label in self.classes_])

Expand All @@ -129,7 +131,8 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted data.
"""
check_is_fitted(self)
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["classes_", "models_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return self.classes_[np.argmax(self.predict_proba(X), 1)]
1 change: 1 addition & 0 deletions tests/test_estimators/test_demographic_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def test_sklearn_compatible_estimator(estimator, check):
# the test
"check_classifiers_train",
"check_n_features_in", # TODO: This should be fixable?!
"check_n_features_in_after_fitting", # same problem as above, new check in 1.6
}:
pytest.skip()

Expand Down
1 change: 1 addition & 0 deletions tests/test_estimators/test_equal_opportunity.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_sklearn_compatible_estimator(estimator, check):
# the test
"check_classifiers_train",
"check_n_features_in", # TODO: This should be fixable?!
"check_n_features_in_after_fitting", # same problem as above, new check in 1.6
}:
pytest.skip()
check(estimator)
Expand Down
4 changes: 4 additions & 0 deletions tests/test_estimators/test_imbalanced_linear_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def _create_dataset(coefs, intercept, noise=0.0):
]
)
# Runs a single estimator check against a single estimator, skipping the checks named below.
# NOTE(review): `estimator` and `check` are presumably supplied by the decorator that closes
# just above this function (only its tail is visible here) - confirm.
def test_sklearn_compatible_estimator(estimator, check):
# Reading `check.func.__name__` assumes `check` is a partial-like wrapper exposing the
# wrapped function as `.func` - TODO confirm.
# The set holds the names of checks to skip; the reason for skipping
# "check_sample_weight_equivalence_on_dense_data" is not stated here.
if check.func.__name__ in {
"check_sample_weight_equivalence_on_dense_data",
}:
pytest.skip()
# Any check that was not skipped is executed against the estimator.
check(estimator)


Expand Down
8 changes: 3 additions & 5 deletions tests/test_estimators/test_quantile_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,9 @@ def _create_dataset(coefs, intercept, noise=0.0):
]
)
def test_sklearn_compatible_estimator(estimator, check):
if (
estimator.method != "SLSQP"
and check.func.__name__ == "check_sample_weights_invariance"
and getattr(check, "keywords", {}).get("kind") == "zeros"
):
if check.func.__name__ in {
"check_sample_weight_equivalence_on_dense_data",
}:
pytest.skip()
check(estimator)

Expand Down

0 comments on commit d3cb19f

Please sign in to comment.