diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md index 66c510160..b27c971cb 100644 --- a/docs/api/preprocessing.md +++ b/docs/api/preprocessing.md @@ -64,3 +64,8 @@ options: show_root_full_path: true show_root_heading: true + +:::sklego.preprocessing.pandastransformers.TypeSelector + options: + show_root_full_path: true + show_root_heading: true diff --git a/docs/contribution.md b/docs/contribution.md index 8ef139619..5ead256da 100644 --- a/docs/contribution.md +++ b/docs/contribution.md @@ -174,7 +174,7 @@ When a new feature is introduced, it should be documented, and typically there a - [x] A user guide in the `docs/user-guide/` folder. - [x] A python script in the `docs/_scripts/` folder to generate plots and code snippets (see [next section](#working-with-pymdown-snippets-extension)) - [x] Relevant static files, such as images, plots, tables and html's, should be saved in the `docs/_static/` folder. -- [x] Edit the `mkdocs.yaml` file to include the new pages in the navigation. +- [x] Edit the `mkdocs.yaml` file to include the new pages in the navigation. 
### Working with pymdown snippets extension diff --git a/pyproject.toml b/pyproject.toml index b3f2354e1..1f1a7a2a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ maintainers = [ ] dependencies = [ - "narwhals>=0.8.12", + "narwhals>=0.8.13", "pandas>=1.1.5", "scikit-learn>=1.0", "importlib-metadata >= 1.0; python_version < '3.8'", diff --git a/readme.md b/readme.md index fbd570456..21bfe1eed 100644 --- a/readme.md +++ b/readme.md @@ -120,7 +120,7 @@ Here's a list of features that this library currently offers: - `sklego.preprocessing.InformationFilter` transformer that can de-correlate features - `sklego.preprocessing.IdentityTransformer` returns the same data, allows for concatenating pipelines - `sklego.preprocessing.OrthogonalTransformer` makes all features linearly independent -- `sklego.preprocessing.PandasTypeSelector` selects columns based on pandas type +- `sklego.preprocessing.TypeSelector` selects columns based on type - `sklego.preprocessing.RandomAdder` adds randomness in training - `sklego.preprocessing.RepeatingBasisFunction` repeating feature engineering, useful for timeseries - `sklego.preprocessing.DictMapper` assign numeric values on categorical columns diff --git a/sklego/preprocessing/__init__.py b/sklego/preprocessing/__init__.py index 644ada48d..bd068a397 100644 --- a/sklego/preprocessing/__init__.py +++ b/sklego/preprocessing/__init__.py @@ -10,6 +10,7 @@ "OrthogonalTransformer", "OutlierRemover", "PandasTypeSelector", + "TypeSelector", "RandomAdder", "RepeatingBasisFunction", ] @@ -20,7 +21,7 @@ from sklego.preprocessing.identitytransformer import IdentityTransformer from sklego.preprocessing.intervalencoder import IntervalEncoder from sklego.preprocessing.outlier_remover import OutlierRemover -from sklego.preprocessing.pandastransformers import ColumnDropper, ColumnSelector, PandasTypeSelector +from sklego.preprocessing.pandastransformers import ColumnDropper, ColumnSelector, PandasTypeSelector, TypeSelector from 
# Selector names understood by the narwhals (non-pandas) code path.
_NW_SUPPORTED_SELECTIONS = ("number", "bool", "string", "category")


def _nw_unsupported_selection(selection):
    """Build the error raised for a selector the narwhals path cannot handle."""
    msg = f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}, which is not (yet!) supported."
    return ValueError(msg)


def _nw_match_dtype(dtype, selection):
    """Tell whether a narwhals ``dtype`` belongs to the dtype group ``selection``.

    Parameters
    ----------
    dtype : narwhals dtype
        The dtype of a single column, as found in ``df.schema``.
    selection : str
        One of ``'number'``, ``'bool'``, ``'string'`` or ``'category'``.

    Returns
    -------
    bool
        Whether ``dtype`` is part of the requested group.

    Raises
    ------
    ValueError
        If ``selection`` is not one of the supported selectors.
    """
    if selection == "number":
        return dtype in (
            nw.Int64,
            nw.Int32,
            nw.Int16,
            nw.Int8,
            nw.UInt64,
            nw.UInt32,
            nw.UInt16,
            nw.UInt8,
            nw.Float64,
            nw.Float32,
        )
    if selection == "bool":
        return dtype == nw.Boolean
    if selection == "string":
        return dtype == nw.String
    if selection == "category":
        return dtype == nw.Categorical
    raise _nw_unsupported_selection(selection)


def _nw_select_dtypes(df, include: str | list[str] | None, exclude: str | list[str] | None):
    """Select the columns of a narwhals DataFrame whose dtype matches ``include`` and not ``exclude``.

    Parameters
    ----------
    df : narwhals DataFrame
        Frame exposing ``schema`` (column name -> dtype) and ``select``.
    include : str | list[str] | None
        Selector(s) a column dtype must match to be kept. When empty, the four supported
        selectors are used, so a column whose dtype matches none of them is never returned.
    exclude : str | list[str] | None
        Selector(s) a column dtype must not match to be kept.

    Returns
    -------
    narwhals DataFrame
        The result of ``df.select`` on the matching column names, in schema order.

    Raises
    ------
    ValueError
        If neither ``include`` nor ``exclude`` is provided, or if any selector is unsupported.
    """
    if not include and not exclude:
        raise ValueError("Must provide at least one of `include` or `exclude`")

    if isinstance(include, str):
        include = [include]
    if isinstance(exclude, str):
        exclude = [exclude]

    include = include or ["string", "number", "bool", "category"]
    exclude = exclude or []

    # Validate every selector before looking at the data: the `any(...)` calls below
    # short-circuit, so an unsupported selector would otherwise be reported (or silently
    # ignored) depending on which dtypes the frame happens to contain.
    for selection in (*include, *exclude):
        if selection not in _NW_SUPPORTED_SELECTIONS:
            raise _nw_unsupported_selection(selection)

    feature_names = [
        name
        for name, dtype in df.schema.items()
        if any(_nw_match_dtype(dtype, _include) for _include in include)
        and not any(_nw_match_dtype(dtype, _exclude) for _exclude in exclude)
    ]
    return df.select(feature_names)
- X_dtypes_ : pd.Series + X_dtypes_ : Series | dict[str, DType] The dtypes of the columns in the input DataFrame. !!! warning @@ -202,7 +259,7 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin): -------- ```py import pandas as pd - from sklego.preprocessing import PandasTypeSelector + from sklego.preprocessing import TypeSelector df = pd.DataFrame({ "name": ["Swen", "Victor", "Alex"], @@ -211,14 +268,14 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin): }) #Excluding single column - PandasTypeSelector(exclude="int64").fit_transform(df) + TypeSelector(exclude="int64").fit_transform(df) # name length #0 Swen 1.82 #1 Victor 1.85 #2 Alex 1.80 #Including multiple columns - PandasTypeSelector(include=["int64", "object"]).fit_transform(df) + TypeSelector(include=["int64", "object"]).fit_transform(df) # name shoesize #0 Swen 42 #1 Victor 44 @@ -235,26 +292,30 @@ def fit(self, X, y=None): Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. - y : pd.Series, default=None + y : Series, default=None Ignored, present for compatibility. Returns ------- - self : PandasTypeSelector + self : TypeSelector The fitted transformer. Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame. ValueError If provided type(s) results in empty dataframe. 
def transform(self, X):
    """Returns a DataFrame with columns (de)selected based on their dtype.

    Parameters
    ----------
    X : DataFrame
        The data to select dtype for.

    Returns
    -------
    DataFrame
        The data with the specified columns selected, of the same native type as `X`.

    Raises
    ------
    TypeError
        If `X` is not a supported DataFrame.
    ValueError
        If column dtypes were not equal during fit and transform.
    """
    check_is_fitted(self, ["X_dtypes_", "feature_names_"])

    if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame):
        # Only the comparison sits in the `try`: if it raises ValueError itself, that is
        # reported as a dtype mismatch. Keeping the detailed error below out of the `try`
        # stops it from being replaced by the terse one.
        try:
            dtypes_differ = (self.X_dtypes_ != X.dtypes).any()
        except ValueError as e:
            raise ValueError("Column dtypes were not equal during fit and transform") from e
        if dtypes_differ:
            raise ValueError(
                f"Column dtypes were not equal during fit and transform. Fit types: \n"
                f"{self.X_dtypes_}\n"
                f"transform: \n"
                f"{X.dtypes}"
            )
        return X.select_dtypes(include=self.include, exclude=self.exclude)

    X = nw.from_native(X)
    if self.X_dtypes_ != X.schema:
        raise ValueError(
            f"Column dtypes were not equal during fit and transform. Fit types: \n"
            f"{self.X_dtypes_}\n"
            f"transform: \n"
            f"{X.schema}"
        )
    selected = _nw_select_dtypes(X, include=self.include, exclude=self.exclude)
    # Hand back the caller's own dataframe type rather than the narwhals wrapper.
    return nw.to_native(selected)
test_len_regression(transformer, random_xy_dataset_regr, frame_func): X, y = random_xy_dataset_regr - X = pd.DataFrame(X) + X = frame_func(X) assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0] -@pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func) -def test_len_classification(transformer, random_xy_dataset_clf): +@pytest.mark.parametrize("transformer", [TypeSelector(include=["number"])], ids=id_func) +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_len_classification(transformer, random_xy_dataset_clf, frame_func): X, y = random_xy_dataset_clf - X = pd.DataFrame(X) + X = frame_func(X) assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0] @@ -27,7 +30,7 @@ def test_len_classification(transformer, random_xy_dataset_clf): [_ for _ in it.combinations(["number", "datetime", "timedelta", "category", "datetimetz", None], 2)], ) def test_get_params_str(include, exclude): - transformer = PandasTypeSelector(include=include, exclude=exclude) + transformer = TypeSelector(include=include, exclude=exclude) assert transformer.get_params() == {"include": include, "exclude": exclude} @@ -37,31 +40,44 @@ def test_get_params_str(include, exclude): [_ for _ in it.combinations([np.int64, np.float64, np.datetime64, np.timedelta64], 2)], ) def test_get_params_np(include, exclude): - transformer = PandasTypeSelector(include=include, exclude=exclude) + transformer = TypeSelector(include=include, exclude=exclude) assert transformer.get_params() == {"include": include, "exclude": exclude} -def test_value_error_differrent_dtyes(): - fit_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transform_df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer = PandasTypeSelector(exclude=["category"]).fit(fit_df) +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_value_error_differrent_dtyes(frame_func): + fit_df = frame_func({"a": [1, 2, 3], "b": [4, 5, 
6]}) + transform_df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) + transformer = TypeSelector(exclude=["category"]).fit(fit_df) with pytest.raises(ValueError): transformer.transform(transform_df) -def test_get_feature_names(): - df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer_number = PandasTypeSelector(include="number").fit(df) +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_get_feature_names(frame_func): + df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) + transformer_number = TypeSelector(include="number").fit(df) assert transformer_number.get_feature_names() == ["a"] - transformer_number = PandasTypeSelector(include="object").fit(df) + if frame_func is pd.DataFrame: + transformer_number = TypeSelector(include="object").fit(df) + else: + transformer_number = TypeSelector(include="string").fit(df) assert transformer_number.get_feature_names() == ["b"] +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_get_feature_names_deprecated(frame_func): + df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) + with pytest.deprecated_call(match="Please use `from sklego.preprocessing import TypeSelector`"): + transformer_number = PandasTypeSelector(include="number").fit(df) + assert transformer_number.get_feature_names() == ["a"] + + def test_value_error_empty(random_xy_dataset_regr): - transformer = PandasTypeSelector(exclude=["number"]) + transformer = TypeSelector(exclude=["number"]) X, y = random_xy_dataset_regr X = pd.DataFrame(X) @@ -70,11 +86,11 @@ def test_value_error_empty(random_xy_dataset_regr): def test_value_error_inequal(random_xy_dataset_regr): - transformer = PandasTypeSelector(include=["number"]) + transformer = TypeSelector(include=["number"]) X, y = random_xy_dataset_regr X = pd.DataFrame(X) if X.shape[0] > 0: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Column dtypes were not equal during fit and transform"): transformer.fit(X) 
# Remove column to create error transformer.transform(X.iloc[:, :-1])