From 2697b2d2c68d2a4eae71c4c9742dbd9a7817fc14 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 May 2024 14:17:43 +0100 Subject: [PATCH 1/9] make pandas dtype selector df-agnostic --- sklego/preprocessing/pandastransformers.py | 123 +++++++++++++----- .../test_pandastypeselector.py | 28 ++-- 2 files changed, 111 insertions(+), 40 deletions(-) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 18af9b279..da19cb2c0 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -1,11 +1,61 @@ import narwhals as nw -import pandas as pd +from narwhals.dependencies import get_pandas from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted from sklego.common import as_list +def _nw_match_dtype(dtype, selection): + if selection == "number": + return any( + dtype == x + for x in ( + nw.Int64, + nw.Int32, + nw.Int16, + nw.Int8, + nw.UInt64, + nw.UInt32, + nw.UInt16, + nw.UInt8, + nw.Float64, + nw.Float32, + ) + ) + if selection == "bool": + return dtype == nw.Boolean + if selection == "string": + return dtype == nw.String + if selection == "category": + return dtype == nw.Categorical + msg = f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}" + raise ValueError(msg) + + +def _nw_select_dtypes(df, include: str | list[str], exclude: str | list[str]): + feature_names = [] + if isinstance(include, str): + include = [include] + if isinstance(exclude, str): + exclude = [exclude] + for name, dtype in df.schema.items(): + if include and exclude: + if any(_nw_match_dtype(dtype, _include) for _include in include) and not any( + _nw_match_dtype(dtype, _exclude) for _exclude in exclude + ): + feature_names.append(name) + elif include: + if any(_nw_match_dtype(dtype, _include) for _include in include): + feature_names.append(name) + elif exclude: + if not any(_nw_match_dtype(dtype, _exclude) for _exclude in exclude): + feature_names.append(name) + else: + raise ValueError("Must provide at least one of `include` or `exclude`") + return df.select(feature_names) + + class ColumnDropper(BaseEstimator, TransformerMixin): """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. @@ -173,12 +223,18 @@ def _check_column_names(self, X): class PandasTypeSelector(BaseEstimator, TransformerMixin): - """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type. + """The `PandasTypeSelector` transformer allows to select columns in a DataFrame based on their type. Can be useful in a sklearn Pipeline. - It uses - [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html) - method. + - For pandas, it uses + [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html) + method. + - For non-pandas dataframes (e.g. Polars), the following inputs are allowed: + + - 'number' + - 'string' + - 'bool' + - 'category' Parameters ---------- @@ -191,7 +247,7 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin): ---------- feature_names_ : list[str] The names of the features to keep during transform. - X_dtypes_ : pd.Series + X_dtypes_ : Series | dict[str, DType] The dtypes of the columns in the input DataFrame. !!! warning @@ -235,9 +291,9 @@ def fit(self, X, y=None): Parameters ---------- - X : pd.DataFrame + X : DataFrame The data on which we apply the column selection. - y : pd.Series, default=None + y : Series, default=None Ignored, present for compatibility. Returns @@ -248,13 +304,17 @@ def fit(self, X, y=None): Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame. ValueError If provided type(s) results in empty dataframe. """ - self._check_X_for_type(X) - self.X_dtypes_ = X.dtypes - self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns) + if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame): + self.X_dtypes_ = X.dtypes + self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns) + else: + X = nw.from_native(X) + self.X_dtypes_ = X.schema + self.feature_names_ = _nw_select_dtypes(X, include=self.include, exclude=self.exclude).columns if len(self.feature_names_) == 0: raise ValueError("Provided type(s) results in empty dataframe") @@ -266,49 +326,52 @@ def get_feature_names(self, *args, **kwargs): return self.feature_names_ def transform(self, X): - """Returns a pandas DataFrame with columns (de)selected based on their dtype. + """Returns a DataFrame with columns (de)selected based on their dtype. Parameters ---------- - X : pd.DataFrame + X : DataFrame The data to select dtype for. Returns ------- - pd.DataFrame + DataFrame The data with the specified columns selected. Raises ------ TypeError - If `X` is not a `pd.DataFrame` object. + If `X` is not a supported DataFrame. ValueError If column dtypes were not equal during fit and transform. """ check_is_fitted(self, ["X_dtypes_", "feature_names_"]) - try: - if (self.X_dtypes_ != X.dtypes).any(): + if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame): + try: + if (self.X_dtypes_ != X.dtypes).any(): + raise ValueError( + f"Column dtypes were not equal during fit and transform. Fit types: \n" + f"{self.X_dtypes_}\n" + f"transform: \n" + f"{X.dtypes}" + ) + except ValueError as e: + raise ValueError("Columns were not equal during fit and transform") from e + transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude) + else: + X = nw.from_native(X) + if self.X_dtypes_ != X.schema: raise ValueError( f"Column dtypes were not equal during fit and transform. Fit types: \n" f"{self.X_dtypes_}\n" f"transform: \n" - f"{X.dtypes}" + f"{X.schema}" ) - except ValueError as e: - raise ValueError("Columns were not equal during fit and transform") from e - - self._check_X_for_type(X) - transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude) + transformed_df = _nw_select_dtypes(X, include=self.include, exclude=self.exclude) return transformed_df - @staticmethod - def _check_X_for_type(X): - """Checks if input of the Selector is of the required dtype""" - if not isinstance(X, pd.DataFrame): - raise TypeError("Provided variable X is not of type pandas.DataFrame") - class ColumnSelector(BaseEstimator, TransformerMixin): """The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name. diff --git a/tests/test_preprocessing/test_pandastypeselector.py b/tests/test_preprocessing/test_pandastypeselector.py index 1ec858d50..1c1ce9a2b 100644 --- a/tests/test_preprocessing/test_pandastypeselector.py +++ b/tests/test_preprocessing/test_pandastypeselector.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import polars as pl import pytest from sklego.preprocessing import PandasTypeSelector @@ -9,16 +10,18 @@ @pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func) -def test_len_regression(transformer, random_xy_dataset_regr): +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_len_regression(transformer, random_xy_dataset_regr, frame_func): X, y = random_xy_dataset_regr - X = pd.DataFrame(X) + X = frame_func(X) assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0] @pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func) -def test_len_classification(transformer, random_xy_dataset_clf): +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_len_classification(transformer, random_xy_dataset_clf, frame_func): X, y = random_xy_dataset_clf - X = pd.DataFrame(X) + X = frame_func(X) assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0] @@ -42,21 +45,26 @@ def test_get_params_np(include, exclude): assert transformer.get_params() == {"include": include, "exclude": exclude} -def test_value_error_differrent_dtyes(): - fit_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - transform_df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_value_error_differrent_dtyes(frame_func): + fit_df = frame_func({"a": [1, 2, 3], "b": [4, 5, 6]}) + transform_df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) transformer = PandasTypeSelector(exclude=["category"]).fit(fit_df) with pytest.raises(ValueError): transformer.transform(transform_df) -def test_get_feature_names(): - df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]}) +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_get_feature_names(frame_func): + df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) transformer_number = PandasTypeSelector(include="number").fit(df) assert transformer_number.get_feature_names() == ["a"] - transformer_number = PandasTypeSelector(include="object").fit(df) + if frame_func is pd.DataFrame: + transformer_number = PandasTypeSelector(include="object").fit(df) + else: + transformer_number = PandasTypeSelector(include="string").fit(df) assert transformer_number.get_feature_names() == ["b"] From d2e703cb0a330ab82309b84ab19b53c0915a40df Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 May 2024 15:37:01 +0100 Subject: [PATCH 2/9] bump version --- pyproject.toml | 2 +- sklego/preprocessing/pandastransformers.py | 27 ++++++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b3f2354e1..1102f82a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "scikit-lego" -version = "0.8.2" +version = "0.8.13" description="A collection of lego bricks for scikit-learn pipelines" license = {file = "LICENSE"} diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index da19cb2c0..bfd382857 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -8,20 +8,17 @@ def _nw_match_dtype(dtype, selection): if selection == "number": - return any( - dtype == x - for x in ( - nw.Int64, - nw.Int32, - nw.Int16, - nw.Int8, - nw.UInt64, - nw.UInt32, - nw.UInt16, - nw.UInt8, - nw.Float64, - nw.Float32, - ) + return dtype in ( + nw.Int64, + nw.Int32, + nw.Int16, + nw.Int8, + nw.UInt64, + nw.UInt32, + nw.UInt16, + nw.UInt8, + nw.Float64, + nw.Float32, ) if selection == "bool": return dtype == nw.Boolean @@ -29,7 +26,7 @@ def _nw_match_dtype(dtype, selection): return dtype == nw.String if selection == "category": return dtype == nw.Categorical - msg = f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}" + msg = f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}, which is not (yet!) supported." raise ValueError(msg) From d96e427a45f452215ed49c0a9b30752e49e6e82c Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 May 2024 16:26:20 +0100 Subject: [PATCH 3/9] 3.8 compat --- sklego/preprocessing/pandastransformers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index bfd382857..6c98d9b93 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import narwhals as nw from narwhals.dependencies import get_pandas from sklearn.base import BaseEstimator, TransformerMixin From 4f6b1ea3b99bc533051b3c04d30abb4bbf01fac1 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 15 May 2024 11:57:07 +0100 Subject: [PATCH 4/9] Update sklego/preprocessing/pandastransformers.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- sklego/preprocessing/pandastransformers.py | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 6c98d9b93..dd1449c09 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -33,25 +33,23 @@ def _nw_match_dtype(dtype, selection): def _nw_select_dtypes(df, include: str | list[str], exclude: str | list[str]): - feature_names = [] + if not include and not exclude: + raise ValueError("Must provide at least one of `include` or `exclude`") + if isinstance(include, str): include = [include] if isinstance(exclude, str): exclude = [exclude] - for name, dtype in df.schema.items(): - if include and exclude: - if any(_nw_match_dtype(dtype, _include) for _include in include) and not any( - _nw_match_dtype(dtype, _exclude) for _exclude in exclude - ): - feature_names.append(name) - elif include: - if any(_nw_match_dtype(dtype, _include) for _include in include): - feature_names.append(name) - elif exclude: - if not any(_nw_match_dtype(dtype, _exclude) for _exclude in exclude): - feature_names.append(name) - else: - raise ValueError("Must provide at least one of `include` or `exclude`") + + include = include or ["string", "number", "bool", "category"] + exclude = exclude or [] + + feature_names = [ + name + for name, dtype in df.schema.items() + if any(_nw_match_dtype(dtype, _include) for _include in include) + and not any(_nw_match_dtype(dtype, _exclude) for _exclude in exclude) + ] return df.select(feature_names) From 243f0a5c4a9530ca29f9bff5338bfdd90b95b33f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 May 2024 11:58:52 +0100 Subject: [PATCH 5/9] fixup pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1102f82a4..1f1a7a2a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "scikit-lego" -version = "0.8.13" +version = "0.8.2" description="A collection of lego bricks for scikit-learn pipelines" license = {file = "LICENSE"} @@ -20,7 +20,7 @@ maintainers = [ ] dependencies = [ - "narwhals>=0.8.12", + "narwhals>=0.8.13", "pandas>=1.1.5", "scikit-learn>=1.0", "importlib-metadata >= 1.0; python_version < '3.8'", From a5334cc9a9b934392e4c3a4c2d77e01d87745acd Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 May 2024 12:03:07 +0100 Subject: [PATCH 6/9] unify (and test!) error message --- sklego/preprocessing/pandastransformers.py | 2 +- tests/test_preprocessing/test_pandastypeselector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index dd1449c09..5fd240c33 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -354,7 +354,7 @@ def transform(self, X): f"{X.dtypes}" ) except ValueError as e: - raise ValueError("Columns were not equal during fit and transform") from e + raise ValueError("Column dtypes were not equal during fit and transform") from e transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude) else: X = nw.from_native(X) diff --git a/tests/test_preprocessing/test_pandastypeselector.py b/tests/test_preprocessing/test_pandastypeselector.py index 1c1ce9a2b..fa314d21c 100644 --- a/tests/test_preprocessing/test_pandastypeselector.py +++ b/tests/test_preprocessing/test_pandastypeselector.py @@ -82,7 +82,7 @@ def test_value_error_inequal(random_xy_dataset_regr): X, y = random_xy_dataset_regr X = pd.DataFrame(X) if X.shape[0] > 0: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Column dtypes were not equal during fit and transform"): transformer.fit(X) # Remove column to create error transformer.transform(X.iloc[:, :-1]) From 3ad9a106fcac8682c64b36761cf4baa41addc51d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 May 2024 13:18:18 +0100 Subject: [PATCH 7/9] deprecate --- docs/api/preprocessing.md | 5 +++ docs/contribution.md | 4 +-- readme.md | 2 +- sklego/preprocessing/__init__.py | 3 +- sklego/preprocessing/pandastransformers.py | 33 +++++++++++++++---- .../test_pandastypeselector.py | 30 ++++++++++------- 6 files changed, 55 insertions(+), 22 deletions(-) diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md index 66c510160..b27c971cb 100644 --- a/docs/api/preprocessing.md +++ b/docs/api/preprocessing.md @@ -64,3 +64,8 @@ options: show_root_full_path: true show_root_heading: true + +:::sklego.preprocessing.pandastransformers.TypeSelector + options: + show_root_full_path: true + show_root_heading: true diff --git a/docs/contribution.md b/docs/contribution.md index 8ef139619..3dcb1d856 100644 --- a/docs/contribution.md +++ b/docs/contribution.md @@ -27,7 +27,7 @@ from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin from sklearn.utils import check_array, check_X_y from sklearn.utils.validation import FLOAT_DTYPES, check_random_state, check_is_fitted -class PandasTypeSelector(BaseEstimator, TransformerMixin): +class TypeSelector(BaseEstimator, TransformerMixin): TYPES = {'number', 'category', 'float', 'int', 'object', 'datetime', 'timedelta'} """ Select columns in a pandas dataframe based on their dtype @@ -174,7 +174,7 @@ When a new feature is introduced, it should be documented, and typically there a - [x] A user guide in the `docs/user-guide/` folder. - [x] A python script in the `docs/_scripts/` folder to generate plots and code snippets (see [next section](#working-with-pymdown-snippets-extension)) - [x] Relevant static files, such as images, plots, tables and html's, should be saved in the `docs/_static/` folder. -- [x] Edit the `mkdocs.yaml` file to include the new pages in the navigation. +- [x] Edit the `mkdocs.yaml` file to include the new pages in the navigation. ### Working with pymdown snippets extension diff --git a/readme.md b/readme.md index fbd570456..a10095cfa 100644 --- a/readme.md +++ b/readme.md @@ -120,7 +120,7 @@ Here's a list of features that this library currently offers: - `sklego.preprocessing.InformationFilter` transformer that can de-correlate features - `sklego.preprocessing.IdentityTransformer` returns the same data, allows for concatenating pipelines - `sklego.preprocessing.OrthogonalTransformer` makes all features linearly independent -- `sklego.preprocessing.PandasTypeSelector` selects columns based on pandas type +- `sklego.preprocessing.TypeSelector` selects columns based on pandas type - `sklego.preprocessing.RandomAdder` adds randomness in training - `sklego.preprocessing.RepeatingBasisFunction` repeating feature engineering, useful for timeseries - `sklego.preprocessing.DictMapper` assign numeric values on categorical columns diff --git a/sklego/preprocessing/__init__.py b/sklego/preprocessing/__init__.py index 644ada48d..bd068a397 100644 --- a/sklego/preprocessing/__init__.py +++ b/sklego/preprocessing/__init__.py @@ -10,6 +10,7 @@ "OrthogonalTransformer", "OutlierRemover", "PandasTypeSelector", + "TypeSelector", "RandomAdder", "RepeatingBasisFunction", ] @@ -20,7 +21,7 @@ from sklego.preprocessing.identitytransformer import IdentityTransformer from sklego.preprocessing.intervalencoder import IntervalEncoder from sklego.preprocessing.outlier_remover import OutlierRemover -from sklego.preprocessing.pandastransformers import ColumnDropper, ColumnSelector, PandasTypeSelector +from sklego.preprocessing.pandastransformers import ColumnDropper, ColumnSelector, PandasTypeSelector, TypeSelector from sklego.preprocessing.projections import InformationFilter, OrthogonalTransformer from sklego.preprocessing.randomadder import RandomAdder from sklego.preprocessing.repeatingbasis import RepeatingBasisFunction diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py index 5fd240c33..cc965dc5f 100644 --- a/sklego/preprocessing/pandastransformers.py +++ b/sklego/preprocessing/pandastransformers.py @@ -1,5 +1,7 @@ from __future__ import annotations +import warnings + import narwhals as nw from narwhals.dependencies import get_pandas from sklearn.base import BaseEstimator, TransformerMixin @@ -219,20 +221,22 @@ def _check_column_names(self, X): raise KeyError(f"{list(non_existent_columns)} column(s) not in DataFrame") -class PandasTypeSelector(BaseEstimator, TransformerMixin): - """The `PandasTypeSelector` transformer allows to select columns in a DataFrame based on their type. +class TypeSelector(BaseEstimator, TransformerMixin): + """The `TypeSelector` transformer allows to select columns in a DataFrame based on their type. Can be useful in a sklearn Pipeline. - For pandas, it uses [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html) method. - - For non-pandas dataframes (e.g. Polars), the following inputs are allowed: + - For non-pandas dataframes (e.g. Polars), the following inputs are allowed: - 'number' - 'string' - 'bool' - 'category' + !!! info "New in version 0.9.0" + Parameters ---------- include : scalar or list-like @@ -255,7 +259,7 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin): -------- ```py import pandas as pd - from sklego.preprocessing import PandasTypeSelector + from sklego.preprocessing import TypeSelector df = pd.DataFrame({ "name": ["Swen", "Victor", "Alex"], @@ -264,14 +268,14 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin): }) #Excluding single column - PandasTypeSelector(exclude="int64").fit_transform(df) + TypeSelector(exclude="int64").fit_transform(df) # name length #0 Swen 1.82 #1 Victor 1.85 #2 Alex 1.80 #Including multiple columns - PandasTypeSelector(include=["int64", "object"]).fit_transform(df) + TypeSelector(include=["int64", "object"]).fit_transform(df) # name shoesize #0 Swen 42 #1 Victor 44 @@ -295,7 +299,7 @@ def fit(self, X, y=None): Returns ------- - self : PandasTypeSelector + self : TypeSelector The fitted transformer. Raises @@ -370,6 +374,21 @@ def transform(self, X): return transformed_df +class PandasTypeSelector(TypeSelector): + """ + !!! warning "Deprecated since version 0.9.0, please use TypeSelector instead" + """ + + def __init__(self, include=None, exclude=None): + warnings.warn( + "PandasTypeSelector is deprecated and will be removed in a future version. " + "Please use `from sklego.preprocessing import TypeSelector` instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(include=include, exclude=exclude) + + class ColumnSelector(BaseEstimator, TransformerMixin): """The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name. Can be useful in a sklearn Pipeline. diff --git a/tests/test_preprocessing/test_pandastypeselector.py b/tests/test_preprocessing/test_pandastypeselector.py index fa314d21c..71644dc24 100644 --- a/tests/test_preprocessing/test_pandastypeselector.py +++ b/tests/test_preprocessing/test_pandastypeselector.py @@ -5,11 +5,11 @@ import polars as pl import pytest -from sklego.preprocessing import PandasTypeSelector +from sklego.preprocessing import PandasTypeSelector, TypeSelector from tests.conftest import id_func -@pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func) +@pytest.mark.parametrize("transformer", [TypeSelector(include=["number"])], ids=id_func) @pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) def test_len_regression(transformer, random_xy_dataset_regr, frame_func): X, y = random_xy_dataset_regr @@ -17,7 +17,7 @@ def test_len_regression(transformer, random_xy_dataset_regr, frame_func): assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0] -@pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func) +@pytest.mark.parametrize("transformer", [TypeSelector(include=["number"])], ids=id_func) @pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) def test_len_classification(transformer, random_xy_dataset_clf, frame_func): X, y = random_xy_dataset_clf @@ -30,7 +30,7 @@ def test_len_classification(transformer, random_xy_dataset_clf, frame_func): [_ for _ in it.combinations(["number", "datetime", "timedelta", "category", "datetimetz", None], 2)], ) def test_get_params_str(include, exclude): - transformer = PandasTypeSelector(include=include, exclude=exclude) + transformer = TypeSelector(include=include, exclude=exclude) assert transformer.get_params() == {"include": include, "exclude": exclude} @@ -40,7 +40,7 @@ def test_get_params_str(include, exclude): [_ for _ in it.combinations([np.int64, np.float64, np.datetime64, np.timedelta64], 2)], ) def test_get_params_np(include, exclude): - transformer = PandasTypeSelector(include=include, exclude=exclude) + transformer = TypeSelector(include=include, exclude=exclude) assert transformer.get_params() == {"include": include, "exclude": exclude} @@ -49,7 +49,7 @@ def test_get_params_np(include, exclude): def test_value_error_differrent_dtyes(frame_func): fit_df = frame_func({"a": [1, 2, 3], "b": [4, 5, 6]}) transform_df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer = PandasTypeSelector(exclude=["category"]).fit(fit_df) + transformer = TypeSelector(exclude=["category"]).fit(fit_df) with pytest.raises(ValueError): transformer.transform(transform_df) @@ -58,18 +58,26 @@ def test_value_error_differrent_dtyes(frame_func): @pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) def test_get_feature_names(frame_func): df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) - transformer_number = PandasTypeSelector(include="number").fit(df) + transformer_number = TypeSelector(include="number").fit(df) assert transformer_number.get_feature_names() == ["a"] if frame_func is pd.DataFrame: - transformer_number = PandasTypeSelector(include="object").fit(df) + transformer_number = TypeSelector(include="object").fit(df) else: - transformer_number = PandasTypeSelector(include="string").fit(df) + transformer_number = TypeSelector(include="string").fit(df) assert transformer_number.get_feature_names() == ["b"] +@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame]) +def test_get_feature_names_deprecated(frame_func): + df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]}) + with pytest.deprecated_call(match="Please use `from sklego.preprocessing import TypeSelector`"): + transformer_number = PandasTypeSelector(include="number").fit(df) + assert transformer_number.get_feature_names() == ["a"] + + def test_value_error_empty(random_xy_dataset_regr): - transformer = PandasTypeSelector(exclude=["number"]) + transformer = TypeSelector(exclude=["number"]) X, y = random_xy_dataset_regr X = pd.DataFrame(X) @@ -78,7 +86,7 @@ def test_value_error_empty(random_xy_dataset_regr): def test_value_error_inequal(random_xy_dataset_regr): - transformer = PandasTypeSelector(include=["number"]) + transformer = TypeSelector(include=["number"]) X, y = random_xy_dataset_regr X = pd.DataFrame(X) if X.shape[0] > 0: From 070e2fecb8510014d1e943cf4a4f55a574459654 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 May 2024 13:31:35 +0100 Subject: [PATCH 8/9] update readme --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index a10095cfa..21bfe1eed 100644 --- a/readme.md +++ b/readme.md @@ -120,7 +120,7 @@ Here's a list of features that this library currently offers: - `sklego.preprocessing.InformationFilter` transformer that can de-correlate features - `sklego.preprocessing.IdentityTransformer` returns the same data, allows for concatenating pipelines - `sklego.preprocessing.OrthogonalTransformer` makes all features linearly independent -- `sklego.preprocessing.TypeSelector` selects columns based on pandas type +- `sklego.preprocessing.TypeSelector` selects columns based on type - `sklego.preprocessing.RandomAdder` adds randomness in training - `sklego.preprocessing.RepeatingBasisFunction` repeating feature engineering, useful for timeseries - `sklego.preprocessing.DictMapper` assign numeric values on categorical columns From d5f0413abbc4fa19de29e61007b2e9e5ae0f364d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 May 2024 13:32:40 +0100 Subject: [PATCH 9/9] undo contribution.md change --- docs/contribution.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contribution.md b/docs/contribution.md index 3dcb1d856..5ead256da 100644 --- a/docs/contribution.md +++ b/docs/contribution.md @@ -27,7 +27,7 @@ from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin from sklearn.utils import check_array, check_X_y from sklearn.utils.validation import FLOAT_DTYPES, check_random_state, check_is_fitted -class TypeSelector(BaseEstimator, TransformerMixin): +class PandasTypeSelector(BaseEstimator, TransformerMixin): TYPES = {'number', 'category', 'float', 'int', 'object', 'datetime', 'timedelta'} """ Select columns in a pandas dataframe based on their dtype