koaning · FBruzzesi · May 18, 2024 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scikit-lego"
-version = "0.8.2"
+version = "0.8.13"
 description="A collection of lego bricks for scikit-learn pipelines"
 
 license = {file = "LICENSE"}

diff --git a/sklego/preprocessing/pandastransformers.py b/sklego/preprocessing/pandastransformers.py
@@ -1,11 +1,60 @@
+from __future__ import annotations
+
 import narwhals as nw
-import pandas as pd
+from narwhals.dependencies import get_pandas
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
 from sklego.common import as_list
 
 
+def _nw_match_dtype(dtype, selection):
+    """Tell whether a narwhals `dtype` belongs to the dtype family named by `selection`.
+
+    `selection` must be 'number', 'bool', 'string' or 'category'; anything else raises `ValueError`.
+    """
+    # Families backed by a single narwhals dtype.
+    if selection == "bool":
+        return dtype == nw.Boolean
+    if selection == "string":
+        return dtype == nw.String
+    if selection == "category":
+        return dtype == nw.Categorical
+
+    if selection != "number":
+        msg = f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}, which is not (yet!) supported."
+        raise ValueError(msg)
+
+    # "number" spans every signed and unsigned integer width plus both float widths.
+    signed_ints = (nw.Int8, nw.Int16, nw.Int32, nw.Int64)
+    unsigned_ints = (nw.UInt8, nw.UInt16, nw.UInt32, nw.UInt64)
+    floats = (nw.Float32, nw.Float64)
+    return dtype in (*signed_ints, *unsigned_ints, *floats)
+
+
+def _nw_select_dtypes(df, include: str | list[str], exclude: str | list[str]):
+    """Select the columns of a narwhals frame whose dtype matches `include` and not `exclude`.
+
+    Raises `ValueError` when neither `include` nor `exclude` is given, even if `df` has no columns.
+    """
+    include = [include] if isinstance(include, str) else include
+    exclude = [exclude] if isinstance(exclude, str) else exclude
+    if not include and not exclude:
+        # Validate up front: inside the column loop this check never ran for an empty schema.
+        raise ValueError("Must provide at least one of `include` or `exclude`")
+
+    def _matches(dtype, selections):
+        return any(_nw_match_dtype(dtype, selection) for selection in selections)
+
+    feature_names = [
+        name
+        for name, dtype in df.schema.items()
+        # A missing `include` keeps every column; a missing `exclude` drops none.
+        if (not include or _matches(dtype, include)) and not (exclude and _matches(dtype, exclude))
+    ]
+    return df.select(feature_names)
+
+
 class ColumnDropper(BaseEstimator, TransformerMixin):
     """The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name.
     Can be useful in a sklearn Pipeline.
@@ -173,12 +222,18 @@ def _check_column_names(self, X):
 
 
 class PandasTypeSelector(BaseEstimator, TransformerMixin):
-    """The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type.
+    """The `PandasTypeSelector` transformer allows selecting columns in a DataFrame based on their type.
     Can be useful in a sklearn Pipeline.
 
-    It uses
-    [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)
-    method.
+    - For pandas, it uses
+      [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)
+      method.
+    - For non-pandas dataframes (e.g. Polars), the following inputs are allowed:
+
+      - 'number'
+      - 'string'
+      - 'bool'
+      - 'category'
 
     Parameters
     ----------
@@ -191,7 +246,7 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin):
     ----------
     feature_names_ : list[str]
         The names of the features to keep during transform.
-    X_dtypes_ : pd.Series
+    X_dtypes_ : Series | dict[str, DType]
         The dtypes of the columns in the input DataFrame.
 
     !!! warning
@@ -235,9 +290,9 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data on which we apply the column selection.
-        y : pd.Series, default=None
+        y : Series, default=None
             Ignored, present for compatibility.
 
         Returns
@@ -248,13 +303,17 @@ def fit(self, X, y=None):
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame.
         ValueError
             If provided type(s) results in empty dataframe.
         """
-        self._check_X_for_type(X)
-        self.X_dtypes_ = X.dtypes
-        self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns)
+        if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame):
+            self.X_dtypes_ = X.dtypes
+            self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns)
+        else:
+            X = nw.from_native(X)
+            self.X_dtypes_ = X.schema
+            self.feature_names_ = _nw_select_dtypes(X, include=self.include, exclude=self.exclude).columns
 
         if len(self.feature_names_) == 0:
             raise ValueError("Provided type(s) results in empty dataframe")
@@ -266,49 +325,52 @@ def get_feature_names(self, *args, **kwargs):
         return self.feature_names_
 
     def transform(self, X):
-        """Returns a pandas DataFrame with columns (de)selected based on their dtype.
+        """Returns a DataFrame with columns (de)selected based on their dtype.
 
         Parameters
         ----------
-        X : pd.DataFrame
+        X : DataFrame
             The data to select dtype for.
 
         Returns
         -------
-        pd.DataFrame
+        DataFrame
             The data with the specified columns selected.
 
         Raises
         ------
         TypeError
-            If `X` is not a `pd.DataFrame` object.
+            If `X` is not a supported DataFrame.
         ValueError
             If column dtypes were not equal during fit and transform.
         """
         check_is_fitted(self, ["X_dtypes_", "feature_names_"])
 
-        try:
-            if (self.X_dtypes_ != X.dtypes).any():
+        if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame):
+            try:
+                if (self.X_dtypes_ != X.dtypes).any():
+                    raise ValueError(
+                        f"Column dtypes were not equal during fit and transform. Fit types: \n"
+                        f"{self.X_dtypes_}\n"
+                        f"transform: \n"
+                        f"{X.dtypes}"
+                    )
+            except ValueError as e:
+                raise ValueError("Columns were not equal during fit and transform") from e
+            transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude)
+        else:
+            X = nw.from_native(X)
+            if self.X_dtypes_ != X.schema:
                 raise ValueError(
                     f"Column dtypes were not equal during fit and transform. Fit types: \n"
                     f"{self.X_dtypes_}\n"
                     f"transform: \n"
-                    f"{X.dtypes}"
+                    f"{X.schema}"
                 )
-        except ValueError as e:
-            raise ValueError("Columns were not equal during fit and transform") from e
-
-        self._check_X_for_type(X)
-        transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude)
+            transformed_df = nw.to_native(_nw_select_dtypes(X, include=self.include, exclude=self.exclude))
 
         return transformed_df
 
-    @staticmethod
-    def _check_X_for_type(X):
-        """Checks if input of the Selector is of the required dtype"""
-        if not isinstance(X, pd.DataFrame):
-            raise TypeError("Provided variable X is not of type pandas.DataFrame")
-
 
 class ColumnSelector(BaseEstimator, TransformerMixin):
     """The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name.

diff --git a/tests/test_preprocessing/test_pandastypeselector.py b/tests/test_preprocessing/test_pandastypeselector.py
@@ -2,23 +2,26 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 import pytest
 
 from sklego.preprocessing import PandasTypeSelector
 from tests.conftest import id_func
 
 
 @pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func)
-def test_len_regression(transformer, random_xy_dataset_regr):
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_len_regression(transformer, random_xy_dataset_regr, frame_func):
     X, y = random_xy_dataset_regr
-    X = pd.DataFrame(X)
+    X = frame_func(X)
     assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0]
 
 
 @pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func)
-def test_len_classification(transformer, random_xy_dataset_clf):
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_len_classification(transformer, random_xy_dataset_clf, frame_func):
     X, y = random_xy_dataset_clf
-    X = pd.DataFrame(X)
+    X = frame_func(X)
     assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0]
 
 
@@ -42,21 +45,26 @@ def test_get_params_np(include, exclude):
     assert transformer.get_params() == {"include": include, "exclude": exclude}
 
 
-def test_value_error_differrent_dtyes():
-    fit_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
-    transform_df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_value_error_differrent_dtyes(frame_func):
+    fit_df = frame_func({"a": [1, 2, 3], "b": [4, 5, 6]})
+    transform_df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]})
     transformer = PandasTypeSelector(exclude=["category"]).fit(fit_df)
 
     with pytest.raises(ValueError):
         transformer.transform(transform_df)
 
 
-def test_get_feature_names():
-    df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_get_feature_names(frame_func):
+    df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]})
     transformer_number = PandasTypeSelector(include="number").fit(df)
     assert transformer_number.get_feature_names() == ["a"]
 
-    transformer_number = PandasTypeSelector(include="object").fit(df)
+    if frame_func is pd.DataFrame:
+        transformer_number = PandasTypeSelector(include="object").fit(df)
+    else:
+        transformer_number = PandasTypeSelector(include="string").fit(df)
     assert transformer_number.get_feature_names() == ["b"]