Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make PandasTypeSelector selector dataframe-agnostic #670

Merged
merged 9 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "scikit-lego"
version = "0.8.2"
version = "0.8.13"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was line 23 the intended target?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah i probably shouldn't make commits in a hurry whilst on a train sorry

description="A collection of lego bricks for scikit-learn pipelines"

license = {file = "LICENSE"}
Expand Down
122 changes: 92 additions & 30 deletions sklego/preprocessing/pandastransformers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,60 @@
from __future__ import annotations

import narwhals as nw
import pandas as pd
from narwhals.dependencies import get_pandas
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from sklego.common import as_list


def _nw_match_dtype(dtype, selection):
    """Report whether a Narwhals dtype belongs to the family named by `selection`.

    Parameters
    ----------
    dtype
        The dtype to classify; it is compared against Narwhals dtype classes.
    selection : str
        One of `'number'`, `'bool'`, `'string'` or `'category'`.

    Returns
    -------
    bool
        Result of comparing `dtype` with the dtype(s) associated with `selection`.

    Raises
    ------
    ValueError
        If `selection` is not one of the four supported keywords.
    """
    if selection == "number":
        # Signed and unsigned integers first, then floats: together they form "number".
        signed_ints = (nw.Int64, nw.Int32, nw.Int16, nw.Int8)
        unsigned_ints = (nw.UInt64, nw.UInt32, nw.UInt16, nw.UInt8)
        floats = (nw.Float64, nw.Float32)
        return dtype in signed_ints + unsigned_ints + floats
    elif selection == "bool":
        return dtype == nw.Boolean
    elif selection == "string":
        return dtype == nw.String
    elif selection == "category":
        return dtype == nw.Categorical

    # Nothing matched: the keyword is outside the supported set.
    raise ValueError(
        f"Expected {{'number', 'bool', 'string', 'category'}}, got: {selection}, which is not (yet!) supported."
    )


def _nw_select_dtypes(df, include: str | list[str], exclude: str | list[str]):
    """Select the columns of `df` whose dtype matches `include` and does not match `exclude`.

    Parameters
    ----------
    df
        Frame exposing a `schema` mapping (column name -> dtype) and a `select` method;
        callers in this module pass the result of `nw.from_native`.
    include : str | list[str]
        Keyword(s) understood by `_nw_match_dtype`; a column is kept only if its dtype
        matches at least one of them. A falsy value (`None`, empty list) disables this filter.
    exclude : str | list[str]
        Keyword(s) understood by `_nw_match_dtype`; a column is dropped if its dtype
        matches any of them. A falsy value disables this filter.

    Returns
    -------
    Whatever `df.select` returns for the matching column names, kept in schema order.

    Raises
    ------
    ValueError
        If both `include` and `exclude` are empty, or if `_nw_match_dtype` rejects a keyword.
    """
    if isinstance(include, str):
        include = [include]
    if isinstance(exclude, str):
        exclude = [exclude]

    # Validate once, before looking at the schema: previously this check lived inside the
    # per-column loop, so a frame without columns silently skipped it.
    if not include and not exclude:
        raise ValueError("Must provide at least one of `include` or `exclude`")

    def _is_selected(dtype):
        # The include filter is evaluated first and short-circuits, as in the nested form.
        if include and not any(_nw_match_dtype(dtype, _include) for _include in include):
            return False
        return not (exclude and any(_nw_match_dtype(dtype, _exclude) for _exclude in exclude))

    feature_names = [name for name, dtype in df.schema.items() if _is_selected(dtype)]
    return df.select(feature_names)
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved


class ColumnDropper(BaseEstimator, TransformerMixin):
"""The `ColumnDropper` transformer allows dropping specific columns from a DataFrame by name.
Can be useful in a sklearn Pipeline.
Expand Down Expand Up @@ -173,12 +222,18 @@ def _check_column_names(self, X):


class PandasTypeSelector(BaseEstimator, TransformerMixin):
"""The `PandasTypeSelector` transformer allows to select columns in a pandas DataFrame based on their type.
"""The `PandasTypeSelector` transformer allows to select columns in a DataFrame based on their type.
Copy link
Collaborator

@FBruzzesi FBruzzesi May 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering its name, we could do the following:

class PandasTypeSelector(TypeSelector):
    """Deprecated alias of `TypeSelector`, kept so that existing code using the old name still runs.

    Instantiating it emits a `DeprecationWarning` and then initialises the instance
    exactly as `TypeSelector` would. `TypeSelector` has to be defined before this class.
    """

    def __init__(self, include=None, exclude=None):
        warn(
            "Please use `TypeSelector` instead of `PandasTypeSelector`, `PandasTypeSelector` will be deprecated in future versions",
            DeprecationWarning,
        )
        # `__init__` must return None (returning an object raises TypeError at
        # instantiation), so delegate to the parent initialiser instead of
        # returning a new `TypeSelector`.
        super().__init__(include, exclude)

and then

class TypeSelector(BaseEstimator, TransformerMixin):
    ...
    !!! info "New in version 0.9.0"

Copy link
Contributor Author

@MarcoGorelli MarcoGorelli May 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, and I think the whole pandastransformers.py module needs renaming

OK to do it all in one go in a separate PR, so that all the ones in pandastransformers.py point to the equivalent one in, say, dataframe_transformers.py?


EDIT: I noticed that this is already exported from sklego.preprocessing, and that that's the path the examples use. I've renamed and deprecated as part of this PR then

The contribution.md page still shows PandasTypeSelector, but that page already looks out-of-date anyway and probably needs a revamp - will address that separately (something about Narwhals probably needs mentioning too, as it's used internally in quite a few places)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we can rename it to have a more intuitive naming path, but as you spotted, it shouldn't matter too much as they are exported into preprocessing.

Can be useful in a sklearn Pipeline.

It uses
[pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)
method.
- For pandas, it uses
[pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)
method.
- For non-pandas dataframes (e.g. Polars), the following inputs are allowed:

- 'number'
- 'string'
- 'bool'
- 'category'

Parameters
----------
Expand All @@ -191,7 +246,7 @@ class PandasTypeSelector(BaseEstimator, TransformerMixin):
----------
feature_names_ : list[str]
The names of the features to keep during transform.
X_dtypes_ : pd.Series
X_dtypes_ : Series | dict[str, DType]
The dtypes of the columns in the input DataFrame.

!!! warning
Expand Down Expand Up @@ -235,9 +290,9 @@ def fit(self, X, y=None):

Parameters
----------
X : pd.DataFrame
X : DataFrame
The data on which we apply the column selection.
y : pd.Series, default=None
y : Series, default=None
Ignored, present for compatibility.

Returns
Expand All @@ -248,13 +303,17 @@ def fit(self, X, y=None):
Raises
------
TypeError
If `X` is not a `pd.DataFrame` object.
If `X` is not a supported DataFrame.
ValueError
If provided type(s) results in empty dataframe.
"""
self._check_X_for_type(X)
self.X_dtypes_ = X.dtypes
self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns)
if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame):
self.X_dtypes_ = X.dtypes
self.feature_names_ = list(X.select_dtypes(include=self.include, exclude=self.exclude).columns)
else:
X = nw.from_native(X)
self.X_dtypes_ = X.schema
self.feature_names_ = _nw_select_dtypes(X, include=self.include, exclude=self.exclude).columns

if len(self.feature_names_) == 0:
raise ValueError("Provided type(s) results in empty dataframe")
Expand All @@ -266,49 +325,52 @@ def get_feature_names(self, *args, **kwargs):
return self.feature_names_

def transform(self, X):
"""Returns a pandas DataFrame with columns (de)selected based on their dtype.
"""Returns a DataFrame with columns (de)selected based on their dtype.

Parameters
----------
X : pd.DataFrame
X : DataFrame
The data to select dtype for.

Returns
-------
pd.DataFrame
DataFrame
The data with the specified columns selected.

Raises
------
TypeError
If `X` is not a `pd.DataFrame` object.
If `X` is not a supported DataFrame.
ValueError
If column dtypes were not equal during fit and transform.
"""
check_is_fitted(self, ["X_dtypes_", "feature_names_"])

try:
if (self.X_dtypes_ != X.dtypes).any():
if (pd := get_pandas()) is not None and isinstance(X, pd.DataFrame):
try:
if (self.X_dtypes_ != X.dtypes).any():
raise ValueError(
f"Column dtypes were not equal during fit and transform. Fit types: \n"
f"{self.X_dtypes_}\n"
f"transform: \n"
f"{X.dtypes}"
)
except ValueError as e:
raise ValueError("Columns were not equal during fit and transform") from e
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this happen?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, the last test in tests/test_preprocessing/test_pandastypeselector.py goes there

I've unified the messages and included the error message in the test

transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude)
else:
X = nw.from_native(X)
if self.X_dtypes_ != X.schema:
raise ValueError(
f"Column dtypes were not equal during fit and transform. Fit types: \n"
f"{self.X_dtypes_}\n"
f"transform: \n"
f"{X.dtypes}"
f"{X.schema}"
)
except ValueError as e:
raise ValueError("Columns were not equal during fit and transform") from e

self._check_X_for_type(X)
transformed_df = X.select_dtypes(include=self.include, exclude=self.exclude)
transformed_df = _nw_select_dtypes(X, include=self.include, exclude=self.exclude)

return transformed_df

@staticmethod
def _check_X_for_type(X):
"""Checks if input of the Selector is of the required dtype"""
if not isinstance(X, pd.DataFrame):
raise TypeError("Provided variable X is not of type pandas.DataFrame")


class ColumnSelector(BaseEstimator, TransformerMixin):
"""The `ColumnSelector` transformer allows selecting specific columns from a DataFrame by name.
Expand Down
28 changes: 18 additions & 10 deletions tests/test_preprocessing/test_pandastypeselector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,26 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest

from sklego.preprocessing import PandasTypeSelector
from tests.conftest import id_func


@pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func)
def test_len_regression(transformer, random_xy_dataset_regr):
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_len_regression(transformer, random_xy_dataset_regr, frame_func):
X, y = random_xy_dataset_regr
X = pd.DataFrame(X)
X = frame_func(X)
assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0]


@pytest.mark.parametrize("transformer", [PandasTypeSelector(include=["number"])], ids=id_func)
def test_len_classification(transformer, random_xy_dataset_clf):
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_len_classification(transformer, random_xy_dataset_clf, frame_func):
X, y = random_xy_dataset_clf
X = pd.DataFrame(X)
X = frame_func(X)
assert transformer.fit(X, y).transform(X).shape[0] == X.shape[0]


Expand All @@ -42,21 +45,26 @@ def test_get_params_np(include, exclude):
assert transformer.get_params() == {"include": include, "exclude": exclude}


def test_value_error_differrent_dtyes():
fit_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
transform_df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_value_error_differrent_dtyes(frame_func):
fit_df = frame_func({"a": [1, 2, 3], "b": [4, 5, 6]})
transform_df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]})
transformer = PandasTypeSelector(exclude=["category"]).fit(fit_df)

with pytest.raises(ValueError):
transformer.transform(transform_df)


def test_get_feature_names():
df = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_get_feature_names(frame_func):
df = frame_func({"a": [4, 5, 6], "b": ["4", "5", "6"]})
transformer_number = PandasTypeSelector(include="number").fit(df)
assert transformer_number.get_feature_names() == ["a"]

transformer_number = PandasTypeSelector(include="object").fit(df)
if frame_func is pd.DataFrame:
transformer_number = PandasTypeSelector(include="object").fit(df)
else:
transformer_number = PandasTypeSelector(include="string").fit(df)
assert transformer_number.get_feature_names() == ["b"]


Expand Down
Loading