skrub-data · Vincent-Maladiere · Aug 31, 2023 · Jun 9, 2023 · Jun 9, 2023 · Jun 30, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -39,7 +39,9 @@ Major changes
   - scikit-learn >= 1.2.1
   - pandas >= 1.5.3 :pr:`613` by :user:`Lilian Boulard <LilianBoulard>`
 
-* Removed `requests` from the requirements. :pr:`613` by :user:`Lilian Boulard <LilianBoulard>`
+* You can now pass column-specific transformers to :class:`TableVectorizer`
+  using the `column_specific_transformers` argument.
+  :pr:`583` by :user:`Lilian Boulard <LilianBoulard>`.
 
 Minor changes
 -------------
@@ -71,6 +73,8 @@ Minor changes
 * Add `get_feature_names_out` method to :class:`MinHashEncoder`.
   :pr:`616` by :user:`Leo Grinsztajn <LeoGrin>`
 
+* Removed `requests` from the requirements. :pr:`613` by :user:`Lilian Boulard <LilianBoulard>`
+
 Before skrub: dirty_cat
 ========================
 

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
@@ -15,6 +15,7 @@
 from pandas.core.dtypes.base import ExtensionDtype
 from sklearn.base import TransformerMixin, clone
 from sklearn.compose import ColumnTransformer
+from sklearn.compose._column_transformer import _get_transformer_list
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.deprecation import deprecated
 from sklearn.utils.validation import check_is_fitted
@@ -142,9 +143,7 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se
     return ser
 
 
-OptionalTransformer = (
-    TransformerMixin | Literal["drop", "remainder", "passthrough"] | None
-)
+Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"]
 
 
 class TableVectorizer(ColumnTransformer):
@@ -223,7 +222,20 @@ class TableVectorizer(ColumnTransformer):
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
-    auto_cast : bool, optional, default=True
+    column_specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} or Transformer, list of str or int), optional
+        On top of the default column type classification (see parameters above),
+        this parameter allows you to manually specify transformers for
+        specific columns.
+        This is equivalent to using a :class:`~sklearn.compose.ColumnTransformer`
+        for assigning the column-specific transformers,
+        and passing the ``TableVectorizer`` as the ``remainder``.
+        This parameter can take two different formats, either:
+        - a list of 2-tuples (transformer, column names or indices)
+        - a list of 3-tuples (name, transformer, column names or indices)
+        In the latter format, you can specify the name of the assignment.
+        Mixing the two is not supported.
+
+    auto_cast : bool, default=True
         If set to `True`, will try to convert each column to the best possible
         data type (dtype).
 
@@ -355,6 +367,11 @@ class TableVectorizer(ColumnTransformer):
     columns_: pd.Index
     types_: dict[str, type]
     imputed_columns_: list[str]
+    low_card_cat_transformer_: Transformer
+    high_card_cat_transformer_: Transformer
+    numerical_transformer_: Transformer
+    datetime_transformer_: Transformer
+    column_specific_transformers_: list[tuple[str, Transformer, list[str | int]]]
 
     # Override required parameters
     _required_parameters = []
@@ -363,10 +380,15 @@ def __init__(
         self,
         *,
         cardinality_threshold: int = 40,
-        low_card_cat_transformer: OptionalTransformer = None,
-        high_card_cat_transformer: OptionalTransformer = None,
-        numerical_transformer: OptionalTransformer = None,
-        datetime_transformer: OptionalTransformer = None,
+        low_card_cat_transformer: Transformer | None = None,
+        high_card_cat_transformer: Transformer | None = None,
+        numerical_transformer: Transformer | None = None,
+        datetime_transformer: Transformer | None = None,
+        column_specific_transformers: list[
+            tuple[Transformer, list[str | int]]
+            | tuple[str, Transformer, list[str | int]]
+        ]
+        | None = None,
         auto_cast: bool = True,
         impute_missing: Literal["auto", "force", "skip"] = "auto",
         # The next parameters are inherited from ColumnTransformer
@@ -383,6 +405,7 @@ def __init__(
         self.high_card_cat_transformer = high_card_cat_transformer
         self.numerical_transformer = numerical_transformer
         self.datetime_transformer = datetime_transformer
+        self.column_specific_transformers = column_specific_transformers
         self.auto_cast = auto_cast
         self.impute_missing = impute_missing
 
@@ -392,13 +415,13 @@ def __init__(
         self.transformer_weights = transformer_weights
         self.verbose = verbose
 
-    def _more_tags(self):
+    def _more_tags(self) -> dict:
         """
         Used internally by sklearn to ease the estimator checks.
         """
         return {"allow_nan": [True]}
 
-    def _clone_transformers(self):
+    def _clone_transformers(self) -> None:
         """
         For each of the different transformers that can be passed,
         create the corresponding variable name with a trailing underscore,
@@ -449,6 +472,25 @@ def _clone_transformers(self):
         else:
             self.datetime_transformer_ = self.datetime_transformer
 
+        if not self.column_specific_transformers:  # None or empty list
+            self.column_specific_transformers_ = []
+        else:
+            tuple_length = len(self.column_specific_transformers[0])
+            if tuple_length == 2:
+                # Unnamed assignments, transform to named
+                named = _get_transformer_list(self.column_specific_transformers)
+            elif tuple_length == 3:
+                named = self.column_specific_transformers  # already named
+            else:
+                raise ValueError(
+                    "Expected 2- or 3-tuples in `column_specific_transformers`, "
+                    f"got tuples of length {tuple_length}."
+                )
+            self.column_specific_transformers_ = [
+                (name, clone(trf) if isinstance(trf, TransformerMixin) else trf, cols)
+                for name, trf, cols in named
+            ]
+
         # TODO: check that the provided transformers are valid
 
     def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame:
@@ -603,12 +645,23 @@ def fit_transform(self, X, y=None):
         if self.auto_cast:
             X = self._auto_cast(X)
 
+        # We will filter X to keep only the columns that are not specified
+        # explicitly by the user.
+        X_filtered = X.drop(
+            columns=[
+                # `drop` expects labels, so positional (int) indices are mapped
+                # to labels; the loop also copes with an empty assignment list.
+                X.columns[col] if isinstance(col, int) else col
+                for (_, _, columns) in self.column_specific_transformers_
+                for col in columns
+            ]
+        )
         # Select columns by dtype
-        numeric_columns = X.select_dtypes(include="number").columns.to_list()
-        categorical_columns = X.select_dtypes(
+        numeric_columns = X_filtered.select_dtypes(include="number").columns.to_list()
+        categorical_columns = X_filtered.select_dtypes(
             include=["string", "object", "category"]
         ).columns.to_list()
-        datetime_columns = X.select_dtypes(
+        datetime_columns = X_filtered.select_dtypes(
             include=["datetime", "datetimetz"]
         ).columns.to_list()
 
@@ -622,15 +675,15 @@ def fit_transform(self, X, y=None):
 
         # Next part: construct the transformers
         # Create the list of all the transformers.
-        all_transformers: list[tuple[str, OptionalTransformer, list[str]]] = [
+        all_transformers: list[tuple[str, Transformer, list[str]]] = [
             ("numeric", self.numerical_transformer_, numeric_columns),
             ("datetime", self.datetime_transformer_, datetime_columns),
             ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns),
             ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns),
+            *self.column_specific_transformers_,
         ]
-        # We will now filter this list, by keeping only the ones with:
-        # - at least one column
-        # - a valid encoder or string (filter out if None)
+        # We will now filter this list,
+        # by keeping only the ones with at least one column.
         self.transformers = []
         for trans in all_transformers:
             name, enc, cols = trans  # Unpack

diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
@@ -1,11 +1,12 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn.compose import ColumnTransformer, make_column_transformer
 from sklearn.exceptions import NotFittedError
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_is_fitted
 
-from skrub import GapEncoder, SuperVectorizer, TableVectorizer
+from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer
 from skrub._table_vectorizer import _infer_date_format
 
 
@@ -595,3 +596,98 @@ def test__infer_date_format() -> None:
     # Test with a column containing more than two date formats
     date_column = pd.Series(["2022-01-01", "01/02/2022", "20220103", "2022-Jan-04"])
     assert _infer_date_format(date_column) is None
+
+
+@pytest.mark.parametrize(
+    "column_specific_transformers",
+    [
+        (MinHashEncoder(), ["str1", "str2"]),
+        (StandardScaler(), ["float"]),
+        ("mh_cat1", MinHashEncoder(), ["cat1"]),
+    ],
+)
+def test_specifying_specific_column_transformer(column_specific_transformers) -> None:
+    X = _get_dirty_dataframe()
+
+    tv = TableVectorizer(
+        column_specific_transformers=[column_specific_transformers],
+    )
+    X_enc_tv = tv.fit_transform(X)
+
+    # For the following section, we will use a different syntax using the
+    # column transformer to assert that the internals of the TableVectorizer
+    # work as expected.
+    # To do that, we get the assignment done by the default TableVectorizer
+    # (no arguments passed), and extend it with the column specific transformer
+    # which we then pass to the ColumnTransformer.
+    # We fit_transform the ColumnTransformer, and expect the same
+    # transformation (albeit, not necessarily in the same order) as the
+    # TableVectorizer.
+    if len(column_specific_transformers) == 2:
+        # Unnamed assignment
+        column_specific_transformers: tuple[object, list[str]]
+        transformer, columns = column_specific_transformers
+        default_table_vectorizer_assignment = [
+            (transformer, columns)
+            for _, transformer, columns in TableVectorizer()
+            .fit(X.drop(columns=columns))
+            .transformers
+        ]
+        ct = make_column_transformer(
+            *default_table_vectorizer_assignment, column_specific_transformers
+        )
+        X_enc_ct = ct.fit_transform(X)
+    elif len(column_specific_transformers) == 3:
+        # Named assignment
+        column_specific_transformers: tuple[str, object, list[str]]
+        name, transformer, columns = column_specific_transformers
+        default_table_vectorizer_assignment = (
+            TableVectorizer().fit(X.drop(columns=columns)).transformers
+        )
+        # Assert the name is used in the assignment
+        assert name in tv.named_transformers_
+        ct = ColumnTransformer(
+            transformers=default_table_vectorizer_assignment
+            + [column_specific_transformers],
+        )
+        X_enc_ct = ct.fit_transform(X)
+
+    assert X_enc_tv.shape == X_enc_ct.shape
+
+    # Assert the output is the same, row by row. The column order may differ
+    # (see the TableVectorizer "Notes" section). NOTE(review): on equal-length
+    # rows, "valid" convolution gives one number; pairs yielding NaN are
+    # skipped, and the check may not be truly order-invariant -- TODO confirm.
+    for row_index in range(X_enc_tv.shape[0]):
+        c1 = np.convolve(X_enc_tv[row_index], X_enc_tv[row_index], "valid")[0]
+        c2 = np.convolve(X_enc_tv[row_index], X_enc_ct[row_index], "valid")[0]
+        if pd.isna(c1) or pd.isna(c2):
+            continue
+        assert np.isclose(c1, c2)
+
+
+@pytest.mark.parametrize(
+    "pipeline",
+    [
+        TableVectorizer(),
+        TableVectorizer(
+            column_specific_transformers=[
+                (MinHashEncoder(), ["cat1", "cat2"]),
+            ],
+        ),
+        TableVectorizer(
+            low_card_cat_transformer=MinHashEncoder(),
+        ),
+    ],
+)
+def test_deterministic(pipeline) -> None:
+    """
+    Checks that fitting the same TableVectorizer repeatedly on the same data,
+    with (deterministic) components, always produces the same output.
+    """
+    X = _get_dirty_dataframe()
+    # The first fit serves as the reference for the four following ones.
+    X_enc_ref = pipeline.fit_transform(X)
+    for _ in range(4):
+        X_enc = pipeline.fit_transform(X)
+        np.testing.assert_array_equal(X_enc, X_enc_ref)