feat: make add_lags dataframe-agnostic (#661)

* make add_lags dataframe-agnostic * try getting tests to run? * patch: cvxpy 1.5.0 support (#663) --------- Co-authored-by: Francesco Bruzzesi <[email protected]>
koaning · May 10, 2024 · 28c102b · 28c102b
1 parent fe691b5
commit 28c102b
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 45 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -4,6 +4,7 @@ on:
   pull_request:
     branches:
     - main
+    - narwhals-development
 
 jobs:
   test:

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,7 @@ maintainers = [
 ]
 
 dependencies = [
-    "narwhals>=0.7.16",
+    "narwhals>=0.8.9",
     "pandas>=1.1.5",
     "scikit-learn>=1.0",
     "importlib-metadata >= 1.0; python_version < '3.8'",

diff --git a/sklego/linear_model.py b/sklego/linear_model.py
@@ -4,8 +4,9 @@
     from sklego.notinstalled import NotInstalledPackage
 
     cp = NotInstalledPackage("cvxpy")
-
+import logging
 from abc import ABC, abstractmethod
+from inspect import signature
 from warnings import warn
 
 import numpy as np
@@ -533,12 +534,20 @@ def _solve(self, sensitive, X, y):
         constraints = self.constraints(y_hat, y, sensitive, n_obs)
 
         problem = cp.Problem(cp.Maximize(log_likelihood), constraints)
-        problem.solve(max_iters=self.max_iter)
+
+        if "max_iters" in signature(problem.solve).parameters:
+            kwargs = {"max_iters": self.max_iter}
+        else:
+            if self.max_iter:
+                logging.warning("solver does not support `max_iters` and it `self.max_iter` will be ignored")
+            kwargs = {}
+
+        problem.solve(**kwargs)
 
         if problem.status in ["infeasible", "unbounded"]:
             raise ValueError(f"problem was found to be {problem.status}")
 
-        self.n_iter_ = problem.solver_stats.num_iters
+        self.n_iter_ = getattr(problem.solver_stats, "num_iters", 0)
 
         if self.fit_intercept:
             self.coef_ = theta.value[np.newaxis, 1:]

diff --git a/sklego/pandas_utils.py b/sklego/pandas_utils.py
@@ -2,8 +2,8 @@
 import inspect
 from functools import partial, wraps
 
+import narwhals as nw
 import numpy as np
-import pandas as pd
 from scipy.ndimage import shift
 
 from sklego.common import as_list
@@ -199,13 +199,27 @@ def add_lags(X, cols, lags, drop_na=True):
 
     Returns
     -------
-    pd.DataFrame | np.ndarray
+    DataFrame | np.ndarray
         With only the selected cols.
 
     Raises
     ------
     ValueError
-        If the input is not a `pd.DataFrame` or `np.ndarray`.
+        If the input is neither a supported DataFrame nor a `np.ndarray`.
+
+    Notes
+    -----
+    Native cross-dataframe support is achieved using
+    [Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
+    Supported dataframes are:
+
+    - pandas
+    - Polars (eager or lazy)
+    - Modin
+    - cuDF
+
+    See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list
+    (and to learn how you can add your dataframe library to it!).
 
     Examples
     --------
@@ -255,8 +269,9 @@ def add_lags(X, cols, lags, drop_na=True):
 
     # The keys of the allowed_inputs dict contain the allowed
     # types, and the values contain the associated handlers
+    X = nw.from_native(X, strict=False)
     allowed_inputs = {
-        pd.core.frame.DataFrame: _add_lagged_pandas_columns,
+        nw.DataFrame: _add_lagged_dataframe_columns,
         np.ndarray: _add_lagged_numpy_columns,
     }
 
@@ -316,12 +331,12 @@ def _add_lagged_numpy_columns(X, cols, lags, drop_na):
     return answer
 
 
-def _add_lagged_pandas_columns(df, cols, lags, drop_na):
+def _add_lagged_dataframe_columns(df, cols, lags, drop_na):
     """Append a lag columns.
 
     Parameters
     ----------
-    df : pd.DataFrame
+    df : narwhals.DataFrame | narwhals.LazyFrame
         Data to be lagged.
     cols : str | List[str]
         Column name / names.
@@ -332,23 +347,19 @@ def _add_lagged_pandas_columns(df, cols, lags, drop_na):
 
     Returns
     -------
-    pd.DataFrame
+    DataFrame
         Dataframe with concatenated lagged cols.
     """
 
     cols = as_list(cols)
 
-    # Indexes are not supported as pandas column names may be
-    # integers themselves, introducing unexpected behaviour
-    if not all([col in df.columns.values for col in cols]):
+    if not all([col in df.columns for col in cols]):
         raise KeyError("The column does not exist")
 
-    combos = (df[col].shift(-lag).rename(col + str(lag)) for col in cols for lag in lags)
+    answer = df.with_columns(nw.col(col).shift(-lag).alias(col + str(lag)) for col in cols for lag in lags)
 
-    answer = pd.concat([df, *combos], axis=1)
-
-    # Remove rows that contain NA values when drop_na is truthy
+    # Remove rows that contain null values when drop_na is truthy
     if drop_na:
-        answer = answer.dropna()
+        answer = answer.drop_nulls()
 
-    return answer
+    return nw.to_native(answer)
diff --git a/tests/test_pandas_utils/test_pandas_utils.py b/tests/test_pandas_utils/test_pandas_utils.py
@@ -1,12 +1,14 @@
 import logging
 
+import narwhals as nw
 import numpy as np
 import pandas as pd
+import polars as pl
 import pytest
 
 from sklego.pandas_utils import (
+    _add_lagged_dataframe_columns,
     _add_lagged_numpy_columns,
-    _add_lagged_pandas_columns,
     add_lags,
     log_step,
     log_step_extra,
@@ -16,39 +18,45 @@
 
 
 @pytest.fixture
-def test_df():
-    return pd.DataFrame({"X1": [0, 1, 2], "X2": [np.nan, "178", "154"]})
+def data():
+    return {"X1": [0, 1, 2], "X2": [float("nan"), "178", "154"]}
 
 
 @pytest.fixture
 def test_X():
     return np.array([[-4, 2], [-2, 0], [4, -6]])
 
 
-def test_add_lags_wrong_inputs(test_df):
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_add_lags_wrong_inputs(data, frame_func):
     invalid_df = [[1, 2, 3], [4, 5, 6]]
     invalid_lags = ["1", "2"]
+    test_df = frame_func(data)
     with pytest.raises(ValueError, match="lags must be a list of type: ?"):
         add_lags(test_df, ["X1"], invalid_lags)
     with pytest.raises(ValueError, match="X type should be one of: ?"):
         add_lags(invalid_df, ["X1"], 1)
 
 
-def test_add_lags_correct_df(test_df):
-    expected = pd.DataFrame({"X1": [1, 2], "X2": ["178", "154"], "X1-1": [0, 1]})
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_add_lags_correct_df(data, frame_func):
+    test_df = frame_func(data)
+    expected = frame_func({"X1": [1, 2], "X2": ["178", "154"], "X1-1": [0, 1]})
     ans = add_lags(test_df, "X1", -1)
-    assert (ans.columns == expected.columns).all()
-    assert (ans.values == expected.values).all()
+    assert [x for x in ans.columns] == [x for x in expected.columns]
+    assert (ans.to_numpy() == expected.to_numpy()).all()
 
 
 def test_add_lags_correct_X(test_X):
     expected = np.array([[-4, 2, -2, 3, 0, -6]])
     assert (add_lags(test_X, [0, 1], [1, 2]) == expected).all()
 
 
-def test_add_lagged_pandas_columns(test_df):
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+def test_add_lagged_dataframe_columns(data, frame_func):
+    test_df = nw.from_native(frame_func(data))
     with pytest.raises(KeyError, match="The column does not exist"):
-        _add_lagged_pandas_columns(test_df, ["last_name"], 1, True)
+        _add_lagged_dataframe_columns(test_df, ["last_name"], 1, True)
 
 
 def test_add_lagged_numpy_columns(test_X):
@@ -62,8 +70,9 @@ def test_add_lagged_numpy_columns(test_X):
         _add_lagged_numpy_columns(test_X, ["test"], 1, True)
 
 
-def test_log_step(capsys, test_df):
+def test_log_step(capsys, data):
     """Base test of log_step without any arguments to the logger"""
+    test_df = pd.DataFrame(data)
 
     @log_step
     def do_something(df):
@@ -83,8 +92,9 @@ def do_nothing(df, *args, **kwargs):
     assert print_statements[2].startswith("[do_something(df)]")
 
 
-def test_log_step_display_args(capsys, test_df):
+def test_log_step_display_args(capsys, data):
     """Test that we can disable printing function arguments in the log_step"""
+    test_df = pd.DataFrame(data)
 
     @log_step(display_args=False)
     def do_something(df):
@@ -104,8 +114,9 @@ def do_nothing(df, *args, **kwargs):
     assert print_statements[2].startswith("[do_something]")
 
 
-def test_log_step_logger(caplog, test_df):
+def test_log_step_logger(caplog, data):
     """Base test of log_step with a logger supplied instead of default print"""
+    test_df = pd.DataFrame(data)
     caplog.clear()
 
     @log_step(print_fn=logging.info)
@@ -125,8 +136,9 @@ def do_nothing(df, *args, **kwargs):
 
 
 @pytest.mark.parametrize("time_taken", [True, False])
-def test_log_time(time_taken, capsys, test_df):
+def test_log_time(time_taken, capsys, data):
     """Test logging of time taken can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     @log_step(time_taken=time_taken)
     def do_nothing(df, *args, **kwargs):
@@ -141,8 +153,9 @@ def do_nothing(df, *args, **kwargs):
 
 
 @pytest.mark.parametrize("shape", [True, False])
-def test_log_shape(shape, capsys, test_df):
+def test_log_shape(shape, capsys, data):
     """Test logging of shape can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     @log_step(shape=shape)
     def do_nothing(df, *args, **kwargs):
@@ -156,8 +169,9 @@ def do_nothing(df, *args, **kwargs):
     assert (f"n_col={test_df.shape[1]}" in captured.out) == shape
 
 
-def test_log_shape_delta(capsys, test_df):
+def test_log_shape_delta(capsys, data):
     """Test logging of shape delta can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     @log_step(shape_delta=True)
     def do_nothing(df, *args, **kwargs):
@@ -194,8 +208,9 @@ def remove_column(df, *args, **kwargs):
 
 
 @pytest.mark.parametrize("names", [True, False])
-def test_log_names(names, capsys, test_df):
+def test_log_names(names, capsys, data):
     """Test logging of names can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     @log_step(names=names)
     def do_nothing(df, *args, **kwargs):
@@ -212,8 +227,9 @@ def do_nothing(df, *args, **kwargs):
 
 
 @pytest.mark.parametrize("dtypes", [True, False])
-def test_log_dtypes(dtypes, capsys, test_df):
+def test_log_dtypes(dtypes, capsys, data):
     """Test logging of dtypes can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     @log_step(dtypes=dtypes)
     def do_nothing(df, *args, **kwargs):
@@ -229,11 +245,12 @@ def do_nothing(df, *args, **kwargs):
         assert str(test_df.dtypes.to_dict()) in captured.out
 
 
-def test_log_not_names_and_dtypes(capsys, test_df):
+def test_log_not_names_and_dtypes(capsys, data):
     """
     Test that not both names and types are logged, even if we set both to True
     We don't want this because dtypes also prints the names
     """
+    test_df = pd.DataFrame(data)
 
     @log_step(names=True, dtypes=True)
     def do_nothing(df, *args, **kwargs):
@@ -246,8 +263,9 @@ def do_nothing(df, *args, **kwargs):
     assert "names=" not in captured.out
 
 
-def test_log_custom_logger(caplog, test_df):
+def test_log_custom_logger(caplog, data):
     """Test that we can supply a custom logger to the log_step"""
+    test_df = pd.DataFrame(data)
     caplog.clear()
 
     logger_name = "my_custom_logger"
@@ -265,8 +283,9 @@ def do_nothing(df, *args, **kwargs):
 
 
 @pytest.mark.parametrize("log_error", [True, False])
-def test_log_error(log_error, capsys, test_df):
+def test_log_error(log_error, capsys, data):
     """Test logging of shape can be switched on and off"""
+    test_df = pd.DataFrame(data)
 
     err_msg = "This is a test Exception"
 
@@ -341,8 +360,9 @@ def double_df(df, *args, **kwargs):
     assert f"dogs={2*n_dogs}" in print_statements[1]
 
 
-def test_log_extra_multiple(capsys, test_df):
+def test_log_extra_multiple(capsys, data):
     """Test that we can add multiple logging functions"""
+    test_df = pd.DataFrame(data)
 
     @log_step_extra(len, type)
     def do_nothing(df, *args, **kwargs):
@@ -356,8 +376,9 @@ def do_nothing(df, *args, **kwargs):
     assert str(type(test_df)) in captured.out
 
 
-def test_log_extra_no_func(test_df):
+def test_log_extra_no_func(data):
     """We need at least one logging function"""
+    test_df = pd.DataFrame(data)
     with pytest.raises(ValueError) as e:
 
         @log_step_extra()
@@ -369,8 +390,9 @@ def do_nothing(df, *args, **kwargs):
         assert "log_function" in str(e)
 
 
-def test_log_extra_not_callable_func(test_df):
+def test_log_extra_not_callable_func(data):
     """Make sure the logging functions are checked to be callable"""
+    test_df = pd.DataFrame(data)
     with pytest.raises(ValueError) as e:
 
         @log_step_extra(1)
@@ -383,8 +405,9 @@ def do_nothing(df, *args, **kwargs):
         assert "int" in str(e)
 
 
-def test_log_extra_custom_logger(caplog, test_df):
+def test_log_extra_custom_logger(caplog, data):
     """Test that we can supply a custom logger to the log_step_extra"""
+    test_df = pd.DataFrame(data)
     caplog.clear()
 
     logger_name = "my_custom_logger"
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ on: @@
       pull_request:
         branches:
         - main
+        - narwhals-development
     jobs:
       test:
@@ Expand Down @@