Skip to content

Commit

Permalink
feat: make add_lags dataframe-agnostic (#661)
Browse files Browse the repository at this point in the history
* make add_lags dataframe-agnostic

* try getting tests to run?

* patch: cvxpy 1.5.0 support (#663)

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
  • Loading branch information
MarcoGorelli and FBruzzesi authored May 10, 2024
1 parent fe691b5 commit 28c102b
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 45 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
pull_request:
branches:
- main
- narwhals-development

jobs:
test:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ maintainers = [
]

dependencies = [
"narwhals>=0.7.16",
"narwhals>=0.8.9",
"pandas>=1.1.5",
"scikit-learn>=1.0",
"importlib-metadata >= 1.0; python_version < '3.8'",
Expand Down
15 changes: 12 additions & 3 deletions sklego/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from sklego.notinstalled import NotInstalledPackage

cp = NotInstalledPackage("cvxpy")

import logging
from abc import ABC, abstractmethod
from inspect import signature
from warnings import warn

import numpy as np
Expand Down Expand Up @@ -533,12 +534,20 @@ def _solve(self, sensitive, X, y):
constraints = self.constraints(y_hat, y, sensitive, n_obs)

problem = cp.Problem(cp.Maximize(log_likelihood), constraints)
problem.solve(max_iters=self.max_iter)

if "max_iters" in signature(problem.solve).parameters:
kwargs = {"max_iters": self.max_iter}
else:
if self.max_iter:
logging.warning("solver does not support `max_iters` and it `self.max_iter` will be ignored")
kwargs = {}

problem.solve(**kwargs)

if problem.status in ["infeasible", "unbounded"]:
raise ValueError(f"problem was found to be {problem.status}")

self.n_iter_ = problem.solver_stats.num_iters
self.n_iter_ = getattr(problem.solver_stats, "num_iters", 0)

if self.fit_intercept:
self.coef_ = theta.value[np.newaxis, 1:]
Expand Down
43 changes: 27 additions & 16 deletions sklego/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import inspect
from functools import partial, wraps

import narwhals as nw
import numpy as np
import pandas as pd
from scipy.ndimage import shift

from sklego.common import as_list
Expand Down Expand Up @@ -199,13 +199,27 @@ def add_lags(X, cols, lags, drop_na=True):
Returns
-------
pd.DataFrame | np.ndarray
DataFrame | np.ndarray
With only the selected cols.
Raises
------
ValueError
If the input is not a `pd.DataFrame` or `np.ndarray`.
If the input is not a supported DataFrame.
Notes
-----
Native cross-dataframe support is achieved using
[Narwhals](https://narwhals-dev.github.io/narwhals/){:target="_blank"}.
Supported dataframes are:
- pandas
- Polars (eager or lazy)
- Modin
- cuDF
See [Narwhals docs](https://narwhals-dev.github.io/narwhals/extending/){:target="_blank"} for an up-to-date list
(and to learn how you can add your dataframe library to it!).
Examples
--------
Expand Down Expand Up @@ -255,8 +269,9 @@ def add_lags(X, cols, lags, drop_na=True):

# The keys of the allowed_inputs dict contain the allowed
# types, and the values contain the associated handlers
X = nw.from_native(X, strict=False)
allowed_inputs = {
pd.core.frame.DataFrame: _add_lagged_pandas_columns,
nw.DataFrame: _add_lagged_dataframe_columns,
np.ndarray: _add_lagged_numpy_columns,
}

Expand Down Expand Up @@ -316,12 +331,12 @@ def _add_lagged_numpy_columns(X, cols, lags, drop_na):
return answer


def _add_lagged_pandas_columns(df, cols, lags, drop_na):
def _add_lagged_dataframe_columns(df, cols, lags, drop_na):
"""Append lagged columns.
Parameters
----------
df : pd.DataFrame
df : narwhals.DataFrame | narwhals.LazyFrame
Data to be lagged.
cols : str | List[str]
Column name / names.
Expand All @@ -332,23 +347,19 @@ def _add_lagged_pandas_columns(df, cols, lags, drop_na):
Returns
-------
pd.DataFrame
DataFrame
Dataframe with concatenated lagged cols.
"""

cols = as_list(cols)

# Indexes are not supported as pandas column names may be
# integers themselves, introducing unexpected behaviour
if not all([col in df.columns.values for col in cols]):
if not all([col in df.columns for col in cols]):
raise KeyError("The column does not exist")

combos = (df[col].shift(-lag).rename(col + str(lag)) for col in cols for lag in lags)
answer = df.with_columns(nw.col(col).shift(-lag).alias(col + str(lag)) for col in cols for lag in lags)

answer = pd.concat([df, *combos], axis=1)

# Remove rows that contain NA values when drop_na is truthy
# Remove rows that contain null values when drop_na is truthy
if drop_na:
answer = answer.dropna()
answer = answer.drop_nulls()

return answer
return nw.to_native(answer)
73 changes: 48 additions & 25 deletions tests/test_pandas_utils/test_pandas_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl
import pytest

from sklego.pandas_utils import (
_add_lagged_dataframe_columns,
_add_lagged_numpy_columns,
_add_lagged_pandas_columns,
add_lags,
log_step,
log_step_extra,
Expand All @@ -16,39 +18,45 @@


@pytest.fixture
def test_df():
return pd.DataFrame({"X1": [0, 1, 2], "X2": [np.nan, "178", "154"]})
def data():
return {"X1": [0, 1, 2], "X2": [float("nan"), "178", "154"]}


@pytest.fixture
def test_X():
    """Provide the small 3x2 integer array used as numpy input by the lag tests."""
    rows = [[-4, 2], [-2, 0], [4, -6]]
    return np.array(rows)


def test_add_lags_wrong_inputs(test_df):
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_add_lags_wrong_inputs(data, frame_func):
invalid_df = [[1, 2, 3], [4, 5, 6]]
invalid_lags = ["1", "2"]
test_df = frame_func(data)
with pytest.raises(ValueError, match="lags must be a list of type: ?"):
add_lags(test_df, ["X1"], invalid_lags)
with pytest.raises(ValueError, match="X type should be one of: ?"):
add_lags(invalid_df, ["X1"], 1)


def test_add_lags_correct_df(test_df):
expected = pd.DataFrame({"X1": [1, 2], "X2": ["178", "154"], "X1-1": [0, 1]})
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_add_lags_correct_df(data, frame_func):
test_df = frame_func(data)
expected = frame_func({"X1": [1, 2], "X2": ["178", "154"], "X1-1": [0, 1]})
ans = add_lags(test_df, "X1", -1)
assert (ans.columns == expected.columns).all()
assert (ans.values == expected.values).all()
assert [x for x in ans.columns] == [x for x in expected.columns]
assert (ans.to_numpy() == expected.to_numpy()).all()


def test_add_lags_correct_X(test_X):
    """Check `add_lags` on a numpy array for columns 0 and 1 with lags 1 and 2.

    Lagging only relocates values that already exist in the input, so every
    entry of the expected row has to come from the `test_X` fixture
    (`[[-4, 2], [-2, 0], [4, -6]]`). Only the first row is expected back,
    since the later rows would contain missing values after shifting.
    """
    # Layout assumed to be: original columns, then col 0 at lags 1 and 2,
    # then col 1 at lags 1 and 2 (same col/lag ordering as the dataframe
    # handler) -- TODO confirm against `_add_lagged_numpy_columns`.
    expected = np.array([[-4, 2, -2, 4, 0, -6]])
    assert (add_lags(test_X, [0, 1], [1, 2]) == expected).all()


def test_add_lagged_pandas_columns(test_df):
@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
def test_add_lagged_dataframe_columns(data, frame_func):
test_df = nw.from_native(frame_func(data))
with pytest.raises(KeyError, match="The column does not exist"):
_add_lagged_pandas_columns(test_df, ["last_name"], 1, True)
_add_lagged_dataframe_columns(test_df, ["last_name"], 1, True)


def test_add_lagged_numpy_columns(test_X):
Expand All @@ -62,8 +70,9 @@ def test_add_lagged_numpy_columns(test_X):
_add_lagged_numpy_columns(test_X, ["test"], 1, True)


def test_log_step(capsys, test_df):
def test_log_step(capsys, data):
"""Base test of log_step without any arguments to the logger"""
test_df = pd.DataFrame(data)

@log_step
def do_something(df):
Expand All @@ -83,8 +92,9 @@ def do_nothing(df, *args, **kwargs):
assert print_statements[2].startswith("[do_something(df)]")


def test_log_step_display_args(capsys, test_df):
def test_log_step_display_args(capsys, data):
"""Test that we can disable printing function arguments in the log_step"""
test_df = pd.DataFrame(data)

@log_step(display_args=False)
def do_something(df):
Expand All @@ -104,8 +114,9 @@ def do_nothing(df, *args, **kwargs):
assert print_statements[2].startswith("[do_something]")


def test_log_step_logger(caplog, test_df):
def test_log_step_logger(caplog, data):
"""Base test of log_step with a logger supplied instead of default print"""
test_df = pd.DataFrame(data)
caplog.clear()

@log_step(print_fn=logging.info)
Expand All @@ -125,8 +136,9 @@ def do_nothing(df, *args, **kwargs):


@pytest.mark.parametrize("time_taken", [True, False])
def test_log_time(time_taken, capsys, test_df):
def test_log_time(time_taken, capsys, data):
"""Test logging of time taken can be switched on and off"""
test_df = pd.DataFrame(data)

@log_step(time_taken=time_taken)
def do_nothing(df, *args, **kwargs):
Expand All @@ -141,8 +153,9 @@ def do_nothing(df, *args, **kwargs):


@pytest.mark.parametrize("shape", [True, False])
def test_log_shape(shape, capsys, test_df):
def test_log_shape(shape, capsys, data):
"""Test logging of shape can be switched on and off"""
test_df = pd.DataFrame(data)

@log_step(shape=shape)
def do_nothing(df, *args, **kwargs):
Expand All @@ -156,8 +169,9 @@ def do_nothing(df, *args, **kwargs):
assert (f"n_col={test_df.shape[1]}" in captured.out) == shape


def test_log_shape_delta(capsys, test_df):
def test_log_shape_delta(capsys, data):
"""Test logging of shape delta can be switched on and off"""
test_df = pd.DataFrame(data)

@log_step(shape_delta=True)
def do_nothing(df, *args, **kwargs):
Expand Down Expand Up @@ -194,8 +208,9 @@ def remove_column(df, *args, **kwargs):


@pytest.mark.parametrize("names", [True, False])
def test_log_names(names, capsys, test_df):
def test_log_names(names, capsys, data):
"""Test logging of names can be switched on and off"""
test_df = pd.DataFrame(data)

@log_step(names=names)
def do_nothing(df, *args, **kwargs):
Expand All @@ -212,8 +227,9 @@ def do_nothing(df, *args, **kwargs):


@pytest.mark.parametrize("dtypes", [True, False])
def test_log_dtypes(dtypes, capsys, test_df):
def test_log_dtypes(dtypes, capsys, data):
"""Test logging of dtypes can be switched on and off"""
test_df = pd.DataFrame(data)

@log_step(dtypes=dtypes)
def do_nothing(df, *args, **kwargs):
Expand All @@ -229,11 +245,12 @@ def do_nothing(df, *args, **kwargs):
assert str(test_df.dtypes.to_dict()) in captured.out


def test_log_not_names_and_dtypes(capsys, test_df):
def test_log_not_names_and_dtypes(capsys, data):
"""
Test that not both names and types are logged, even if we set both to True
We don't want this because dtypes also prints the names
"""
test_df = pd.DataFrame(data)

@log_step(names=True, dtypes=True)
def do_nothing(df, *args, **kwargs):
Expand All @@ -246,8 +263,9 @@ def do_nothing(df, *args, **kwargs):
assert "names=" not in captured.out


def test_log_custom_logger(caplog, test_df):
def test_log_custom_logger(caplog, data):
"""Test that we can supply a custom logger to the log_step"""
test_df = pd.DataFrame(data)
caplog.clear()

logger_name = "my_custom_logger"
Expand All @@ -265,8 +283,9 @@ def do_nothing(df, *args, **kwargs):


@pytest.mark.parametrize("log_error", [True, False])
def test_log_error(log_error, capsys, test_df):
def test_log_error(log_error, capsys, data):
"""Test logging of errors can be switched on and off"""
test_df = pd.DataFrame(data)

err_msg = "This is a test Exception"

Expand Down Expand Up @@ -341,8 +360,9 @@ def double_df(df, *args, **kwargs):
assert f"dogs={2*n_dogs}" in print_statements[1]


def test_log_extra_multiple(capsys, test_df):
def test_log_extra_multiple(capsys, data):
"""Test that we can add multiple logging functions"""
test_df = pd.DataFrame(data)

@log_step_extra(len, type)
def do_nothing(df, *args, **kwargs):
Expand All @@ -356,8 +376,9 @@ def do_nothing(df, *args, **kwargs):
assert str(type(test_df)) in captured.out


def test_log_extra_no_func(test_df):
def test_log_extra_no_func(data):
"""We need at least one logging function"""
test_df = pd.DataFrame(data)
with pytest.raises(ValueError) as e:

@log_step_extra()
Expand All @@ -369,8 +390,9 @@ def do_nothing(df, *args, **kwargs):
assert "log_function" in str(e)


def test_log_extra_not_callable_func(test_df):
def test_log_extra_not_callable_func(data):
"""Make sure the logging functions are checked to be callable"""
test_df = pd.DataFrame(data)
with pytest.raises(ValueError) as e:

@log_step_extra(1)
Expand All @@ -383,8 +405,9 @@ def do_nothing(df, *args, **kwargs):
assert "int" in str(e)


def test_log_extra_custom_logger(caplog, test_df):
def test_log_extra_custom_logger(caplog, data):
"""Test that we can supply a custom logger to the log_step_extra"""
test_df = pd.DataFrame(data)
caplog.clear()

logger_name = "my_custom_logger"
Expand Down

0 comments on commit 28c102b

Please sign in to comment.