From 49ae93c1e6b6d6168c9781c411d31def460b1f08 Mon Sep 17 00:00:00 2001 From: Azmy Rajab Date: Fri, 26 Apr 2024 16:14:38 +0100 Subject: [PATCH] release v0.3.3 --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 3 ++- polars_ols/least_squares.py | 33 +++++++++++++-------------- src/least_squares.rs | 5 +++++ tests/test_ols.py | 45 ++++++++++++++----------------------- 6 files changed, 41 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4bb0a80..0ca9017 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1916,7 +1916,7 @@ dependencies = [ [[package]] name = "polars_ols" -version = "0.3.1" +version = "0.3.2" dependencies = [ "approx", "blas-src", diff --git a/Cargo.toml b/Cargo.toml index 67b8288..e33f5e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ cargo-features = ["profile-rustflags"] [package] name = "polars_ols" -version = "0.3.2" +version = "0.3.3" edition = "2021" [lib] diff --git a/README.md b/README.md index 015adb8..a08f7e2 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ df = pl.DataFrame({"y": [1.16, -2.16, -1.57, 0.21, 0.22, 1.6, -2.11, -2.92, -0.8 "weights": [0.34, 0.97, 0.39, 0.8, 0.57, 0.41, 0.19, 0.87, 0.06, 0.34], }) -lasso_expr = pl.col("y").least_squares.lasso(pl.col("x1"), pl.col("x2"), alpha=0.0001, add_intercept=True).over("group") +lasso_expr = pl.col("y").least_squares.lasso("x1", "x2", alpha=0.0001, add_intercept=True).over("group") wls_expr = pls.compute_least_squares_from_formula("y ~ x1 + x2 -1", sample_weights=pl.col("weights")) predictions = df.with_columns(lasso_expr.round(2).alias("predictions_lasso"), @@ -154,6 +154,7 @@ Currently, this extension package supports the following variants: - Weighted Least Squares: ```least_squares.wls``` - Regularized Least Squares (Lasso / Ridge / Elastic Net) ```least_squares.{lasso, ridge, elastic_net}``` - Non-negative Least Squares: ```least_squares.nnls``` +- Multi-target Least Squares: ```least_squares.multi_target_ols``` As well as efficient implementations of moving window models: - Recursive Least Squares: ```least_squares.rls``` diff --git a/polars_ols/least_squares.py b/polars_ols/least_squares.py index 2624d84..3254d8c 100644 --- a/polars_ols/least_squares.py +++ b/polars_ols/least_squares.py @@ -62,6 +62,7 @@ _VALID_NULL_POLICIES: Set[NullPolicy] = set(get_args(NullPolicy)) _VALID_OUTPUT_MODES: Set[OutputMode] = set(get_args(OutputMode)) _VALID_SOLVE_METHODS: Set[SolveMethod] = set(get_args(SolveMethod)).union({None}) +_EPSILON: float = 1.0e-12 @dataclass @@ -190,7 +191,8 @@ def _pre_process_data( # handle sample weights sqrt_w: Optional[pl.Expr] = None if sample_weights is not None: - sqrt_w = parse_into_expr(sample_weights).sqrt() + # missing sample weights are assumed to be minimal 'epsilon' weight + sqrt_w = parse_into_expr(sample_weights).sqrt().fill_null(_EPSILON) target *= sqrt_w features = [(expr * sqrt_w) for expr in features] return target, features, sqrt_w @@ -211,22 +213,17 @@ def _register_least_squares_plugin( # register either coefficient or prediction plugin functions if mode == "coefficients": - # TODO: remove 'rename_fields' after https://github.com/pola-rs/pyo3-polars/issues/79 - # it currently breaks input_wildcard_expansion=True correctly returning a struct. - return ( - register_plugin_function( - plugin_path=Path(__file__).parent, - function_name=f"{function_name}_coefficients", - args=[target_fit, *features_fit], - kwargs=ols_kwargs.to_dict(), - is_elementwise=False, - changes_length=returns_scalar_coefficients, - returns_scalar=returns_scalar_coefficients, - input_wildcard_expansion=True, - ) - .alias("coefficients") - .struct.rename_fields([f.meta.output_name() for f in features_fit]) - ) + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name=f"{function_name}_coefficients", + args=[target_fit, *features_fit], + kwargs=ols_kwargs.to_dict(), + is_elementwise=False, + changes_length=returns_scalar_coefficients, + returns_scalar=returns_scalar_coefficients, + input_wildcard_expansion=True, + pass_name_to_apply=True, + ).alias("coefficients") else: predictions = register_plugin_function( plugin_path=Path(__file__).parent, @@ -237,7 +234,7 @@ def _register_least_squares_plugin( input_wildcard_expansion=True, ) if sqrt_w is not None: - predictions /= sqrt_w # undo the scaling implicit in WLS weighting + predictions *= 1.0 / sqrt_w # undo the scaling implicit in WLS weighting if mode == "predictions": return predictions else: diff --git a/src/least_squares.rs b/src/least_squares.rs index a68130a..cfa75cb 100644 --- a/src/least_squares.rs +++ b/src/least_squares.rs @@ -252,6 +252,11 @@ pub fn solve_multi_target( alpha: Option, rcond: Option, ) -> Array2 { + // handle degenerate case of no data + if x.is_empty() { + return Array2::zeros((x.ncols(), y.ncols())); // n_features x n_targets + } + // Choose SVD implementation based on L2 regularization let alpha = alpha.unwrap_or(0.0); if alpha > 0.0 { solve_ridge_svd(y, x, alpha, rcond) diff --git a/tests/test_ols.py b/tests/test_ols.py index 225e303..511206e 100644 --- a/tests/test_ols.py +++ b/tests/test_ols.py @@ -90,6 +90,7 @@ def test_multi_target_regression(alpha, mode, null_policy): n_features=3, add_missing=null_policy not in {"zero", "ignore"}, missing_columns=("x1",), + n_groups=3, ) df = df.with_columns( pl.struct( @@ -104,24 +105,24 @@ def test_multi_target_regression(alpha, mode, null_policy): multi_target = df.select( compute_multi_target_least_squares( "y", - "x1", - "x2", - "x3", + pl.col("^x.*$"), mode=mode, ols_kwargs=ols_kwargs, - ).alias(mode) + ) + .over("group") + .alias(mode) ) with timer("compute multiple linear regressions"): expected = df.unnest("y").select( compute_least_squares( target, - "x1", - "x2", - "x3", + pl.col("^x.*$"), mode=mode, ols_kwargs=ols_kwargs, - ).alias(target) + ) + .over("group") + .alias(target) for target in ("y1", "y2", "y3") ) @@ -131,25 +132,15 @@ def test_multi_target_regression(alpha, mode, null_policy): def test_fit_missing_data_coefficients(): df = _make_data(add_missing=True) - # # in presence of unhandled nulls assert the rust library raises ComputeError - # with pytest.raises(pl.exceptions.ComputeError): - # df.select( - # pl.col("y").least_squares.ols( - # pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients" - # ) - # ) - # test rust zero policy is sane assert np.allclose( df.select( - pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="zero", mode="coefficients" - ) + pl.col("y").least_squares.ols(pl.col("^x.*$"), null_policy="zero", mode="coefficients") ).unnest("coefficients"), df.fill_null(0.0) .select( pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients" + pl.col("^x.*$"), null_policy="ignore", mode="coefficients" ) ) .unnest("coefficients"), @@ -158,14 +149,12 @@ def test_fit_missing_data_coefficients(): # test rust drop (any) policy is sane assert np.allclose( df.select( - pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="drop", mode="coefficients" - ) + pl.col("y").least_squares.ols(pl.col("^x.*$"), null_policy="drop", mode="coefficients") ).unnest("coefficients"), df.drop_nulls() .select( pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients" + pl.col("^x.*$"), null_policy="ignore", mode="coefficients" ) ) .unnest("coefficients"), @@ -175,14 +164,14 @@ def test_fit_missing_data_coefficients(): assert np.allclose( df.select( pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="drop_y_zero_x", mode="coefficients" + pl.col("^x.*$"), null_policy="drop_y_zero_x", mode="coefficients" ) ).unnest("coefficients"), df.drop_nulls(subset=["y"]) .fill_null(0.0) .select( pl.col("y").least_squares.ols( - pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients" + pl.col("^x.*$"), null_policy="ignore", mode="coefficients" ) ) .unnest("coefficients"), @@ -393,7 +382,7 @@ def test_coefficients_ols_groups(): df.select( "group", pl.col("y") - .least_squares.from_formula("x1 + x2 -1", mode="coefficients") + .least_squares.ols(pl.col("^x.*$"), mode="coefficients") .over("group") .alias("coefficients"), ) @@ -406,7 +395,7 @@ def test_coefficients_ols_groups(): df.filter(pl.col("group") == 1) .select( pl.col("y") - .least_squares.from_formula("x1 + x2 -1", mode="coefficients") + .least_squares.ols(pl.col("^x.*$"), mode="coefficients") .alias("coefficients") ) .unnest("coefficients")