
Commit

release v0.3.3
azmyrajab committed Apr 26, 2024
1 parent 1746bf2 commit 49ae93c
Showing 6 changed files with 41 additions and 49 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -2,7 +2,7 @@ cargo-features = ["profile-rustflags"]

[package]
name = "polars_ols"
version = "0.3.2"
version = "0.3.3"
edition = "2021"

[lib]
3 changes: 2 additions & 1 deletion README.md
@@ -54,7 +54,7 @@
"weights": [0.34, 0.97, 0.39, 0.8, 0.57, 0.41, 0.19, 0.87, 0.06, 0.34],
})

lasso_expr = pl.col("y").least_squares.lasso(pl.col("x1"), pl.col("x2"), alpha=0.0001, add_intercept=True).over("group")
lasso_expr = pl.col("y").least_squares.lasso("x1", "x2", alpha=0.0001, add_intercept=True).over("group")
wls_expr = pls.compute_least_squares_from_formula("y ~ x1 + x2 -1", sample_weights=pl.col("weights"))

predictions = df.with_columns(lasso_expr.round(2).alias("predictions_lasso"),
@@ -154,6 +154,7 @@ Currently, this extension package supports the following variants:
- Weighted Least Squares: ```least_squares.wls```
- Regularized Least Squares (Lasso / Ridge / Elastic Net) ```least_squares.{lasso, ridge, elastic_net}```
- Non-negative Least Squares: ```least_squares.nnls```
+- Multi-target Least Squares: ```least_squares.multi_target_ols```

As well as efficient implementations of moving window models:
- Recursive Least Squares: ```least_squares.rls```
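The newly listed ```least_squares.multi_target_ols``` pairs with a struct-of-targets input, as exercised in `test_multi_target_regression` further down this commit. A minimal sketch of that usage (illustrative only; the frame, the column names, and the assumption that the namespace method accepts the same `mode` keyword as the other variants are not taken from this diff):

```python
import polars as pl
import polars_ols  # noqa: F401 -- importing the package registers the `least_squares` namespace

df = pl.DataFrame({
    "y1": [1.0, 0.5, -0.3, 2.2, 1.1],
    "y2": [0.2, -1.0, 0.7, 1.5, -0.4],
    "x1": [0.1, 0.9, -0.2, 1.8, 0.6],
    "x2": [1.0, 0.0, 0.5, 0.3, -0.7],
})

# pack all targets into one struct column, then fit them against the same
# features in a single call; one set of coefficients comes back per target
coefficients = df.with_columns(pl.struct("y1", "y2").alias("y")).select(
    pl.col("y")
    .least_squares.multi_target_ols(pl.col("^x.*$"), mode="coefficients")
    .alias("coefficients")
)
```

Selecting features with the regex expression `pl.col("^x.*$")` follows the pattern the tests in this commit switch to.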
33 changes: 15 additions & 18 deletions polars_ols/least_squares.py
@@ -62,6 +62,7 @@
_VALID_NULL_POLICIES: Set[NullPolicy] = set(get_args(NullPolicy))
_VALID_OUTPUT_MODES: Set[OutputMode] = set(get_args(OutputMode))
_VALID_SOLVE_METHODS: Set[SolveMethod] = set(get_args(SolveMethod)).union({None})
+_EPSILON: float = 1.0e-12


@dataclass
@@ -190,7 +191,8 @@ def _pre_process_data(
# handle sample weights
sqrt_w: Optional[pl.Expr] = None
if sample_weights is not None:
-sqrt_w = parse_into_expr(sample_weights).sqrt()
+# missing sample weights are assumed to be minimal 'epsilon' weight
+sqrt_w = parse_into_expr(sample_weights).sqrt().fill_null(_EPSILON)
target *= sqrt_w
features = [(expr * sqrt_w) for expr in features]
return target, features, sqrt_w
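For context on the hunk above (this note is not part of the diff): `_pre_process_data` turns weighted least squares into an ordinary least-squares problem by scaling the target and features by the square root of the weights, so filling null weights with the tiny `_EPSILON` gives those rows negligible influence instead of raising. A quick NumPy check of that equivalence, written against the standard WLS normal equations:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = X @ np.array([1.0, -2.0]) + rng.normal(size=100)
w = rng.uniform(0.1, 1.0, size=100)  # per-row sample weights

# weighted least squares via the normal equations: (X' W X) beta = X' W y
beta_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))

# ordinary least squares on sqrt(w)-scaled rows, mirroring _pre_process_data
sw = np.sqrt(w)
beta_scaled, *_ = np.linalg.lstsq(X * sw[:, None], y * sw, rcond=None)

assert np.allclose(beta_wls, beta_scaled)
```

This is also why the prediction branch further down multiplies by `1.0 / sqrt_w`: predictions are computed in the scaled space and have to be mapped back.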
@@ -211,22 +213,17 @@ def _register_least_squares_plugin(

# register either coefficient or prediction plugin functions
if mode == "coefficients":
-# TODO: remove 'rename_fields' after https://github.com/pola-rs/pyo3-polars/issues/79
-# it currently breaks input_wildcard_expansion=True correctly returning a struct.
-return (
-register_plugin_function(
-plugin_path=Path(__file__).parent,
-function_name=f"{function_name}_coefficients",
-args=[target_fit, *features_fit],
-kwargs=ols_kwargs.to_dict(),
-is_elementwise=False,
-changes_length=returns_scalar_coefficients,
-returns_scalar=returns_scalar_coefficients,
-input_wildcard_expansion=True,
-)
-.alias("coefficients")
-.struct.rename_fields([f.meta.output_name() for f in features_fit])
-)
+return register_plugin_function(
+plugin_path=Path(__file__).parent,
+function_name=f"{function_name}_coefficients",
+args=[target_fit, *features_fit],
+kwargs=ols_kwargs.to_dict(),
+is_elementwise=False,
+changes_length=returns_scalar_coefficients,
+returns_scalar=returns_scalar_coefficients,
+input_wildcard_expansion=True,
+pass_name_to_apply=True,
+).alias("coefficients")
else:
predictions = register_plugin_function(
plugin_path=Path(__file__).parent,
@@ -237,7 +234,7 @@
input_wildcard_expansion=True,
)
if sqrt_w is not None:
-predictions /= sqrt_w  # undo the scaling implicit in WLS weighting
+predictions *= 1.0 / sqrt_w  # undo the scaling implicit in WLS weighting
if mode == "predictions":
return predictions
else:
5 changes: 5 additions & 0 deletions src/least_squares.rs
@@ -252,6 +252,11 @@ pub fn solve_multi_target(
alpha: Option<f64>,
rcond: Option<f64>,
) -> Array2<f64> {
+// handle degenerate case of no data
+if x.is_empty() {
+return Array2::zeros((x.ncols(), y.ncols())); // n_features x n_targets
+}
+// Choose SVD implementation based on L2 regularization
let alpha = alpha.unwrap_or(0.0);
if alpha > 0.0 {
solve_ridge_svd(y, x, alpha, rcond)
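For orientation (not part of the diff), a rough NumPy analogue of the branch logic in `solve_multi_target` above; this is a sketch of the mathematics, not the actual Rust implementation or its bindings:

```python
import numpy as np

def solve_multi_target_sketch(x: np.ndarray, y: np.ndarray,
                              alpha: float = 0.0, rcond=None) -> np.ndarray:
    # degenerate case of no data: zero coefficients, shaped n_features x n_targets
    if x.size == 0:
        return np.zeros((x.shape[1], y.shape[1]))
    if alpha > 0.0:
        # ridge via SVD: beta = V diag(s / (s^2 + alpha)) U' y
        u, s, vt = np.linalg.svd(x, full_matrices=False)
        shrink = s / (s**2 + alpha)
        return vt.T @ (shrink[:, None] * (u.T @ y))
    # plain multi-target least squares (itself SVD-based)
    return np.linalg.lstsq(x, y, rcond=rcond)[0]
```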
45 changes: 17 additions & 28 deletions tests/test_ols.py
@@ -90,6 +90,7 @@ def test_multi_target_regression(alpha, mode, null_policy):
n_features=3,
add_missing=null_policy not in {"zero", "ignore"},
missing_columns=("x1",),
+n_groups=3,
)
df = df.with_columns(
pl.struct(
@@ -104,24 +105,24 @@
multi_target = df.select(
compute_multi_target_least_squares(
"y",
"x1",
"x2",
"x3",
pl.col("^x.*$"),
mode=mode,
ols_kwargs=ols_kwargs,
-).alias(mode)
+)
+.over("group")
+.alias(mode)
)

with timer("compute multiple linear regressions"):
expected = df.unnest("y").select(
compute_least_squares(
target,
"x1",
"x2",
"x3",
pl.col("^x.*$"),
mode=mode,
ols_kwargs=ols_kwargs,
-).alias(target)
+)
+.over("group")
+.alias(target)
for target in ("y1", "y2", "y3")
)

@@ -131,25 +132,15 @@
def test_fit_missing_data_coefficients():
df = _make_data(add_missing=True)

-# # in presence of unhandled nulls assert the rust library raises ComputeError
-# with pytest.raises(pl.exceptions.ComputeError):
-# df.select(
-# pl.col("y").least_squares.ols(
-# pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients"
-# )
-# )

# test rust zero policy is sane
assert np.allclose(
df.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="zero", mode="coefficients"
)
pl.col("y").least_squares.ols(pl.col("^x.*$"), null_policy="zero", mode="coefficients")
).unnest("coefficients"),
df.fill_null(0.0)
.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients"
pl.col("^x.*$"), null_policy="ignore", mode="coefficients"
)
)
.unnest("coefficients"),
@@ -158,14 +149,12 @@
# test rust drop (any) policy is sane
assert np.allclose(
df.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="drop", mode="coefficients"
)
pl.col("y").least_squares.ols(pl.col("^x.*$"), null_policy="drop", mode="coefficients")
).unnest("coefficients"),
df.drop_nulls()
.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients"
pl.col("^x.*$"), null_policy="ignore", mode="coefficients"
)
)
.unnest("coefficients"),
@@ -175,14 +164,14 @@
assert np.allclose(
df.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="drop_y_zero_x", mode="coefficients"
pl.col("^x.*$"), null_policy="drop_y_zero_x", mode="coefficients"
)
).unnest("coefficients"),
df.drop_nulls(subset=["y"])
.fill_null(0.0)
.select(
pl.col("y").least_squares.ols(
pl.col("x1"), pl.col("x2"), null_policy="ignore", mode="coefficients"
pl.col("^x.*$"), null_policy="ignore", mode="coefficients"
)
)
.unnest("coefficients"),
@@ -393,7 +382,7 @@ def test_coefficients_ols_groups():
df.select(
"group",
pl.col("y")
-.least_squares.from_formula("x1 + x2 -1", mode="coefficients")
+.least_squares.ols(pl.col("^x.*$"), mode="coefficients")
.over("group")
.alias("coefficients"),
)
@@ -406,7 +395,7 @@
df.filter(pl.col("group") == 1)
.select(
pl.col("y")
-.least_squares.from_formula("x1 + x2 -1", mode="coefficients")
+.least_squares.ols(pl.col("^x.*$"), mode="coefficients")
.alias("coefficients")
)
.unnest("coefficients")
