Commit
fix lint, release v0.3.4
azmyrajab committed Aug 25, 2024
1 parent ece7bdd commit 9480630
Showing 7 changed files with 114 additions and 63 deletions.
30 changes: 29 additions & 1 deletion README.md
@@ -77,8 +77,9 @@ shape: (5, 7)
└───────┴───────┴───────┴───────┴─────────┴───────────────────┴─────────────────┘
```

The `mode` parameter is used to set the type of the output returned by all methods (`"predictions", "residuals", "coefficients"`).
The `mode` parameter is used to set the type of the output returned by all methods (`"predictions", "residuals", "coefficients", "statistics"`).
It defaults to returning predictions matching the input's length.
Note that `"statistics"` is currently only supported for OLS/WLS/Ridge models.
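
As a quick sketch (assuming a frame `df` with columns `y`, `x1`, `x2`), switching `mode` on the same expression switches the output:

```python
import polars as pl
import polars_ols as pls  # importing registers the `least_squares` namespace

out = df.select(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="predictions").alias("pred"),
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="residuals").alias("resid"),
)
```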

If `"coefficients"` is set, the output is a [polars Struct](https://docs.pola.rs/user-guide/expressions/structs/) with coefficients as values and feature names as fields.
Its output shape 'broadcasts' depending on context; see below:
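
For example, a minimal sketch of the two common broadcasting cases (a hypothetical frame with columns `y`, `x1`, `x2`, `group` is assumed):

```python
# select context: a single struct row of coefficients for the full sample
coef = df.select(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
).unnest("coefficients")  # one column per feature name

# window context: per-group coefficients, broadcast back to each group's rows
coef_by_group = df.with_columns(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
    .over("group")
    .alias("coefficients_group")
)
```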
@@ -137,6 +138,33 @@ shape: (5, 6)
└───────┴───────┴───────┴───────┴─────────┴─────────────────────┘
```

For plain OLS/WLS and Ridge models, support has recently been added for producing a simple statistical significance report. It can be used as follows:

```python
statistics = (
    df.select(
        pl.col("y").least_squares.ols(pl.col("x1", "x2"), mode="statistics", add_intercept=True)
    )
    .unnest("statistics")  # results stored in a nested series by default
    .explode(["feature_names", "coefficients", "standard_errors", "t_values", "p_values"])
)

print(statistics)
```
```
shape: (3, 8)
┌─────────┬──────────┬─────────┬──────────────┬──────────────┬─────────────┬───────────┬───────────┐
│ r2 ┆ mae ┆ mse ┆ feature_name ┆ coefficients ┆ standard_er ┆ t_values ┆ p_values │
│ --- ┆ --- ┆ --- ┆ s ┆ --- ┆ rors ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 ┆ --- ┆ f64 ┆ f64 │
│ ┆ ┆ ┆ str ┆ ┆ f64 ┆ ┆ │
╞═════════╪══════════╪═════════╪══════════════╪══════════════╪═════════════╪═══════════╪═══════════╡
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ x1 ┆ 0.977375 ┆ 0.037286 ┆ 26.212765 ┆ 3.0095e-8 │
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ x2 ┆ 0.987413 ┆ 0.037321 ┆ 26.457169 ┆ 2.8218e-8 │
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ const ┆ 0.000757 ┆ 0.037474 ┆ 0.02021 ┆ 0.98444 │
└─────────┴──────────┴─────────┴──────────────┴──────────────┴─────────────┴───────────┴───────────┘
```
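
To sanity-check the report, the same quantities can be reproduced with statsmodels — a hedged sketch (statsmodels and the pandas conversion are assumptions here, not dependencies of this library):

```python
import statsmodels.api as sm

pdf = df.to_pandas()
X = sm.add_constant(pdf[["x1", "x2"]])  # mirrors add_intercept=True
fit = sm.OLS(pdf["y"], X).fit()
# fit.params, fit.bse, fit.tvalues, fit.pvalues should line up with the report above
print(fit.summary())
```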

Finally, for convenience, you can compute out-of-sample predictions with ```least_squares.{predict, predict_from_formula}```. This saves you the effort of un-nesting the coefficients and computing the dot product in Python: it is done in Rust instead, as an expression. Usage is as follows:
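A hedged sketch (the exact snippet lives further down in the repository README; a struct column of fitted coefficients, e.g. fit per group as above, is assumed):

```python
# dot product of stored coefficients with features, computed in Rust
df.with_columns(
    pl.col("coefficients_group")
    .least_squares.predict(pl.col("x1"), pl.col("x2"))
    .alias("predictions_oos")
)
```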
34 changes: 18 additions & 16 deletions notebooks/polars_ols_demo.ipynb
@@ -15,7 +15,9 @@
},
"outputs": [],
"source": [
"import os; os.environ[\"POLARS_VERBOSE\"] = \"1\"\n",
"# ruff: noqa \n",
"import os\n",
"os.environ[\"POLARS_VERBOSE\"] = \"1\"\n",
"import polars as pl\n",
"import polars_ols as pls\n",
"import numpy as np"
@@ -41,7 +43,7 @@
"def insert_nulls(val):\n",
" return None if rng.random() < 0.1 else val\n",
"\n",
"def _make_data(n_samples: int = 2_000, \n",
"def _make_data(n_samples: int = 2_000,\n",
" n_features: int = 5,\n",
" n_groups: int = 5,\n",
" noise: float = 0.1,\n",
@@ -57,7 +59,7 @@
" sample_weights=pl.lit(rng.uniform(0, 1, size=n_samples)),\n",
" )\n",
" if missing:\n",
" \n",
"\n",
" df = df.with_columns(\n",
" *(pl.col(c).map_elements(insert_nulls, return_dtype=pl.Float64) for c in df.columns)\n",
" )\n",
@@ -542,7 +544,7 @@
],
"source": [
"pred_zero = df_missing.select(\n",
" pls.compute_least_squares(\"y\", pl.selectors.starts_with(\"x\"), \n",
" pls.compute_least_squares(\"y\", pl.selectors.starts_with(\"x\"),\n",
" mode=\"predictions\",\n",
" ols_kwargs=pls.OLSKwargs(null_policy=\"zero\"))\n",
")\n",
@@ -601,7 +603,7 @@
],
"source": [
"coef_drop = df_missing.select(\n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\", \n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\",\n",
" mode=\"coefficients\",\n",
" null_policy=\"drop\")\n",
")\n",
@@ -655,7 +657,7 @@
],
"source": [
"coef_drop_y = df_missing.select(\n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\", \n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\",\n",
" mode=\"coefficients\",\n",
" null_policy=\"drop_y_zero_x\")\n",
")\n",
@@ -738,7 +740,7 @@
" df\n",
" .select(\"x1\", \"x2\", \"x3\")\n",
" .with_columns(pl.col(\"x2\").alias(\"x3\")) # I add a totally collinear feature x3 which is a copy of x2\n",
" .with_columns(pl.sum_horizontal(pl.col(\"^x.*$\")).alias(\"y\")) # y is the sum of x1, .., x3 \n",
" .with_columns(pl.sum_horizontal(pl.col(\"^x.*$\")).alias(\"y\")) # y is the sum of x1, .., x3\n",
")"
]
},
@@ -1015,7 +1017,7 @@
" ).alias(\"coef_enet_non_negative\")\n",
"\n",
"ridge_expr = pl.col(\"y\").least_squares.ridge(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"),\n",
" alpha=100.0, \n",
" alpha=100.0,\n",
" sample_weights=pl.col(\"sample_weights\"),\n",
" mode=\"coefficients\").alias(\"coef_ridge\")\n",
"\n",
@@ -1083,7 +1085,7 @@
},
"outputs": [],
"source": [
"# implementation matches sklearn \n",
"# implementation matches sklearn\n",
"coef = df_wide.select(pl.col(\"y\").least_squares.elastic_net(\n",
" *features,\n",
" l1_ratio=0.5,\n",
@@ -1230,7 +1232,7 @@
"# compute the residuals in two equivalent ways\n",
"df.select(\n",
" # \"x2:x3\" denotes multiplicative interaction, \"-1\" dentotes no intercept\n",
" pls.compute_least_squares_from_formula(\"y ~ x1 + x2:x3 -1\", mode=\"residuals\").alias(\"residuals_1\"), \n",
" pls.compute_least_squares_from_formula(\"y ~ x1 + x2:x3 -1\", mode=\"residuals\").alias(\"residuals_1\"),\n",
" (pl.col(\"y\") - pl.col(\"y\").least_squares.from_formula(\"x1 + x2:x3 -1\", mode=\"predictions\")).alias(\"residuals_2\"),\n",
").corr()"
]
@@ -1331,10 +1333,10 @@
],
"source": [
"df.select(\n",
" pl.col(\"y\").least_squares.rolling_ols(\"x1\", \"x2\", \"x3\", \n",
" window_size=252, \n",
" min_periods=5, \n",
" alpha=0.0001, \n",
" pl.col(\"y\").least_squares.rolling_ols(\"x1\", \"x2\", \"x3\",\n",
" window_size=252,\n",
" min_periods=5,\n",
" alpha=0.0001,\n",
" mode=\"coefficients\"\n",
" ).over(\"group\").alias(\"rolling_ridge_coef\"),\n",
" pl.col(\"y\").least_squares.rls(\n",
@@ -1344,7 +1346,7 @@
" initial_state_covariance=10.0, # inversely proportional to L2 prior towards prior mean\n",
" mode=\"coefficients\",\n",
" ).over(\"group\").alias(\"recursive_least_squares_coef\"),\n",
" pl.col(\"y\").least_squares.expanding_ols(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"), \n",
" pl.col(\"y\").least_squares.expanding_ols(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"),\n",
" mode=\"predictions\").alias(\"expanding_ols_pred\"),\n",
")"
]
@@ -1646,7 +1648,7 @@
"source": [
"# do multi-target .. it works\n",
"df_multi_target.with_columns(\n",
" pl.col(\"y\").least_squares.multi_target_ols(\"x1\", \"x2\", \"x3\", \n",
" pl.col(\"y\").least_squares.multi_target_ols(\"x1\", \"x2\", \"x3\",\n",
" sample_weights=\"sample_weights\",\n",
" mode=\"residuals\",\n",
" ).over(\"group\").alias(\"residuals\")\n",
2 changes: 1 addition & 1 deletion polars_ols/least_squares.py
@@ -16,10 +16,10 @@
get_args,
)

import polars as pl
from polars.plugins import register_plugin_function

from polars_ols.utils import build_expressions_from_patsy_formula, parse_into_expr
import polars as pl

if TYPE_CHECKING:
ExprOrStr = Union[pl.Expr, str]
29 changes: 22 additions & 7 deletions src/expressions.rs
@@ -4,7 +4,10 @@ use ndarray::{s, Array, Array1, Array2, Axis};
use polars::datatypes::{DataType, Field, Float64Type};
use polars::error::{polars_err, PolarsResult};
use polars::frame::DataFrame;
use polars::prelude::{lit, BooleanChunked, FillNullStrategy, IndexOrder, IntoLazy, IntoSeries, NamedFrom, NamedFromOwned, Series, NULL, ChunkFillNullValue};
use polars::prelude::{
lit, BooleanChunked, ChunkFillNullValue, FillNullStrategy, IndexOrder, IntoLazy, IntoSeries,
NamedFrom, NamedFromOwned, Series, NULL,
};
use pyo3_polars::derive::polars_expr;
use serde::Deserialize;
use std::str::FromStr;
@@ -50,7 +53,8 @@ fn construct_features_array(inputs: &[Series], fill_zero: bool) -> Array2<f64> {
.fill_null_with_values(f64::NAN)
.unwrap()
.rechunk();
let array = s.to_ndarray()
let array = s
.to_ndarray()
.expect("Failed to convert f64 series to ndarray");
col.assign(&array);
}
@@ -441,7 +445,6 @@ fn least_squares_coefficients(inputs: &[Series], kwargs: OLSKwargs) -> PolarsRes
Ok(series.with_name("coefficients"))
}


fn statistics_struct_dtype(_input_fields: &[Field]) -> PolarsResult<Field> {
// Define the fields for the statistics struct
let fields = vec![
@@ -450,7 +453,10 @@ fn statistics_struct_dtype(_input_fields: &[Field]) -> PolarsResult<Field> {
Field::new("mse", DataType::Float64),
Field::new("feature_names", DataType::List(Box::new(DataType::String))),
Field::new("coefficients", DataType::List(Box::new(DataType::Float64))),
Field::new("standard_errors", DataType::List(Box::new(DataType::Float64))),
Field::new(
"standard_errors",
DataType::List(Box::new(DataType::Float64)),
),
Field::new("t_values", DataType::List(Box::new(DataType::Float64))),
Field::new("p_values", DataType::List(Box::new(DataType::Float64))),
];
@@ -482,9 +488,18 @@ fn least_squares_statistics(inputs: &[Series], kwargs: OLSKwargs) -> PolarsResul
Series::new("mse", &[residual_metrics.mse]),
Series::new("feature_names", [feature_names]),
Series::new("coefficients", [coefficients.iter().collect::<Series>()]),
Series::new("standard_errors", [feature_metrics.standard_errors.iter().collect::<Series>()]),
Series::new("t_values", [feature_metrics.t_values.iter().collect::<Series>()]),
Series::new("p_values", [feature_metrics.p_values.iter().collect::<Series>()]),
Series::new(
"standard_errors",
[feature_metrics.standard_errors.iter().collect::<Series>()],
),
Series::new(
"t_values",
[feature_metrics.t_values.iter().collect::<Series>()],
),
Series::new(
"p_values",
[feature_metrics.p_values.iter().collect::<Series>()],
),
])?;

// Convert DataFrame to a Series of struct dtype
14 changes: 6 additions & 8 deletions src/statistics.rs
@@ -3,8 +3,7 @@ use faer::Side::Lower;
use faer_ext::{IntoFaer, IntoNdarray};
use ndarray::{Array1, Array2};
use ndarray_linalg::Trace;
use statrs::distribution::{StudentsT, ContinuousCDF};

use statrs::distribution::{ContinuousCDF, StudentsT};

pub struct ResidualMetrics {
pub mse: f64,
@@ -43,11 +42,10 @@ pub struct FeatureMetrics {
pub p_values: Array1<f64>,
}


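/// Two-sided p-value for a Student's t statistic: 2 * (1 - CDF(|t|)), with `df` degrees of freedom.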
fn t_value_to_p_value(t_value: f64, df: f64) -> f64 {
let t_dist = StudentsT::new(0.0, 1.0, df).expect("Invalid parameters for StudentT distribution");
let p_value = 2.0 * (1.0 - t_dist.cdf(t_value.abs()));
p_value
let t_dist =
StudentsT::new(0.0, 1.0, df).expect("Invalid parameters for StudentT distribution");
2.0 * (1.0 - t_dist.cdf(t_value.abs()))
}

/// Computes standard errors and t-values for regression coefficients using Ridge Regression.
@@ -121,11 +119,11 @@ pub fn compute_feature_metrics(
.map(|(&coef, &se)| coef / se)
.collect::<Array1<f64>>();

let p_values: Array1<f64> = t_values.iter()
let p_values: Array1<f64> = t_values
.iter()
.map(|&t| t_value_to_p_value(t, df))
.collect();


FeatureMetrics {
standard_errors,
t_values,
2 changes: 1 addition & 1 deletion tests/benchmark.py
@@ -34,7 +34,7 @@ def _make_data(
w *= sparsity_mask
y = x @ w + eps
return features.with_columns(
pl.lit(y).list.to_struct(fields=[f"y{i}" for i in range(n_targets)]).alias("y")
pl.DataFrame(y, schema=[f"y{i}" for i in range(n_targets)]).to_struct("y")
)


