Commit
fix lint, release v0.3.4
azmyrajab committed Aug 25, 2024
1 parent ece7bdd commit 9480630
Showing 7 changed files with 114 additions and 63 deletions.
30 changes: 29 additions & 1 deletion README.md
@@ -77,8 +77,9 @@ shape: (5, 7)
└───────┴───────┴───────┴───────┴─────────┴───────────────────┴─────────────────┘
```

The `mode` parameter is used to set the type of the output returned by all methods (`"predictions", "residuals", "coefficients"`).
The `mode` parameter is used to set the type of the output returned by all methods (`"predictions", "residuals", "coefficients", "statistics"`).
It defaults to returning predictions matching the input's length.
Note that `"statistics"` is currently only supported for OLS/WLS/Ridge models.
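
As a quick sketch (assuming a frame `df` with columns `y`, `x1`, `x2`), switching `mode` on the same expression switches the output:

```python
import polars as pl
import polars_ols as pls  # importing registers the `least_squares` namespace

out = df.select(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="predictions").alias("pred"),
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="residuals").alias("resid"),
)
```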

If `"coefficients"` is set, the output is a [polars Struct](https://docs.pola.rs/user-guide/expressions/structs/) with coefficients as values and feature names as fields.
Its output shape 'broadcasts' depending on context; see below:
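
For example, a minimal sketch of the two common broadcasting cases (a hypothetical frame with columns `y`, `x1`, `x2`, `group` is assumed):

```python
# select context: a single struct row of coefficients for the full sample
coef = df.select(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
).unnest("coefficients")  # one column per feature name

# window context: per-group coefficients, broadcast back to each group's rows
coef_by_group = df.with_columns(
    pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
    .over("group")
    .alias("coefficients_group")
)
```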
@@ -137,6 +138,33 @@ shape: (5, 6)
└───────┴───────┴───────┴───────┴─────────┴─────────────────────┘
```

For plain OLS/WLS and Ridge models, support has recently been added for producing a simple statistical significance report. It can be used as follows:

```python
statistics = (
    df.select(
        pl.col("y").least_squares.ols(pl.col("x1", "x2"), mode="statistics", add_intercept=True)
    )
    .unnest("statistics")  # results stored in a nested series by default
    .explode(["feature_names", "coefficients", "standard_errors", "t_values", "p_values"])
)

print(statistics)
```
```
shape: (3, 8)
┌─────────┬──────────┬─────────┬──────────────┬──────────────┬─────────────┬───────────┬───────────┐
│ r2 ┆ mae ┆ mse ┆ feature_name ┆ coefficients ┆ standard_er ┆ t_values ┆ p_values │
│ --- ┆ --- ┆ --- ┆ s ┆ --- ┆ rors ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 ┆ --- ┆ f64 ┆ f64 │
│ ┆ ┆ ┆ str ┆ ┆ f64 ┆ ┆ │
╞═════════╪══════════╪═════════╪══════════════╪══════════════╪═════════════╪═══════════╪═══════════╡
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ x1 ┆ 0.977375 ┆ 0.037286 ┆ 26.212765 ┆ 3.0095e-8 │
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ x2 ┆ 0.987413 ┆ 0.037321 ┆ 26.457169 ┆ 2.8218e-8 │
│ 0.99631 ┆ 0.061732 ┆ 0.00794 ┆ const ┆ 0.000757 ┆ 0.037474 ┆ 0.02021 ┆ 0.98444 │
└─────────┴──────────┴─────────┴──────────────┴──────────────┴─────────────┴───────────┴───────────┘
```
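
To sanity-check the report, the same quantities can be reproduced with statsmodels — a hedged sketch (statsmodels and the pandas conversion are assumptions here, not dependencies of this library):

```python
import statsmodels.api as sm

pdf = df.to_pandas()
X = sm.add_constant(pdf[["x1", "x2"]])  # mirrors add_intercept=True
fit = sm.OLS(pdf["y"], X).fit()
# fit.params, fit.bse, fit.tvalues, fit.pvalues should line up with the report above
print(fit.summary())
```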

Finally, for convenience, you can compute out-of-sample predictions with ```least_squares.{predict, predict_from_formula}```. This saves you the effort of un-nesting the coefficients and computing the dot product in Python: it is done in Rust instead, as an expression. Usage is as follows:
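A hedged sketch (the exact snippet lives further down in the repository README; a struct column of fitted coefficients, e.g. fit per group as above, is assumed):

```python
# dot product of stored coefficients with features, computed in Rust
df.with_columns(
    pl.col("coefficients_group")
    .least_squares.predict(pl.col("x1"), pl.col("x2"))
    .alias("predictions_oos")
)
```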
34 changes: 18 additions & 16 deletions notebooks/polars_ols_demo.ipynb
@@ -15,7 +15,9 @@
},
"outputs": [],
"source": [
"import os; os.environ[\"POLARS_VERBOSE\"] = \"1\"\n",
"# ruff: noqa \n",
"import os\n",
"os.environ[\"POLARS_VERBOSE\"] = \"1\"\n",
"import polars as pl\n",
"import polars_ols as pls\n",
"import numpy as np"
@@ -41,7 +43,7 @@
"def insert_nulls(val):\n",
" return None if rng.random() < 0.1 else val\n",
"\n",
"def _make_data(n_samples: int = 2_000, \n",
"def _make_data(n_samples: int = 2_000,\n",
" n_features: int = 5,\n",
" n_groups: int = 5,\n",
" noise: float = 0.1,\n",
@@ -57,7 +59,7 @@
" sample_weights=pl.lit(rng.uniform(0, 1, size=n_samples)),\n",
" )\n",
" if missing:\n",
" \n",
"\n",
" df = df.with_columns(\n",
" *(pl.col(c).map_elements(insert_nulls, return_dtype=pl.Float64) for c in df.columns)\n",
" )\n",
@@ -542,7 +544,7 @@
],
"source": [
"pred_zero = df_missing.select(\n",
" pls.compute_least_squares(\"y\", pl.selectors.starts_with(\"x\"), \n",
" pls.compute_least_squares(\"y\", pl.selectors.starts_with(\"x\"),\n",
" mode=\"predictions\",\n",
" ols_kwargs=pls.OLSKwargs(null_policy=\"zero\"))\n",
")\n",
@@ -601,7 +603,7 @@
],
"source": [
"coef_drop = df_missing.select(\n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\", \n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\",\n",
" mode=\"coefficients\",\n",
" null_policy=\"drop\")\n",
")\n",
@@ -655,7 +657,7 @@
],
"source": [
"coef_drop_y = df_missing.select(\n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\", \n",
" pl.col(\"y\").least_squares.ols(\"x1\", \"x2\",\n",
" mode=\"coefficients\",\n",
" null_policy=\"drop_y_zero_x\")\n",
")\n",
@@ -738,7 +740,7 @@
" df\n",
" .select(\"x1\", \"x2\", \"x3\")\n",
" .with_columns(pl.col(\"x2\").alias(\"x3\")) # I add a totally collinear feature x3 which is a copy of x2\n",
" .with_columns(pl.sum_horizontal(pl.col(\"^x.*$\")).alias(\"y\")) # y is the sum of x1, .., x3 \n",
" .with_columns(pl.sum_horizontal(pl.col(\"^x.*$\")).alias(\"y\")) # y is the sum of x1, .., x3\n",
")"
]
},
@@ -1015,7 +1017,7 @@
" ).alias(\"coef_enet_non_negative\")\n",
"\n",
"ridge_expr = pl.col(\"y\").least_squares.ridge(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"),\n",
" alpha=100.0, \n",
" alpha=100.0,\n",
" sample_weights=pl.col(\"sample_weights\"),\n",
" mode=\"coefficients\").alias(\"coef_ridge\")\n",
"\n",
@@ -1083,7 +1085,7 @@
},
"outputs": [],
"source": [
"# implementation matches sklearn \n",
"# implementation matches sklearn\n",
"coef = df_wide.select(pl.col(\"y\").least_squares.elastic_net(\n",
" *features,\n",
" l1_ratio=0.5,\n",
@@ -1230,7 +1232,7 @@
"# compute the residuals in two equivalent ways\n",
"df.select(\n",
" # \"x2:x3\" denotes multiplicative interaction, \"-1\" dentotes no intercept\n",
" pls.compute_least_squares_from_formula(\"y ~ x1 + x2:x3 -1\", mode=\"residuals\").alias(\"residuals_1\"), \n",
" pls.compute_least_squares_from_formula(\"y ~ x1 + x2:x3 -1\", mode=\"residuals\").alias(\"residuals_1\"),\n",
" (pl.col(\"y\") - pl.col(\"y\").least_squares.from_formula(\"x1 + x2:x3 -1\", mode=\"predictions\")).alias(\"residuals_2\"),\n",
").corr()"
]
@@ -1331,10 +1333,10 @@
],
"source": [
"df.select(\n",
" pl.col(\"y\").least_squares.rolling_ols(\"x1\", \"x2\", \"x3\", \n",
" window_size=252, \n",
" min_periods=5, \n",
" alpha=0.0001, \n",
" pl.col(\"y\").least_squares.rolling_ols(\"x1\", \"x2\", \"x3\",\n",
" window_size=252,\n",
" min_periods=5,\n",
" alpha=0.0001,\n",
" mode=\"coefficients\"\n",
" ).over(\"group\").alias(\"rolling_ridge_coef\"),\n",
" pl.col(\"y\").least_squares.rls(\n",
@@ -1344,7 +1346,7 @@
" initial_state_covariance=10.0, # inversely proportional to L2 prior towards prior mean\n",
" mode=\"coefficients\",\n",
" ).over(\"group\").alias(\"recursive_least_squares_coef\"),\n",
" pl.col(\"y\").least_squares.expanding_ols(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"), \n",
" pl.col(\"y\").least_squares.expanding_ols(pl.col(\"x1\"), pl.col(\"x2\"), pl.col(\"x3\"),\n",
" mode=\"predictions\").alias(\"expanding_ols_pred\"),\n",
")"
]
@@ -1646,7 +1648,7 @@
"source": [
"# do multi-target .. it works\n",
"df_multi_target.with_columns(\n",
" pl.col(\"y\").least_squares.multi_target_ols(\"x1\", \"x2\", \"x3\", \n",
" pl.col(\"y\").least_squares.multi_target_ols(\"x1\", \"x2\", \"x3\",\n",
" sample_weights=\"sample_weights\",\n",
" mode=\"residuals\",\n",
" ).over(\"group\").alias(\"residuals\")\n",
2 changes: 1 addition & 1 deletion polars_ols/least_squares.py
@@ -16,10 +16,10 @@
get_args,
)

import polars as pl
from polars.plugins import register_plugin_function

from polars_ols.utils import build_expressions_from_patsy_formula, parse_into_expr
import polars as pl

if TYPE_CHECKING:
ExprOrStr = Union[pl.Expr, str]
29 changes: 22 additions & 7 deletions src/expressions.rs
@@ -4,7 +4,10 @@ use ndarray::{s, Array, Array1, Array2, Axis};
use polars::datatypes::{DataType, Field, Float64Type};
use polars::error::{polars_err, PolarsResult};
use polars::frame::DataFrame;
use polars::prelude::{lit, BooleanChunked, FillNullStrategy, IndexOrder, IntoLazy, IntoSeries, NamedFrom, NamedFromOwned, Series, NULL, ChunkFillNullValue};
use polars::prelude::{
lit, BooleanChunked, ChunkFillNullValue, FillNullStrategy, IndexOrder, IntoLazy, IntoSeries,
NamedFrom, NamedFromOwned, Series, NULL,
};
use pyo3_polars::derive::polars_expr;
use serde::Deserialize;
use std::str::FromStr;
@@ -50,7 +53,8 @@ fn construct_features_array(inputs: &[Series], fill_zero: bool) -> Array2<f64> {
.fill_null_with_values(f64::NAN)
.unwrap()
.rechunk();
let array = s.to_ndarray()
let array = s
.to_ndarray()
.expect("Failed to convert f64 series to ndarray");
col.assign(&array);
}
@@ -441,7 +445,6 @@ fn least_squares_coefficients(inputs: &[Series], kwargs: OLSKwargs) -> PolarsRes
Ok(series.with_name("coefficients"))
}


fn statistics_struct_dtype(_input_fields: &[Field]) -> PolarsResult<Field> {
// Define the fields for the statistics struct
let fields = vec![
@@ -450,7 +453,10 @@ fn statistics_struct_dtype(_input_fields: &[Field]) -> PolarsResult<Field> {
Field::new("mse", DataType::Float64),
Field::new("feature_names", DataType::List(Box::new(DataType::String))),
Field::new("coefficients", DataType::List(Box::new(DataType::Float64))),
Field::new("standard_errors", DataType::List(Box::new(DataType::Float64))),
Field::new(
"standard_errors",
DataType::List(Box::new(DataType::Float64)),
),
Field::new("t_values", DataType::List(Box::new(DataType::Float64))),
Field::new("p_values", DataType::List(Box::new(DataType::Float64))),
];
@@ -482,9 +488,18 @@ fn least_squares_statistics(inputs: &[Series], kwargs: OLSKwargs) -> PolarsResul
Series::new("mse", &[residual_metrics.mse]),
Series::new("feature_names", [feature_names]),
Series::new("coefficients", [coefficients.iter().collect::<Series>()]),
Series::new("standard_errors", [feature_metrics.standard_errors.iter().collect::<Series>()]),
Series::new("t_values", [feature_metrics.t_values.iter().collect::<Series>()]),
Series::new("p_values", [feature_metrics.p_values.iter().collect::<Series>()]),
Series::new(
"standard_errors",
[feature_metrics.standard_errors.iter().collect::<Series>()],
),
Series::new(
"t_values",
[feature_metrics.t_values.iter().collect::<Series>()],
),
Series::new(
"p_values",
[feature_metrics.p_values.iter().collect::<Series>()],
),
])?;

// Convert DataFrame to a Series of struct dtype
14 changes: 6 additions & 8 deletions src/statistics.rs
@@ -3,8 +3,7 @@ use faer::Side::Lower;
use faer_ext::{IntoFaer, IntoNdarray};
use ndarray::{Array1, Array2};
use ndarray_linalg::Trace;
use statrs::distribution::{StudentsT, ContinuousCDF};

use statrs::distribution::{ContinuousCDF, StudentsT};

pub struct ResidualMetrics {
pub mse: f64,
@@ -43,11 +42,10 @@ pub struct FeatureMetrics {
pub p_values: Array1<f64>,
}


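/// Two-sided p-value for a Student's t statistic: 2 * (1 - CDF(|t|)), with `df` degrees of freedom.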
fn t_value_to_p_value(t_value: f64, df: f64) -> f64 {
let t_dist = StudentsT::new(0.0, 1.0, df).expect("Invalid parameters for StudentT distribution");
let p_value = 2.0 * (1.0 - t_dist.cdf(t_value.abs()));
p_value
let t_dist =
StudentsT::new(0.0, 1.0, df).expect("Invalid parameters for StudentT distribution");
2.0 * (1.0 - t_dist.cdf(t_value.abs()))
}

/// Computes standard errors and t-values for regression coefficients using Ridge Regression.
@@ -121,11 +119,11 @@ pub fn compute_feature_metrics(
.map(|(&coef, &se)| coef / se)
.collect::<Array1<f64>>();

let p_values: Array1<f64> = t_values.iter()
let p_values: Array1<f64> = t_values
.iter()
.map(|&t| t_value_to_p_value(t, df))
.collect();


FeatureMetrics {
standard_errors,
t_values,
2 changes: 1 addition & 1 deletion tests/benchmark.py
@@ -34,7 +34,7 @@ def _make_data(
w *= sparsity_mask
y = x @ w + eps
return features.with_columns(
pl.lit(y).list.to_struct(fields=[f"y{i}" for i in range(n_targets)]).alias("y")
pl.DataFrame(y, schema=[f"y{i}" for i in range(n_targets)]).to_struct("y")
)


