Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust, python): Add Run-length Encoding functions #9826

Merged
merged 9 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ fused = ["polars-ops/fused", "polars-lazy/fused"]
list_sets = ["polars-lazy/list_sets"]
list_any_all = ["polars-lazy/list_any_all"]
cutqcut = ["polars-lazy/cutqcut"]
rle = ["polars-lazy/rle"]

test = [
"lazy",
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ fused = ["polars-plan/fused", "polars-ops/fused"]
list_sets = ["polars-plan/list_sets", "polars-ops/list_sets"]
list_any_all = ["polars-ops/list_any_all", "polars-plan/list_any_all"]
cutqcut = ["polars-plan/cutqcut", "polars-ops/cutqcut"]
rle = ["polars-plan/rle", "polars-ops/rle"]

binary_encoding = ["polars-plan/binary_encoding"]

Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ fused = []
list_sets = ["polars-ops/list_sets"]
list_any_all = ["polars-ops/list_any_all"]
cutqcut = ["polars-ops/cutqcut"]
rle = ["polars-ops/rle"]

bigidx = ["polars-arrow/bigidx", "polars-core/bigidx", "polars-utils/bigidx"]

Expand Down
14 changes: 14 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ pub(super) use list::ListFunction;
use polars_core::prelude::*;
#[cfg(feature = "cutqcut")]
use polars_ops::prelude::{cut, qcut};
#[cfg(feature = "rle")]
use polars_ops::prelude::{rle, rle_id};
#[cfg(feature = "random")]
pub(crate) use random::RandomMethod;
use schema::FieldsMapper;
Expand Down Expand Up @@ -215,6 +217,10 @@ pub enum FunctionExpr {
allow_duplicates: bool,
include_breaks: bool,
},
#[cfg(feature = "rle")]
RLE,
#[cfg(feature = "rle")]
RLEID,
ToPhysical,
#[cfg(feature = "random")]
Random {
Expand Down Expand Up @@ -322,6 +328,10 @@ impl Display for FunctionExpr {
Cut { .. } => "cut",
#[cfg(feature = "cutqcut")]
QCut { .. } => "qcut",
#[cfg(feature = "rle")]
RLE => "rle",
#[cfg(feature = "rle")]
RLEID => "rle_id",
ToPhysical => "to_physical",
#[cfg(feature = "random")]
Random { method, .. } => method.into(),
Expand Down Expand Up @@ -579,6 +589,10 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
allow_duplicates,
include_breaks
),
#[cfg(feature = "rle")]
RLE => map!(rle),
#[cfg(feature = "rle")]
RLEID => map!(rle_id),
ToPhysical => map!(dispatch::to_physical),
#[cfg(feature = "random")]
Random {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,15 @@ impl FunctionExpr {
Cut { .. } => mapper.with_dtype(DataType::Categorical(None)),
#[cfg(feature = "cutqcut")]
QCut { .. } => mapper.with_dtype(DataType::Categorical(None)),
#[cfg(feature = "rle")]
RLE => mapper.map_dtype(|dt| {
DataType::Struct(vec![
Field::new("lengths", DataType::UInt64),
Field::new("values", dt.clone()),
])
}),
#[cfg(feature = "rle")]
RLEID => mapper.with_dtype(DataType::UInt32),
ToPhysical => mapper.to_physical_type(),
#[cfg(feature = "random")]
Random { .. } => mapper.with_same_dtype(),
Expand Down
9 changes: 9 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,15 @@ impl Expr {
})
}

#[cfg(feature = "rle")]
pub fn rle(self) -> Expr {
self.apply_private(FunctionExpr::RLE)
}
#[cfg(feature = "rle")]
pub fn rle_id(self) -> Expr {
self.apply_private(FunctionExpr::RLEID)
}

#[cfg(feature = "diff")]
pub fn diff(self, n: i64, null_behavior: NullBehavior) -> Expr {
self.apply_private(FunctionExpr::Diff(n, null_behavior))
Expand Down
1 change: 1 addition & 0 deletions polars/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ is_unique = []
approx_unique = []
fused = []
cutqcut = ["dtype-categorical", "dtype-struct"]
rle = ["dtype-struct"]

# extra utilities for BinaryChunked
binary_encoding = ["base64", "hex"]
Expand Down
4 changes: 4 additions & 0 deletions polars/polars-ops/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ mod is_first;
mod is_unique;
#[cfg(feature = "log")]
mod log;
#[cfg(feature = "rle")]
mod rle;
#[cfg(feature = "rolling_window")]
mod rolling;
#[cfg(feature = "search_sorted")]
Expand All @@ -39,6 +41,8 @@ pub use is_unique::*;
#[cfg(feature = "log")]
pub use log::*;
use polars_core::prelude::*;
#[cfg(feature = "rle")]
pub use rle::*;
#[cfg(feature = "rolling_window")]
pub use rolling::*;
#[cfg(feature = "search_sorted")]
Expand Down
40 changes: 40 additions & 0 deletions polars/polars-ops/src/series/ops/rle.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use polars_arrow::trusted_len::TrustedLenPush;
use polars_core::prelude::*;

pub fn rle(s: &Series) -> PolarsResult<Series> {
let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len()));
let s_neq = s1.not_equal_missing(&s2)?;
let n_runs = s_neq.sum().unwrap() + 1;
let mut lengths = Vec::with_capacity(n_runs as usize);
lengths.push(1);
let mut vals = Series::new_empty("values", s.dtype());
let vals = vals.extend(&s.head(Some(1)))?.extend(&s2.filter(&s_neq)?)?;
let mut idx = 0;
for v in s_neq.into_iter() {
if v.unwrap() {
idx += 1;
lengths.push(1);
} else {
lengths[idx] += 1;
}
}

let outvals = vec![Series::from_vec("lengths", lengths), vals.to_owned()];
Ok(StructChunked::new("rle", &outvals)?.into_series())
}

pub fn rle_id(s: &Series) -> PolarsResult<Series> {
let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len()));
let s_neq = s1.not_equal_missing(&s2)?;

let mut out = Vec::with_capacity(s.len());
out.push(0); // Run numbers start at zero
s_neq
.downcast_iter()
.for_each(|a| out.extend(a.values_iter().map(|v| v as u32)));
out.iter_mut().fold(0, |a, x| {
Copy link
Member

@ritchie46 ritchie46 Jul 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Almost there. :)

I think we can do this in a single pass. Something like this:

 
   // keep track of last written value
   let mut last_value = 0u32;

   s_neq
        .downcast_iter()
        .for_each(|a| {
            let iter = a.values_iter();

           for v in iter {
              let v = v as u32;
              out.push(last_value + v );
              last_value = v;
           }
});

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I thought I was into micro-optimizations :-)
Ideally we could do everything, even the comparisons, in a single pass and only store what we need but I didn't see a clear way to do that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we could do everything, even the comparisons, in a single pass and only store what we need but I didn't see a clear way to do that.

Yes, that would require us to go down into the known types with some generics. We could follow up with that. The benefit of this implementation is that it has little compiler bloat.

And I thought I was into micro-optimizations :-)

Haha, I have put a lot of time in making ensuring what we advoncate. A fast dataframe library. I cannot unsee the potential branches, cache misses and allocations ^^

*x += a;
*x
});
Ok(Series::from_vec("id", out))
}
2 changes: 2 additions & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ binary_encoding = ["polars/binary_encoding"]
list_sets = ["polars-lazy/list_sets"]
list_any_all = ["polars/list_any_all"]
cutqcut = ["polars/cutqcut"]
rle = ["polars/rle"]

all = [
"json",
Expand Down Expand Up @@ -107,6 +108,7 @@ all = [
"list_sets",
"list_any_all",
"cutqcut",
"rle",
]

# we cannot conditionally activate simd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Manipulation/selection
Expr.repeat_by
Expr.reshape
Expr.reverse
Expr.rle
Expr.rle_id
Expr.round
Expr.sample
Expr.shift
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series/modify_select.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ Manipulation/selection
Series.rename
Series.reshape
Series.reverse
Series.rle
Series.rle_id
Series.round
Series.sample
Series.set
Expand Down
57 changes: 57 additions & 0 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3408,6 +3408,63 @@ def qcut(
)
)

def rle(self) -> Self:
"""
Get the lengths of runs of identical values.

Returns
-------
A Struct Series containing "lengths" and "values" Fields

Examples
--------
>>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]))
>>> df.select(pl.col("s").rle()).unnest("s")
shape: (6, 2)
┌─────────┬────────┐
│ lengths ┆ values │
│ --- ┆ --- │
│ i32 ┆ i64 │
╞═════════╪════════╡
│ 2 ┆ 1 │
│ 1 ┆ 2 │
│ 1 ┆ 1 │
│ 1 ┆ null │
│ 1 ┆ 1 │
│ 2 ┆ 3 │
└─────────┴────────┘
"""
return self._from_pyexpr(self._pyexpr.rle())

def rle_id(self) -> Self:
"""
Map values to run IDs.

Similar to RLE, but it maps each value to an ID corresponding to the run into
which it falls. This is especially useful when you want to define groups by
runs of identical values rather than the values themselves.


Examples
--------
>>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"]))
>>> # It works on structs of multiple values too!
>>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id())
shape: (5, 4)
┌─────┬──────┬─────┬──────┐
│ a ┆ b ┆ a_r ┆ ab_r │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ u32 ┆ u32 │
╞═════╪══════╪═════╪══════╡
│ 1 ┆ x ┆ 0 ┆ 0 │
│ 2 ┆ x ┆ 1 ┆ 1 │
│ 1 ┆ null ┆ 2 ┆ 2 │
│ 1 ┆ y ┆ 2 ┆ 3 │
│ 1 ┆ y ┆ 2 ┆ 3 │
└─────┴──────┴─────┴──────┘
"""
return self._from_pyexpr(self._pyexpr.rle_id())

def filter(self, predicate: Expr) -> Self:
"""
Filter a single column.
Expand Down
60 changes: 60 additions & 0 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1846,6 +1846,66 @@ def qcut(
return res.struct.rename_fields([break_point_label, category_label])
return res

def rle(self) -> Series:
"""
Get the lengths of runs of identical values.

Returns
-------
A Struct Series containing "lengths" and "values" Fields

Examples
--------
>>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])
>>> s.rle().struct.unnest()
shape: (6, 2)
┌─────────┬────────┐
│ lengths ┆ values │
│ --- ┆ --- │
│ i32 ┆ i64 │
╞═════════╪════════╡
│ 2 ┆ 1 │
│ 1 ┆ 2 │
│ 1 ┆ 1 │
│ 1 ┆ null │
│ 1 ┆ 1 │
│ 2 ┆ 3 │
└─────────┴────────┘
"""
return self.to_frame().select(F.col(self.name).rle()).to_series()

def rle_id(self) -> Series:
"""
Map values to run IDs.

Similar to RLE, but it maps each value to an ID corresponding to the run into
which it falls. This is especially useful when you want to define groups by
runs of identical values rather than the values themselves.

Returns
-------
Series


Examples
--------
>>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])
>>> s.rle_id()
shape: (8,)
Series: 's' [u32]
[
0
0
1
2
3
4
5
5
]
"""
return self.to_frame().select(F.col(self.name).rle_id()).to_series()

def hist(
self,
bins: list[float] | None = None,
Expand Down
9 changes: 9 additions & 0 deletions py-polars/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,15 @@ impl PyExpr {
.into()
}

#[cfg(feature = "rle")]
fn rle(&self) -> Self {
self.clone().inner.rle().into()
}
#[cfg(feature = "rle")]
fn rle_id(&self) -> Self {
self.clone().inner.rle_id().into()
}

fn agg_groups(&self) -> Self {
self.clone().inner.agg_groups().into()
}
Expand Down