diff --git a/polars/polars-algo/src/algo.rs b/polars/polars-algo/src/algo.rs index 10fc6d309b32..669bdd6c995c 100644 --- a/polars/polars-algo/src/algo.rs +++ b/polars/polars-algo/src/algo.rs @@ -99,153 +99,3 @@ pub fn hist(s: &Series, bins: Option<&Series>, bin_count: Option) -> Resu .fill_null(FillNullStrategy::Zero)? .sort(["category"], false) } - -pub fn qcut( - s: &Series, - quantiles: &[f64], - labels: Option>, - break_point_label: Option<&str>, - category_label: Option<&str>, - maintain_order: bool, -) -> PolarsResult { - let s = s.cast(&DataType::Float64)?; - - // amortize quantile computation - let s_sorted = s.sort(false); - let ca = s_sorted.f64().unwrap(); - - let mut bins = Vec::with_capacity(quantiles.len()); - for quantile_level in quantiles { - if let Some(quantile) = ca.quantile(*quantile_level, QuantileInterpolOptions::Linear)? { - bins.push(quantile) - } - } - - let bins = Series::new("", bins); - if maintain_order { - cut( - &s, - bins, - labels, - break_point_label, - category_label, - maintain_order, - ) - } else { - // already sorted, saves an extra sort - cut( - &s_sorted, - bins, - labels, - break_point_label, - category_label, - maintain_order, - ) - } -} - -pub fn cut( - s: &Series, - mut bins: Series, - labels: Option>, - break_point_label: Option<&str>, - category_label: Option<&str>, - maintain_order: bool, -) -> PolarsResult { - let var_name = s.name(); - let breakpoint_str = break_point_label.unwrap_or("break_point"); - let category_str = category_label.unwrap_or("category"); - - let bins_len = bins.len(); - - bins.rename(breakpoint_str); - - let mut s_bins = bins - .cast(&DataType::Float64) - .map_err(|_| PolarsError::ComputeError("expected numeric bins".into()))? 
- .extend_constant(AnyValue::Float64(f64::INFINITY), 1)?; - s_bins.set_sorted_flag(IsSorted::Ascending); - let cuts_df = df![ - breakpoint_str => s_bins - ]?; - - let cuts_df = if let Some(labels) = labels { - polars_ensure!( - labels.len() == (bins_len + 1), - ShapeMismatch: "labels count must equal bins count", - ); - cuts_df - .lazy() - .with_column(lit(Series::new(category_str, labels))) - } else { - cuts_df.lazy().with_column( - format_str( - "({}, {}]", - [ - col(breakpoint_str).shift_and_fill(1, lit(f64::NEG_INFINITY)), - col(breakpoint_str), - ], - )? - .alias(category_str), - ) - } - .collect()?; - - const ROW_COUNT: &str = "__POLARS_IDX"; - - let cuts = cuts_df - .lazy() - .with_columns([col(category_str).cast(DataType::Categorical(None))]) - .collect()?; - - let mut s = s.cast(&DataType::Float64)?; - let valids = if s.null_count() > 0 { - let valids = Some(s.is_not_null()); - s = s.fill_null(FillNullStrategy::MaxBound).unwrap(); - valids - } else { - None - }; - let mut frame = s.clone().into_frame(); - - if maintain_order { - frame = frame.with_row_count(ROW_COUNT, None)?; - } - - let mut out = frame.sort(vec![var_name], vec![false])?.join_asof( - &cuts, - var_name, - breakpoint_str, - AsofStrategy::Forward, - None, - None, - )?; - - if maintain_order { - out = out.sort([ROW_COUNT], false)?.drop(ROW_COUNT).unwrap() - }; - - if let Some(mut valids) = valids { - if !maintain_order { - let idx = s.arg_sort(SortOptions { - nulls_last: true, - ..Default::default() - }); - valids = unsafe { valids.take_unchecked((&idx).into()) }; - } - - let arr = valids.downcast_iter().next().unwrap(); - let validity = arr.values().clone(); - - // Safety: we don't change the length/dtype - unsafe { - for col in out.get_columns_mut() { - let mut s = col.rechunk(); - let chunks = s.chunks_mut(); - chunks[0] = chunks[0].with_validity(Some(validity.clone())); - *col = s; - } - } - } - Ok(out) -} diff --git a/polars/polars-algo/src/prelude.rs b/polars/polars-algo/src/prelude.rs 
index 52bf2b7b7104..b3fe7e05e3f0 100644 --- a/polars/polars-algo/src/prelude.rs +++ b/polars/polars-algo/src/prelude.rs @@ -1 +1 @@ -pub use crate::{cut, hist}; +pub use crate::hist; diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs index 7668e6760444..60d74429202e 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs @@ -205,6 +205,7 @@ pub enum FunctionExpr { breaks: Vec, labels: Option>, left_closed: bool, + include_breaks: bool, }, #[cfg(feature = "cutqcut")] QCut { @@ -212,6 +213,7 @@ pub enum FunctionExpr { labels: Option>, left_closed: bool, allow_duplicates: bool, + include_breaks: bool, }, ToPhysical, #[cfg(feature = "random")] @@ -554,19 +556,28 @@ impl From for SpecialEq> { breaks, labels, left_closed, - } => map!(cut, breaks.clone(), labels.clone(), left_closed), + include_breaks, + } => map!( + cut, + breaks.clone(), + labels.clone(), + left_closed, + include_breaks + ), #[cfg(feature = "cutqcut")] QCut { probs, labels, left_closed, allow_duplicates, + include_breaks, } => map!( qcut, probs.clone(), labels.clone(), left_closed, - allow_duplicates + allow_duplicates, + include_breaks ), ToPhysical => map!(dispatch::to_physical), #[cfg(feature = "random")] diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs index 587e322e12d1..bf99b8e88556 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs @@ -1465,11 +1465,18 @@ impl Expr { } #[cfg(feature = "cutqcut")] - pub fn cut(self, breaks: Vec, labels: Option>, left_closed: bool) -> Expr { + pub fn cut( + self, + breaks: Vec, + labels: Option>, + left_closed: bool, + include_breaks: bool, + ) -> Expr { self.apply_private(FunctionExpr::Cut { breaks, labels, left_closed, + include_breaks, }) } @@ -1480,12 +1487,14 @@ impl Expr { 
labels: Option>, left_closed: bool, allow_duplicates: bool, + include_breaks: bool, ) -> Expr { self.apply_private(FunctionExpr::QCut { probs, labels, left_closed, allow_duplicates, + include_breaks, }) } diff --git a/polars/polars-ops/src/series/ops/cut.rs b/polars/polars-ops/src/series/ops/cut.rs index 82be7be970b0..493f884168cc 100644 --- a/polars/polars-ops/src/series/ops/cut.rs +++ b/polars/polars-ops/src/series/ops/cut.rs @@ -1,12 +1,68 @@ +use std::cmp::PartialOrd; use std::iter::once; use polars_core::prelude::*; +fn map_cats( + s: &Series, + cutlabs: &[String], + sorted_breaks: &[f64], + left_closed: bool, + include_breaks: bool, +) -> PolarsResult { + let cl: Vec<&str> = cutlabs.iter().map(String::as_str).collect(); + + let out_name = format!("{}_bin", s.name()); + let mut bld = CategoricalChunkedBuilder::new(&out_name, s.len()); + let s2 = s.cast(&DataType::Float64)?; + // It would be nice to parallelize this + let s_iter = s2.f64()?.into_iter(); + + let op = if left_closed { + PartialOrd::ge + } else { + PartialOrd::gt + }; + + if include_breaks { + // This is to replicate the behavior of the old buggy version that only worked on series and + // returned a dataframe. That included a column of the right endpoint of the interval. So we + // return a struct series instead which can be turned into a dataframe later. 
+ let right_ends = [sorted_breaks, &[f64::INFINITY]].concat(); + let mut brk_vals = PrimitiveChunkedBuilder::::new("brk", s.len()); + s_iter + .map(|opt| { + opt.filter(|x| !x.is_nan()) + .map(|x| sorted_breaks.partition_point(|v| op(&x, v))) + }) + .for_each(|idx| match idx { + None => { + bld.append_null(); + brk_vals.append_null(); + } + Some(idx) => unsafe { + bld.append_value(cl.get_unchecked(idx)); + brk_vals.append_value(*right_ends.get_unchecked(idx)); + }, + }); + + let outvals = vec![brk_vals.finish().into_series(), bld.finish().into_series()]; + Ok(StructChunked::new(&out_name, &outvals)?.into_series()) + } else { + bld.drain_iter(s_iter.map(|opt| { + opt.filter(|x| !x.is_nan()) + .map(|x| unsafe { *cl.get_unchecked(sorted_breaks.partition_point(|v| op(&x, v))) }) + })); + Ok(bld.finish().into_series()) + } +} + pub fn cut( s: &Series, breaks: Vec, labels: Option>, left_closed: bool, + include_breaks: bool, ) -> PolarsResult { polars_ensure!(!breaks.is_empty(), ShapeMismatch: "Breaks are empty"); polars_ensure!(!breaks.iter().any(|x| x.is_nan()), ComputeError: "Breaks cannot be NaN"); @@ -36,24 +92,7 @@ pub fn cut( .collect::>(), }; - let cl: Vec<&str> = cutlabs.iter().map(String::as_str).collect(); - let s_flt = s.cast(&DataType::Float64)?; - let bin_iter = s_flt.f64()?.into_iter(); - - let out_name = format!("{}_bin", s.name()); - let mut bld = CategoricalChunkedBuilder::new(&out_name, s.len()); - unsafe { - if left_closed { - bld.drain_iter(bin_iter.map(|opt| { - opt.map(|x| *cl.get_unchecked(sorted_breaks.partition_point(|&v| x >= v))) - })); - } else { - bld.drain_iter(bin_iter.map(|opt| { - opt.map(|x| *cl.get_unchecked(sorted_breaks.partition_point(|&v| x > v))) - })); - } - } - Ok(bld.finish().into_series()) + map_cats(s, &cutlabs, sorted_breaks, left_closed, include_breaks) } pub fn qcut( @@ -62,6 +101,7 @@ pub fn qcut( labels: Option>, left_closed: bool, allow_duplicates: bool, + include_breaks: bool, ) -> PolarsResult { let s = 
s.cast(&DataType::Float64)?; let s2 = s.sort(false); @@ -94,7 +134,7 @@ pub fn qcut( } }; qbreaks.dedup(); - return cut(&s, qbreaks, lfilt, left_closed); + return cut(&s, qbreaks, lfilt, left_closed, include_breaks); } - cut(&s, qbreaks, labels, left_closed) + cut(&s, qbreaks, labels, left_closed, include_breaks) } diff --git a/py-polars/docs/source/reference/expressions/modify_select.rst b/py-polars/docs/source/reference/expressions/modify_select.rst index ea7d45a142be..14b67ebc4a7f 100644 --- a/py-polars/docs/source/reference/expressions/modify_select.rst +++ b/py-polars/docs/source/reference/expressions/modify_select.rst @@ -16,6 +16,7 @@ Manipulation/selection Expr.clip Expr.clip_max Expr.clip_min + Expr.cut Expr.drop_nans Expr.drop_nulls Expr.explode @@ -33,6 +34,7 @@ Manipulation/selection Expr.lower_bound Expr.map_dict Expr.pipe + Expr.qcut Expr.rechunk Expr.reinterpret Expr.repeat_by diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 9d1fcaae97fb..dce5ef67f675 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3239,6 +3239,7 @@ def cut( breaks: list[float], labels: list[str] | None = None, left_closed: bool = False, + include_breaks: bool = False, ) -> Self: """ Bin continuous values into discrete categories. @@ -3251,6 +3252,9 @@ def cut( Labels to assign to bins. If given, the length must be len(probs) + 1. left_closed Whether intervals should be [) instead of the default of (] + include_breaks + Include the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
Examples -------- @@ -3291,7 +3295,9 @@ def cut( │ b ┆ 9 ┆ [5, inf) │ └─────┴─────┴───────────┘ """ - return self._from_pyexpr(self._pyexpr.cut(breaks, labels, left_closed)) + return self._from_pyexpr( + self._pyexpr.cut(breaks, labels, left_closed, include_breaks) + ) def qcut( self, @@ -3299,6 +3305,7 @@ def qcut( labels: list[str] | None = None, left_closed: bool = False, allow_duplicates: bool = False, + include_breaks: bool = False, ) -> Self: """ Bin continuous values into discrete categories based on their quantiles. @@ -3317,6 +3324,9 @@ def qcut( If True, the resulting quantile breaks don't have to be unique. This can happen even with unique probs depending on the data. Duplicates will be dropped, resulting in fewer bins. + include_breaks + Include the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. Examples @@ -3374,10 +3384,28 @@ def qcut( │ b ┆ 8 ┆ hi │ │ b ┆ 9 ┆ hi │ └─────┴─────┴─────┘ - + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ """ return self._from_pyexpr( - self._pyexpr.qcut(probs, labels, left_closed, allow_duplicates) + self._pyexpr.qcut( + probs, labels, left_closed, allow_duplicates, include_breaks + ) ) def filter(self, predicate: Expr) -> Self: diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 8dd559367f20..d087767b828e 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1611,9 +1611,9 @@ 
def cut( break_point_label: str = "break_point", category_label: str = "category", *, - maintain_order: bool = False, - series: bool = False, + series: bool = True, left_closed: bool = False, + include_breaks: bool = False, ) -> DataFrame | Series: """ Bin continuous values into discrete categories. @@ -1626,15 +1626,20 @@ def cut( Labels to assign to the bins. If given the length of labels must be len(bins) + 1. break_point_label - Name given to the breakpoint column. Only used if series == False + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True category_label Name given to the category column. Only used if series == False maintain_order Keep the order of the original `Series`. Only used if series == False series - If True, return the a categorical series in the data's original order + If True, return a categorical series in the data's original order. left_closed Whether intervals should be [) instead of (] + include_breaks + Include the right endpoint of the bin each observation falls in. 
+ If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct Returns ------- @@ -1643,23 +1648,23 @@ def cut( Examples -------- >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) - >>> a.cut([-1, 1]) + >>> a.cut([-1, 1], series=False) shape: (12, 3) - ┌──────┬─────────────┬──────────────┐ - │ a ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ cat │ - ╞══════╪═════════════╪══════════════╡ - │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ - │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ - │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ - │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ - │ … ┆ … ┆ … │ - │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ - │ 1.5 ┆ inf ┆ (1.0, inf] │ - │ 2.0 ┆ inf ┆ (1.0, inf] │ - │ 2.5 ┆ inf ┆ (1.0, inf] │ - └──────┴─────────────┴──────────────┘ + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ >>> a.cut([-1, 1], series=True) shape: (12,) Series: 'a' [cat] @@ -1695,21 +1700,26 @@ def cut( "[1, inf)" ] """ - if series: + n = self._s.name() + + if not series: + # "Old style" always includes breaks return ( self.to_frame() - .select(F.col(self._s.name()).cut(bins, labels, left_closed)) - .to_series() - ) - return wrap_df( - self._s.cut( - Series(break_point_label, bins, dtype=Float64)._s, - labels, - break_point_label, - category_label, - maintain_order, + .with_columns( + F.col(n).cut(bins, labels, left_closed, True).alias(n + "_bin") + ) + .unnest(n + "_bin") + .rename({"brk": break_point_label, n + "_bin": category_label}) ) + res = ( + self.to_frame() + .select(F.col(n).cut(bins, labels, left_closed, include_breaks)) + .to_series() ) + if include_breaks: + return 
res.struct.rename_fields([break_point_label, category_label]) + return res def qcut( self, @@ -1718,10 +1728,10 @@ def qcut( labels: list[str] | None = None, break_point_label: str = "break_point", category_label: str = "category", - maintain_order: bool = False, series: bool = False, left_closed: bool = False, allow_duplicates: bool = False, + include_breaks: bool = False, ) -> DataFrame | Series: """ Bin continuous values into discrete categories based on their quantiles. @@ -1735,7 +1745,8 @@ def qcut( Labels to assign to the quantiles. If given the length of labels must be len(bins) + 1. break_point_label - Name given to the breakpoint column. Only used if series == False. + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True category_label Name given to the category column. Only used if series == False. maintain_order @@ -1748,6 +1759,10 @@ def qcut( If True, the resulting quantile breaks don't have to be unique. This can happen even with unique probs depending on the data. Duplicates will be dropped, resulting in fewer bins. + include_breaks + Include the right endpoint of the bin each observation falls in. 
+ If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct Returns ------- @@ -1761,22 +1776,22 @@ def qcut( Examples -------- >>> a = pl.Series("a", range(-5, 3)) - >>> a.qcut([0.0, 0.25, 0.75]) + >>> a.qcut([0.0, 0.25, 0.75], series=False) shape: (8, 3) - ┌──────┬─────────────┬───────────────┐ - │ a ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ cat │ - ╞══════╪═════════════╪═══════════════╡ - │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ - │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ - │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ - │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ - │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ - │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ - │ 1.0 ┆ inf ┆ (0.25, inf] │ - │ 2.0 ┆ inf ┆ (0.25, inf] │ - └──────┴─────────────┴───────────────┘ + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ >>> a.qcut([0.0, 0.25, 0.75], series=True) shape: (8,) Series: 'a' [cat] @@ -1804,25 +1819,32 @@ def qcut( "[0.25, inf)" ] """ - if series: + n = self._s.name() + + if not series: + # "Old style" always includes breaks return ( self.to_frame() - .select( - F.col(self._s.name()).qcut( - quantiles, labels, left_closed, allow_duplicates - ) + .with_columns( + F.col(n) + .qcut(quantiles, labels, left_closed, allow_duplicates, True) + .alias(n + "_bin") ) - .to_series() + .unnest(n + "_bin") + .rename({"brk": break_point_label, n + "_bin": category_label}) ) - return wrap_df( - self._s.qcut( - Series(quantiles, dtype=Float64)._s, - labels, - break_point_label, - category_label, - maintain_order, + res = ( + self.to_frame() + .select( + F.col(n).qcut( + quantiles, labels, left_closed, 
allow_duplicates, include_breaks + ) ) + .to_series() ) + if include_breaks: + return res.struct.rename_fields([break_point_label, category_label]) + return res def hist( self, diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index c69505b2f785..53242cd2abde 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -178,12 +178,21 @@ impl PyExpr { .into() } - #[pyo3(signature = (breaks, labels, left_closed))] + #[pyo3(signature = (breaks, labels, left_closed, include_breaks))] #[cfg(feature = "cutqcut")] - fn cut(&self, breaks: Vec, labels: Option>, left_closed: bool) -> Self { - self.inner.clone().cut(breaks, labels, left_closed).into() + fn cut( + &self, + breaks: Vec, + labels: Option>, + left_closed: bool, + include_breaks: bool, + ) -> Self { + self.inner + .clone() + .cut(breaks, labels, left_closed, include_breaks) + .into() } - #[pyo3(signature = (probs, labels, left_closed, allow_duplicates))] + #[pyo3(signature = (probs, labels, left_closed, allow_duplicates, include_breaks))] #[cfg(feature = "cutqcut")] fn qcut( &self, @@ -191,10 +200,11 @@ impl PyExpr { labels: Option>, left_closed: bool, allow_duplicates: bool, + include_breaks: bool, ) -> Self { self.inner .clone() - .qcut(probs, labels, left_closed, allow_duplicates) + .qcut(probs, labels, left_closed, allow_duplicates, include_breaks) .into() } diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index da315ab01a6a..9dd68daaa83c 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -6,7 +6,7 @@ mod export; mod numpy_ufunc; mod set_at_idx; -use polars_algo::{cut, hist, qcut}; +use polars_algo::hist; use polars_core::series::IsSorted; use polars_core::utils::flatten::flatten_series; use polars_core::with_match_physical_numeric_polars_type; @@ -664,56 +664,6 @@ impl PySeries { self.series.clear().into() } - #[pyo3(signature = (bins, labels, break_point_label, category_label, maintain_order))] - fn cut( - &self, 
- bins: Self, - labels: Option>, - break_point_label: Option<&str>, - category_label: Option<&str>, - maintain_order: bool, - ) -> PyResult { - let out = cut( - &self.series, - bins.series, - labels, - break_point_label, - category_label, - maintain_order, - ) - .map_err(PyPolarsErr::from)?; - Ok(out.into()) - } - - #[pyo3(signature = (quantiles, labels, break_point_label, category_label, maintain_order))] - fn qcut( - &self, - quantiles: Self, - labels: Option>, - break_point_label: Option<&str>, - category_label: Option<&str>, - maintain_order: bool, - ) -> PyResult { - if quantiles.series.null_count() > 0 { - return Err(PyValueError::new_err( - "did not expect null values in list of quantiles", - )); - } - let quantiles = quantiles.series.cast(&DataType::Float64).unwrap(); - let quantiles = quantiles.f64().unwrap().rechunk(); - - let out = qcut( - &self.series, - quantiles.cont_slice().unwrap(), - labels, - break_point_label, - category_label, - maintain_order, - ) - .map_err(PyPolarsErr::from)?; - Ok(out.into()) - } - fn hist(&self, bins: Option, bin_count: Option) -> PyResult { let bins = bins.map(|s| s.series); let out = hist(&self.series, bins.as_ref(), bin_count).map_err(PyPolarsErr::from)?; diff --git a/py-polars/tests/unit/operations/test_statistics.py b/py-polars/tests/unit/operations/test_statistics.py index 15a10f0fcf3f..12d578d6f8b9 100644 --- a/py-polars/tests/unit/operations/test_statistics.py +++ b/py-polars/tests/unit/operations/test_statistics.py @@ -27,22 +27,22 @@ def test_corr() -> None: def test_cut() -> None: a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) - out = cast(pl.DataFrame, a.cut(bins=[-1, 1])) + out = cast(pl.DataFrame, a.cut(bins=[-1, 1], series=False)) assert out.shape == (12, 3) assert out.filter(pl.col("break_point") < 1e9).to_dict(False) == { "a": [-3.0, -2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0], "break_point": [-1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0], "category": [ - "(-inf, -1.0]", - "(-inf, -1.0]", - 
"(-inf, -1.0]", - "(-inf, -1.0]", - "(-inf, -1.0]", - "(-1.0, 1.0]", - "(-1.0, 1.0]", - "(-1.0, 1.0]", - "(-1.0, 1.0]", + "(-inf, -1]", + "(-inf, -1]", + "(-inf, -1]", + "(-inf, -1]", + "(-inf, -1]", + "(-1, 1]", + "(-1, 1]", + "(-1, 1]", + "(-1, 1]", ], } @@ -50,37 +50,35 @@ def test_cut() -> None: inf = float("inf") df = pl.DataFrame({"a": list(range(5))}) ser = df.select("a").to_series() - assert cast(pl.DataFrame, ser.cut(bins=[-1, 1])).rows() == [ - (0.0, 1.0, "(-1.0, 1.0]"), - (1.0, 1.0, "(-1.0, 1.0]"), - (2.0, inf, "(1.0, inf]"), - (3.0, inf, "(1.0, inf]"), - (4.0, inf, "(1.0, inf]"), + assert cast(pl.DataFrame, ser.cut(bins=[-1, 1], series=False)).rows() == [ + (0.0, 1.0, "(-1, 1]"), + (1.0, 1.0, "(-1, 1]"), + (2.0, inf, "(1, inf]"), + (3.0, inf, "(1, inf]"), + (4.0, inf, "(1, inf]"), ] - -def test_cut_maintain_order() -> None: expected_df = pl.DataFrame( { "a": [5.0, 8.0, 9.0, 5.0, 0.0, 0.0, 1.0, 7.0, 6.0, 9.0], "break_point": [inf, inf, inf, inf, 1.0, 1.0, 1.0, inf, inf, inf], "category": [ - "(1.0, inf]", - "(1.0, inf]", - "(1.0, inf]", - "(1.0, inf]", - "(-1.0, 1.0]", - "(-1.0, 1.0]", - "(-1.0, 1.0]", - "(1.0, inf]", - "(1.0, inf]", - "(1.0, inf]", + "(1, inf]", + "(1, inf]", + "(1, inf]", + "(1, inf]", + "(-1, 1]", + "(-1, 1]", + "(-1, 1]", + "(1, inf]", + "(1, inf]", + "(1, inf]", ], } ) np.random.seed(1) a = pl.Series("a", np.random.randint(0, 10, 10)) - out = cast(pl.DataFrame, a.cut(bins=[-1, 1], maintain_order=True)) + out = cast(pl.DataFrame, a.cut(bins=[-1, 1], series=False)) out_s = cast(pl.Series, a.cut(bins=[-1, 1], series=True)) assert out["a"].cast(int).series_equal(a) # Compare strings and categoricals without a hassle @@ -100,25 +98,16 @@ def test_qcut() -> None: { "a": [-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0], "break_point": [-5.0, -3.25, 0.25, 0.25, 0.25, 0.25, inf, inf], - "category": [ - "(-inf, -5.0]", - "(-5.0, -3.25]", - "(-3.25, 0.25]", - "(-3.25, 0.25]", - "(-3.25, 0.25]", - "(-3.25, 0.25]", - "(0.25, inf]", - "(0.25, inf]", 
- ], + "category": ["(-inf, -5]", "(-5, -3.25]"] + + ["(-3.25, 0.25]"] * 4 + + ["(0.25, inf]"] * 2, } ) out = cast(pl.DataFrame, input.qcut([0.0, 0.25, 0.75])) out_s = cast(pl.Series, input.qcut([0.0, 0.25, 0.75], series=True)) assert_frame_equal(out, exp, check_dtype=False) assert_series_equal( - pl.Series( - ["(-inf, -5]", "(-5, -3.25]"] + ["(-3.25, 0.25]"] * 4 + ["(0.25, inf]"] * 2 - ), + exp["category"], out_s, check_dtype=False, check_names=False, @@ -159,7 +148,7 @@ def test_cut_null_values() -> None: } ) assert_frame_equal( - cast(pl.DataFrame, s.qcut([0.2, 0.3], maintain_order=True)), + cast(pl.DataFrame, s.qcut([0.2, 0.3], series=False)), exp, check_dtype=False, ) @@ -170,11 +159,6 @@ def test_cut_null_values() -> None: check_names=False, ) - assert ( - str(cast(pl.DataFrame, s.qcut([0.2, 0.3], maintain_order=False)).to_dict(False)) - == "{'': [-1.0, 1.0, 2.0, 4.0, 8.0, None, None], 'break_point': [0.5999999999999996, 1.2000000000000002, inf, inf, inf, None, None], 'category': ['(-inf, 0.5999999999999996]', '(0.5999999999999996, 1.2000000000000002]', '(1.2000000000000002, inf]', '(1.2000000000000002, inf]', '(1.2000000000000002, inf]', None, None]}" - ) - def test_median_quantile_duration() -> None: df = pl.DataFrame({"A": [timedelta(days=0), timedelta(days=1)]}) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 8a37394c822e..6a551d3e98e2 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -43,9 +43,7 @@ def test_error_on_reducing_map() -> None: ), ): df.select( - pl.col("x") - .map(lambda x: x.cut(bins=[1, 2, 3], maintain_order=True)) - .over("group") + pl.col("x").map(lambda x: x.cut(bins=[1, 2, 3], series=False)).over("group") )