diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs index 433d900cef26..82a42b9b7f80 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs @@ -25,6 +25,8 @@ mod list; mod log; mod nan; mod pow; +#[cfg(feature = "random")] +mod random; #[cfg(feature = "arange")] mod range; #[cfg(all(feature = "rolling_window", feature = "moment"))] @@ -51,6 +53,8 @@ mod trigonometry; mod unique; use std::fmt::{Display, Formatter}; +#[cfg(feature = "random")] +use std::sync::atomic::AtomicU64; #[cfg(feature = "dtype-array")] pub(super) use array::ArrayFunction; @@ -59,6 +63,8 @@ pub(crate) use correlation::CorrelationMethod; pub(crate) use fused::FusedOperator; pub(super) use list::ListFunction; use polars_core::prelude::*; +#[cfg(feature = "random")] +pub(crate) use random::RandomMethod; use schema::FieldsMapper; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -193,6 +199,14 @@ pub enum FunctionExpr { ddof: u8, }, ToPhysical, + #[cfg(feature = "random")] + Random { + method: random::RandomMethod, + #[cfg_attr(feature = "serde", serde(skip))] + atomic_seed: Option>>, + seed: Option, + fixed_seed: bool, + }, } impl Display for FunctionExpr { @@ -288,6 +302,8 @@ impl Display for FunctionExpr { ConcatExpr(_) => "concat_expr", Correlation { method, .. } => return Display::fmt(method, f), ToPhysical => "to_physical", + #[cfg(feature = "random")] + Random { method, .. } => method.into(), }; write!(f, "{s}") } @@ -515,6 +531,19 @@ impl From for SpecialEq> { ConcatExpr(rechunk) => map_as_slice!(concat::concat_expr, rechunk), Correlation { method, ddof } => map_as_slice!(correlation::corr, ddof, method), ToPhysical => map!(dispatch::to_physical), + #[cfg(feature = "random")] + Random { + method, + seed, + atomic_seed, + fixed_seed, + } => map!( + random::random, + method, + atomic_seed.as_deref(), + seed, + fixed_seed + ), } } } diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/random.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/random.rs new file mode 100644 index 000000000000..9515f8843c36 --- /dev/null +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/random.rs @@ -0,0 +1,55 @@ +use std::sync::atomic::Ordering; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; +use strum_macros::IntoStaticStr; + +use super::*; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Copy, Clone, PartialEq, Debug, IntoStaticStr)] +#[strum(serialize_all = "snake_case")] +pub enum RandomMethod { + Shuffle, + SampleN { + n: usize, + with_replacement: bool, + shuffle: bool, + }, + SampleFrac { + frac: f64, + with_replacement: bool, + shuffle: bool, + }, +} + +pub(super) fn random( + s: &Series, + method: RandomMethod, + atomic_seed: Option<&Arc>, + seed: Option, + fixed_seed: bool, +) -> PolarsResult { + let seed = if fixed_seed { + seed + } else { + // ensure seeds differ between groupby groups + // otherwise all groups would be sampled the same + atomic_seed + .as_ref() + .map(|atomic| atomic.fetch_add(1, Ordering::Relaxed)) + }; + match method { + RandomMethod::Shuffle => Ok(s.shuffle(seed)), + RandomMethod::SampleFrac { + frac, + with_replacement, + shuffle, + } => s.sample_frac(frac, with_replacement, shuffle, seed), + RandomMethod::SampleN { + n, + with_replacement, + shuffle, + } => s.sample_n(n, with_replacement, shuffle, seed), + } +} diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs index a0f44620e1a2..d07977def892 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/schema.rs @@ -234,6 +234,8 @@ impl FunctionExpr { ConcatExpr(_) => mapper.map_to_supertype(), Correlation { .. } => mapper.map_to_float_dtype(), ToPhysical => mapper.to_physical_type(), + #[cfg(feature = "random")] + Random { .. } => mapper.with_same_dtype(), } } } diff --git a/polars/polars-lazy/polars-plan/src/dsl/mod.rs b/polars/polars-lazy/polars-plan/src/dsl/mod.rs index 18affffe5372..77d5dd20e719 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/mod.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/mod.rs @@ -24,6 +24,8 @@ pub(crate) mod names; mod options; #[cfg(all(feature = "python", feature = "serde"))] pub mod python_udf; +#[cfg(feature = "random")] +mod random; mod selector; #[cfg(feature = "strings")] pub mod string; @@ -1559,45 +1561,6 @@ impl Expr { .with_fmt("reshape") } - #[cfg(feature = "random")] - pub fn shuffle(self, seed: Option) -> Self { - self.apply(move |s| Ok(Some(s.shuffle(seed))), GetOutput::same_type()) - .with_fmt("shuffle") - } - - #[cfg(feature = "random")] - pub fn sample_n( - self, - n: usize, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> Self { - self.apply( - move |s| s.sample_n(n, with_replacement, shuffle, seed).map(Some), - GetOutput::same_type(), - ) - .with_fmt("sample_n") - } - - #[cfg(feature = "random")] - pub fn sample_frac( - self, - frac: f64, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> Self { - self.apply( - move |s| { - s.sample_frac(frac, with_replacement, shuffle, seed) - .map(Some) - }, - GetOutput::same_type(), - ) - .with_fmt("sample_frac") - } - #[cfg(feature = "ewma")] pub fn ewm_mean(self, options: EWMOptions) -> Self { use DataType::*; diff --git a/polars/polars-lazy/polars-plan/src/dsl/random.rs b/polars/polars-lazy/polars-plan/src/dsl/random.rs new file mode 100644 index 000000000000..d51d4abf8397 --- /dev/null +++ b/polars/polars-lazy/polars-plan/src/dsl/random.rs @@ -0,0 +1,58 @@ +use std::sync::atomic::AtomicU64; + +use super::*; + +fn get_atomic_seed(seed: Option) -> Option>> { + seed.map(|v| SpecialEq::new(Arc::new(AtomicU64::new(v)))) +} + +impl Expr { + pub fn shuffle(self, seed: Option, fixed_seed: bool) -> Self { + self.apply_private(FunctionExpr::Random { + method: RandomMethod::Shuffle, + atomic_seed: get_atomic_seed(seed), + seed, + fixed_seed, + }) + } + + pub fn sample_n( + self, + n: usize, + with_replacement: bool, + shuffle: bool, + seed: Option, + fixed_seed: bool, + ) -> Self { + self.apply_private(FunctionExpr::Random { + method: RandomMethod::SampleN { + n, + with_replacement, + shuffle, + }, + atomic_seed: get_atomic_seed(seed), + seed, + fixed_seed, + }) + } + + pub fn sample_frac( + self, + frac: f64, + with_replacement: bool, + shuffle: bool, + seed: Option, + fixed_seed: bool, + ) -> Self { + self.apply_private(FunctionExpr::Random { + method: RandomMethod::SampleFrac { + frac, + with_replacement, + shuffle, + }, + atomic_seed: get_atomic_seed(seed), + seed, + fixed_seed, + }) + } +} diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 6e220fd4c40e..d2db83d3bafc 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -7497,7 +7497,7 @@ def reshape(self, dimensions: tuple[int, ...]) -> Self: """ return self._from_pyexpr(self._pyexpr.reshape(dimensions)) - def shuffle(self, seed: int | None = None) -> Self: + def shuffle(self, seed: int | None = None, fixed_seed: bool = False) -> Self: """ Shuffle the contents of this expression. @@ -7506,6 +7506,10 @@ def shuffle(self, seed: int | None = None) -> Self: seed Seed for the random number generator. If set to None (default), a random seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. Examples -------- @@ -7523,9 +7527,10 @@ def shuffle(self, seed: int | None = None) -> Self: └─────┘ """ + # we seed from python so that we respect ``random.seed`` if seed is None: seed = random.randint(0, 10000) - return self._from_pyexpr(self._pyexpr.shuffle(seed)) + return self._from_pyexpr(self._pyexpr.shuffle(seed, fixed_seed)) @deprecated_alias(frac="fraction") def sample( @@ -7536,6 +7541,7 @@ def sample( with_replacement: bool = False, shuffle: bool = False, seed: int | None = None, + fixed_seed: bool = False, ) -> Self: """ Sample from this expression. @@ -7554,6 +7560,10 @@ def sample( seed Seed for the random number generator. If set to None (default), a random seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. Examples -------- @@ -7579,13 +7589,15 @@ def sample( if fraction is not None: return self._from_pyexpr( - self._pyexpr.sample_frac(fraction, with_replacement, shuffle, seed) + self._pyexpr.sample_frac( + fraction, with_replacement, shuffle, seed, fixed_seed + ) ) if n is None: n = 1 return self._from_pyexpr( - self._pyexpr.sample_n(n, with_replacement, shuffle, seed) + self._pyexpr.sample_n(n, with_replacement, shuffle, seed, fixed_seed) ) def ewm_mean( diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 173781785a7c..5a13bafcc19d 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -5609,6 +5609,15 @@ def shuffle(self, seed: int | None = None) -> Series: ] """ + return ( + self.to_frame() + .select( + F.col(self.name).shuffle( + seed=seed, + ) + ) + .to_series() + ) def ewm_mean( self, diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index 229e7b872d5a..47e498c155e1 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -972,27 +972,38 @@ impl PyExpr { self.inner.clone().to_physical().into() } - fn shuffle(&self, seed: Option) -> Self { - self.inner.clone().shuffle(seed).into() + #[pyo3(signature = (seed, fixed_seed))] + fn shuffle(&self, seed: Option, fixed_seed: bool) -> Self { + self.inner.clone().shuffle(seed, fixed_seed).into() } - fn sample_n(&self, n: usize, with_replacement: bool, shuffle: bool, seed: Option) -> Self { + #[pyo3(signature = (n, with_replacement, shuffle, seed, fixed_seed))] + fn sample_n( + &self, + n: usize, + with_replacement: bool, + shuffle: bool, + seed: Option, + fixed_seed: bool, + ) -> Self { self.inner .clone() - .sample_n(n, with_replacement, shuffle, seed) + .sample_n(n, with_replacement, shuffle, seed, fixed_seed) .into() } + #[pyo3(signature = (frac, with_replacement, shuffle, seed, fixed_seed))] fn sample_frac( &self, frac: f64, with_replacement: bool, shuffle: bool, seed: Option, + fixed_seed: bool, ) -> Self { self.inner .clone() - .sample_frac(frac, with_replacement, shuffle, seed) + .sample_frac(frac, with_replacement, shuffle, seed, fixed_seed) .into() } diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py new file mode 100644 index 000000000000..9bc96b61ea03 --- /dev/null +++ b/py-polars/tests/unit/operations/test_random.py @@ -0,0 +1,105 @@ +import random + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal, assert_series_equal + + +def test_shuffle_reseed() -> None: + assert pl.DataFrame({"x": [1, 2, 3, 1, 2, 3], "c": [0, 0, 0, 1, 1, 1]}).groupby( + "c", maintain_order=True + ).agg(pl.col("x").shuffle(2)).to_dict(False) == { + "c": [0, 1], + "x": [[2, 1, 3], [3, 1, 2]], + } + + +def test_sample_expr() -> None: + a = pl.Series("a", range(0, 20)) + out = pl.select( + pl.lit(a).sample(fraction=0.5, with_replacement=False, seed=1) + ).to_series() + + assert out.shape == (10,) + assert out.to_list() != out.sort().to_list() + assert out.unique().shape == (10,) + assert set(out).issubset(set(a)) + + out = pl.select(pl.lit(a).sample(n=10, with_replacement=False, seed=1)).to_series() + assert out.shape == (10,) + assert out.to_list() != out.sort().to_list() + assert out.unique().shape == (10,) + + # Setting random.seed should lead to reproducible results + random.seed(1) + result1 = pl.select(pl.lit(a).sample(n=10)).to_series() + random.seed(1) + result2 = pl.select(pl.lit(a).sample(n=10)).to_series() + assert_series_equal(result1, result2) + + +def test_sample_df() -> None: + df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}) + + assert df.sample(n=2, seed=0).shape == (2, 3) + assert df.sample(fraction=0.4, seed=0).shape == (1, 3) + + +def test_sample_series() -> None: + s = pl.Series("a", [1, 2, 3, 4, 5]) + + assert len(s.sample(n=2, seed=0)) == 2 + assert len(s.sample(fraction=0.4, seed=0)) == 2 + + assert len(s.sample(n=2, with_replacement=True, seed=0)) == 2 + + # on a series of length 5, you cannot sample more than 5 items + with pytest.raises(pl.ShapeError): + s.sample(n=10, with_replacement=False, seed=0) + # unless you use with_replacement=True + assert len(s.sample(n=10, with_replacement=True, seed=0)) == 10 + + +def test_rank_random_expr() -> None: + df = pl.from_dict( + {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]} + ) + + df_ranks1 = df.with_columns( + pl.col("c").rank(method="random", seed=1).over("a").alias("rank") + ) + df_ranks2 = df.with_columns( + pl.col("c").rank(method="random", seed=1).over("a").alias("rank") + ) + assert_frame_equal(df_ranks1, df_ranks2) + + +def test_rank_random_series() -> None: + s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) + assert_series_equal( + s.rank("random", seed=1), pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=pl.UInt32) + ) + + +def test_shuffle_expr() -> None: + # setting 'random.seed' should lead to reproducible results + s = pl.Series("a", range(20)) + s_list = s.to_list() + + random.seed(1) + result1 = pl.select(pl.lit(s).shuffle()).to_series() + + random.seed(1) + result2 = pl.select(a=pl.lit(s_list).shuffle()).to_series() + assert_series_equal(result1, result2) + + +def test_shuffle_series() -> None: + a = pl.Series("a", [1, 2, 3]) + out = a.shuffle(2) + expected = pl.Series("a", [2, 1, 3]) + assert_series_equal(out, expected) + + out = pl.select(pl.lit(a).shuffle(2)).to_series() + assert_series_equal(out, expected) diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index cbcaadee38ab..096a180e94e2 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -247,7 +247,7 @@ def test_rolling_extrema() -> None: } # shuffled data triggers other kernels - df = df.select([pl.all().shuffle(0)]) + df = df.select([pl.all().shuffle(0, fixed_seed=True)]) assert df.select([pl.all().rolling_min(3)]).to_dict(False) == { "col1": [None, None, 0, 0, 1, 2, 2], "col2": [None, None, 0, 2, 1, 1, 1], diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py index ec030fc6a053..6c3a845a0c79 100644 --- a/py-polars/tests/unit/test_df.py +++ b/py-polars/tests/unit/test_df.py @@ -2425,13 +2425,6 @@ def test_n_unique_subsets() -> None: ) -def test_sample() -> None: - df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}) - - assert df.sample(n=2, seed=0).shape == (2, 3) - assert df.sample(fraction=0.4, seed=0).shape == (1, 3) - - def test_shrink_to_fit() -> None: df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}) diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index 0710c503dbd0..3816b0773c07 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -1,6 +1,5 @@ from __future__ import annotations -import random import sys import typing from datetime import date, datetime, time, timedelta, timezone @@ -154,43 +153,6 @@ def test_count_expr() -> None: assert out["count"].to_list() == [4, 1] -def test_shuffle() -> None: - # setting 'random.seed' should lead to reproducible results - s = pl.Series("a", range(20)) - s_list = s.to_list() - - random.seed(1) - result1 = pl.select(pl.lit(s).shuffle()).to_series() - - random.seed(1) - result2 = pl.select(a=pl.lit(s_list).shuffle()).to_series() - assert_series_equal(result1, result2) - - -def test_sample() -> None: - a = pl.Series("a", range(0, 20)) - out = pl.select( - pl.lit(a).sample(fraction=0.5, with_replacement=False, seed=1) - ).to_series() - - assert out.shape == (10,) - assert out.to_list() != out.sort().to_list() - assert out.unique().shape == (10,) - assert set(out).issubset(set(a)) - - out = pl.select(pl.lit(a).sample(n=10, with_replacement=False, seed=1)).to_series() - assert out.shape == (10,) - assert out.to_list() != out.sort().to_list() - assert out.unique().shape == (10,) - - # Setting random.seed should lead to reproducible results - random.seed(1) - result1 = pl.select(pl.lit(a).sample(n=10)).to_series() - random.seed(1) - result2 = pl.select(pl.lit(a).sample(n=10)).to_series() - assert_series_equal(result1, result2) - - def test_map_alias() -> None: out = pl.DataFrame({"foo": [1, 2, 3]}).select( (pl.col("foo") * 2).map_alias(lambda name: f"{name}{name}") @@ -460,20 +422,6 @@ def test_rank_so_4109() -> None: } -def test_rank_random() -> None: - df = pl.from_dict( - {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]} - ) - - df_ranks1 = df.with_columns( - pl.col("c").rank(method="random", seed=1).over("a").alias("rank") - ) - df_ranks2 = df.with_columns( - pl.col("c").rank(method="random", seed=1).over("a").alias("rank") - ) - assert_frame_equal(df_ranks1, df_ranks2) - - def test_unique_empty() -> None: for dt in [pl.Utf8, pl.Boolean, pl.Int32, pl.UInt32]: s = pl.Series([], dtype=dt) diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py index 67df145b0d9c..aef3c84ae9f4 100644 --- a/py-polars/tests/unit/test_series.py +++ b/py-polars/tests/unit/test_series.py @@ -1288,13 +1288,6 @@ def test_rank() -> None: ) -def test_rank_random() -> None: - s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) - assert_series_equal( - s.rank("random", seed=1), pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=UInt32) - ) - - def test_diff() -> None: s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0]) expected = pl.Series("a", [1, 1, -1, 0, 1, -3]) @@ -1815,21 +1808,6 @@ def test_dot() -> None: s1 @ [4, 5, 6, 7, 8] -def test_sample() -> None: - s = pl.Series("a", [1, 2, 3, 4, 5]) - - assert len(s.sample(n=2, seed=0)) == 2 - assert len(s.sample(fraction=0.4, seed=0)) == 2 - - assert len(s.sample(n=2, with_replacement=True, seed=0)) == 2 - - # on a series of length 5, you cannot sample more than 5 items - with pytest.raises(pl.ShapeError): - s.sample(n=10, with_replacement=False, seed=0) - # unless you use with_replacement=True - assert len(s.sample(n=10, with_replacement=True, seed=0)) == 10 - - def test_peak_max_peak_min() -> None: s = pl.Series("a", [4, 1, 3, 2, 5]) result = s.peak_min() @@ -1950,16 +1928,6 @@ def test_log_exp() -> None: assert_series_equal(a.log1p(), expected) -def test_shuffle() -> None: - a = pl.Series("a", [1, 2, 3]) - out = a.shuffle(2) - expected = pl.Series("a", [2, 1, 3]) - assert_series_equal(out, expected) - - out = pl.select(pl.lit(a).shuffle(2)).to_series() - assert_series_equal(out, expected) - - def test_to_physical() -> None: # casting an int result in an int s = pl.Series("a", [1, 2, 3])