From da005d4fb069e422320a64ff552ab22967bd8a47 Mon Sep 17 00:00:00 2001 From: ritchie Date: Wed, 14 Jun 2023 11:00:11 +0200 Subject: [PATCH 1/2] fix(rust, python): list zip with The list `zip_with`, e.g. `when -> then otherwise` did not restore logical types. --- .../src/chunked_array/arithmetic/decimal.rs | 6 +- .../src/chunked_array/arithmetic/numeric.rs | 2 +- polars/polars-core/src/chunked_array/mod.rs | 5 +- .../src/chunked_array/ops/chunkops.rs | 4 +- .../src/chunked_array/ops/explode.rs | 3 +- .../src/chunked_array/ops/filter.rs | 6 +- .../src/chunked_array/ops/nulls.rs | 2 +- .../src/chunked_array/ops/take/mod.rs | 2 +- .../polars-core/src/chunked_array/ops/zip.rs | 119 +++++------------- py-polars/tests/unit/test_arity.py | 29 +++++ 10 files changed, 75 insertions(+), 103 deletions(-) diff --git a/polars/polars-core/src/chunked_array/arithmetic/decimal.rs b/polars/polars-core/src/chunked_array/arithmetic/decimal.rs index f9e2206ecb81..145d20fe661f 100644 --- a/polars/polars-core/src/chunked_array/arithmetic/decimal.rs +++ b/polars/polars-core/src/chunked_array/arithmetic/decimal.rs @@ -58,7 +58,7 @@ impl DecimalChunked { .zip(rhs.downcast_iter()) .map(|(lhs, rhs)| kernel(lhs, rhs).map(|a| Box::new(a) as ArrayRef)) .collect::>()?; - lhs.copy_with_chunks(chunks, false, false) + unsafe { lhs.copy_with_chunks(chunks, false, false) } } // broadcast right path (_, 1) => { @@ -70,7 +70,7 @@ impl DecimalChunked { .downcast_iter() .map(|lhs| operation_lhs(lhs, rhs_val).map(|a| Box::new(a) as ArrayRef)) .collect::>()?; - lhs.copy_with_chunks(chunks, false, false) + unsafe { lhs.copy_with_chunks(chunks, false, false) } } } } @@ -83,7 +83,7 @@ impl DecimalChunked { .downcast_iter() .map(|rhs| operation_rhs(lhs_val, rhs).map(|a| Box::new(a) as ArrayRef)) .collect::>()?; - lhs.copy_with_chunks(chunks, false, false) + unsafe { lhs.copy_with_chunks(chunks, false, false) } } } } diff --git a/polars/polars-core/src/chunked_array/arithmetic/numeric.rs b/polars/polars-core/src/chunked_array/arithmetic/numeric.rs index 03cb849c2e59..ba37ada11a58 100644 --- a/polars/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/polars/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -19,7 +19,7 @@ where .zip(rhs.downcast_iter()) .map(|(lhs, rhs)| Box::new(kernel(lhs, rhs)) as ArrayRef) .collect(); - lhs.copy_with_chunks(chunks, false, false) + unsafe { lhs.copy_with_chunks(chunks, false, false) } } // broadcast right path (_, 1) => { diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs index 9d0e6f697f72..89eb8d2b96f8 100644 --- a/polars/polars-core/src/chunked_array/mod.rs +++ b/polars/polars-core/src/chunked_array/mod.rs @@ -315,7 +315,10 @@ impl ChunkedArray { } /// Create a new ChunkedArray from self, where the chunks are replaced. - fn copy_with_chunks( + /// + /// # Safety + /// The caller must ensure the dtypes of the chunks are correct + unsafe fn copy_with_chunks( &self, chunks: Vec, keep_sorted: bool, diff --git a/polars/polars-core/src/chunked_array/ops/chunkops.rs b/polars/polars-core/src/chunked_array/ops/chunkops.rs index 5306d5670d29..69e21a974ca2 100644 --- a/polars/polars-core/src/chunked_array/ops/chunkops.rs +++ b/polars/polars-core/src/chunked_array/ops/chunkops.rs @@ -100,7 +100,7 @@ impl ChunkedArray { self.clone() } else { let chunks = inner_rechunk(&self.chunks); - self.copy_with_chunks(chunks, true, true) + unsafe { self.copy_with_chunks(chunks, true, true) } } } } @@ -114,7 +114,7 @@ impl ChunkedArray { #[inline] pub fn slice(&self, offset: i64, length: usize) -> Self { let (chunks, len) = slice(&self.chunks, offset, length, self.len()); - let mut out = self.copy_with_chunks(chunks, true, true); + let mut out = unsafe { self.copy_with_chunks(chunks, true, true) }; out.length = len as IdxSize; out } diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/polars/polars-core/src/chunked_array/ops/explode.rs index 6b500664058f..b875311b5320 100644 --- a/polars/polars-core/src/chunked_array/ops/explode.rs +++ b/polars/polars-core/src/chunked_array/ops/explode.rs @@ -266,8 +266,7 @@ impl ExplodeByOffsets for ListChunked { } process_range(start, last, &mut builder); let arr = builder.finish(Some(&inner_type.to_arrow())).unwrap(); - self.copy_with_chunks(vec![Box::new(arr)], true, true) - .into_series() + unsafe { self.copy_with_chunks(vec![Box::new(arr)], true, true) }.into_series() } } diff --git a/polars/polars-core/src/chunked_array/ops/filter.rs b/polars/polars-core/src/chunked_array/ops/filter.rs index e6e85cce59cb..e8a92e32176a 100644 --- a/polars/polars-core/src/chunked_array/ops/filter.rs +++ b/polars/polars-core/src/chunked_array/ops/filter.rs @@ -37,7 +37,7 @@ where .zip(filter.downcast_iter()) .map(|(left, mask)| filter_fn(left, mask).unwrap()) .collect::>(); - Ok(self.copy_with_chunks(chunks, true, true)) + unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } } } @@ -58,7 +58,7 @@ impl ChunkFilter for BooleanChunked { .zip(filter.downcast_iter()) .map(|(left, mask)| filter_fn(left, mask).unwrap()) .collect::>(); - Ok(self.copy_with_chunks(chunks, true, true)) + unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } } } @@ -87,7 +87,7 @@ impl ChunkFilter for BinaryChunked { .map(|(left, mask)| filter_fn(left, mask).unwrap()) .collect::>(); - Ok(self.copy_with_chunks(chunks, true, true)) + unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } } } diff --git a/polars/polars-core/src/chunked_array/ops/nulls.rs b/polars/polars-core/src/chunked_array/ops/nulls.rs index a71f27bb59b3..5d069fa6b35b 100644 --- a/polars/polars-core/src/chunked_array/ops/nulls.rs +++ b/polars/polars-core/src/chunked_array/ops/nulls.rs @@ -23,7 +23,7 @@ impl ChunkedArray { pub(crate) fn coalesce_nulls(&self, other: &[ArrayRef]) -> Self { let chunks = coalesce_nulls(&self.chunks, other); - self.copy_with_chunks(chunks, true, false) + unsafe { self.copy_with_chunks(chunks, true, false) } } } pub fn is_not_null(name: &str, chunks: &[ArrayRef]) -> BooleanChunked { diff --git a/polars/polars-core/src/chunked_array/ops/take/mod.rs b/polars/polars-core/src/chunked_array/ops/take/mod.rs index 1d7b9b1a7313..4b4d54955b87 100644 --- a/polars/polars-core/src/chunked_array/ops/take/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/take/mod.rs @@ -64,7 +64,7 @@ where { fn finish_from_array(&self, array: Box) -> Self { let keep_fast_explode = array.null_count() == 0; - self.copy_with_chunks(vec![array], false, keep_fast_explode) + unsafe { self.copy_with_chunks(vec![array], false, keep_fast_explode) } } } diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/polars/polars-core/src/chunked_array/ops/zip.rs index 61956b02e5f6..0d2d508cbb27 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/polars/polars-core/src/chunked_array/ops/zip.rs @@ -66,6 +66,26 @@ macro_rules! impl_ternary_broadcast { }}; } +fn zip_with( + left: &ChunkedArray, + right: &ChunkedArray, + mask: &BooleanChunked, +) -> PolarsResult> { + let (left, right, mask) = align_chunks_ternary(left, right, mask); + let chunks = left + .chunks() + .iter() + .zip(right.chunks()) + .zip(mask.downcast_iter()) + .map(|((left_c, right_c), mask_c)| { + let mask_c = prepare_mask(mask_c); + let arr = if_then_else(&mask_c, left_c.as_ref(), right_c.as_ref())?; + Ok(arr) + }) + .collect::>>()?; + unsafe { Ok(left.copy_with_chunks(chunks, false, false)) } +} + impl ChunkZip for ChunkedArray where T: PolarsNumericType, @@ -79,18 +99,7 @@ where if self.len() != mask.len() || other.len() != mask.len() { impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, T) } else { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + zip_with(self, other, mask) } } } @@ -105,39 +114,17 @@ impl ChunkZip for BooleanChunked { if self.len() != mask.len() || other.len() != mask.len() { impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, BooleanType) } else { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + zip_with(self, other, mask) } } } impl ChunkZip for Utf8Chunked { fn zip_with(&self, mask: &BooleanChunked, other: &Utf8Chunked) -> PolarsResult { - if self.len() != mask.len() || other.len() != mask.len() { - impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, Utf8Type) - } else { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + unsafe { + self.as_binary() + .zip_with(mask, &other.as_binary()) + .map(|ca| ca.to_utf8()) } } } @@ -151,54 +138,21 @@ impl ChunkZip for BinaryChunked { if self.len() != mask.len() || other.len() != mask.len() { impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, BinaryType) } else { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + zip_with(self, other, mask) } } } impl ChunkZip for ListChunked { fn zip_with(&self, mask: &BooleanChunked, other: &ListChunked) -> PolarsResult { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + zip_with(self, other, mask) } } #[cfg(feature = "dtype-array")] impl ChunkZip for ArrayChunked { fn zip_with(&self, mask: &BooleanChunked, other: &ArrayChunked) -> PolarsResult { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .zip(mask.downcast_iter()) - .map(|((left_c, right_c), mask_c)| { - let mask_c = prepare_mask(mask_c); - let arr = if_then_else(&mask_c, left_c, right_c)?; - Ok(arr) - }) - .collect::>>()?; - unsafe { Ok(ChunkedArray::from_chunks(self.name(), chunks)) } + zip_with(self, other, mask) } } @@ -209,19 +163,6 @@ impl ChunkZip> for ObjectChunked { mask: &BooleanChunked, other: &ChunkedArray>, ) -> PolarsResult>> { - let (left, right, mask) = align_chunks_ternary(self, other, mask); - let mut ca: Self = left - .as_ref() - .into_iter() - .zip(right.into_iter()) - .zip(mask.into_iter()) - .map(|((left_c, right_c), mask_c)| match mask_c { - Some(true) => left_c.cloned(), - Some(false) => right_c.cloned(), - None => None, - }) - .collect(); - ca.rename(self.name()); - Ok(ca) + zip_with(self, other, mask) } } diff --git a/py-polars/tests/unit/test_arity.py b/py-polars/tests/unit/test_arity.py index 71730186d2e6..f4b7425f2b83 100644 --- a/py-polars/tests/unit/test_arity.py +++ b/py-polars/tests/unit/test_arity.py @@ -1,3 +1,5 @@ +from datetime import datetime + import polars as pl from polars.testing import assert_frame_equal @@ -40,3 +42,30 @@ def test_expression_literal_series_order() -> None: assert df.select(pl.col("a") + s).to_dict(False) == {"a": [2, 4, 6]} assert df.select(pl.lit(s) + pl.col("a")).to_dict(False) == {"": [2, 4, 6]} + + +def test_list_zip_with_logical_type() -> None: + df = pl.DataFrame( + { + "start": [datetime(2023, 1, 1, 1, 1, 1), datetime(2023, 1, 1, 1, 1, 1)], + "stop": [datetime(2023, 1, 1, 1, 3, 1), datetime(2023, 1, 1, 1, 4, 1)], + "use": [1, 0], + } + ) + + df = df.with_columns( + pl.date_range( + pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" + ).alias("interval_1"), + pl.date_range( + pl.col("start"), pl.col("stop"), interval="1h", eager=False, closed="left" + ).alias("interval_2"), + ) + + out = df.select( + pl.when(pl.col("use") == 1) + .then(pl.col("interval_2")) + .otherwise(pl.col("interval_1")) + .alias("interval_new") + ) + assert out.dtypes == [pl.List(pl.Datetime(time_unit="us", time_zone=None))] From dda3853c2d26638476fea62caa0f5ff0554f74d4 Mon Sep 17 00:00:00 2001 From: ritchie Date: Wed, 14 Jun 2023 11:31:07 +0200 Subject: [PATCH 2/2] restore object --- polars/polars-core/src/chunked_array/ops/zip.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/polars/polars-core/src/chunked_array/ops/zip.rs index 0d2d508cbb27..19cb41ada692 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/polars/polars-core/src/chunked_array/ops/zip.rs @@ -163,6 +163,19 @@ impl ChunkZip> for ObjectChunked { mask: &BooleanChunked, other: &ChunkedArray>, ) -> PolarsResult>> { - zip_with(self, other, mask) + let (left, right, mask) = align_chunks_ternary(self, other, mask); + let mut ca: Self = left + .as_ref() + .into_iter() + .zip(right.into_iter()) + .zip(mask.into_iter()) + .map(|((left_c, right_c), mask_c)| match mask_c { + Some(true) => left_c.cloned(), + Some(false) => right_c.cloned(), + None => None, + }) + .collect(); + ca.rename(self.name()); + Ok(ca) } }