diff --git a/crates/polars-arrow/src/bitmap/immutable.rs b/crates/polars-arrow/src/bitmap/immutable.rs index 53bb63ab9bec..8e156939c3cd 100644 --- a/crates/polars-arrow/src/bitmap/immutable.rs +++ b/crates/polars-arrow/src/bitmap/immutable.rs @@ -11,6 +11,7 @@ use crate::bitmap::iterator::{ FastU32BitmapIter, FastU56BitmapIter, FastU64BitmapIter, TrueIdxIter, }; use crate::buffer::Bytes; +use crate::legacy::utils::FromTrustedLenIterator; use crate::trusted_len::TrustedLen; const UNKNOWN_BIT_COUNT: u64 = u64::MAX; @@ -484,6 +485,15 @@ impl FromIterator for Bitmap { } } +impl FromTrustedLenIterator for Bitmap { + fn from_iter_trusted_length>(iter: T) -> Self + where + T::IntoIter: TrustedLen, + { + MutableBitmap::from_trusted_len_iter(iter.into_iter()).into() + } +} + impl Bitmap { /// Creates a new [`Bitmap`] from an iterator of booleans. /// diff --git a/crates/polars-arrow/src/legacy/kernels/sorted_join/left.rs b/crates/polars-arrow/src/legacy/kernels/sorted_join/left.rs index 5f87df216a69..3d41574bcbf8 100644 --- a/crates/polars-arrow/src/legacy/kernels/sorted_join/left.rs +++ b/crates/polars-arrow/src/legacy/kernels/sorted_join/left.rs @@ -11,7 +11,7 @@ pub fn join( if right.is_empty() { return ( (left_offset..left.len() as IdxSize + left_offset).collect(), - vec![None; left.len()], + vec![NullableIdxSize::null(); left.len()], ); } // * 1.5 because there can be duplicates @@ -27,7 +27,7 @@ pub fn join( let first_right = right[right_idx as usize]; let mut left_idx = left.partition_point(|v| v < &first_right) as IdxSize; - out_rhs.extend(std::iter::repeat(None).take(left_idx as usize)); + out_rhs.extend(std::iter::repeat(NullableIdxSize::null()).take(left_idx as usize)); out_lhs.extend(left_offset..(left_idx + left_offset)); for &val_l in &left[left_idx as usize..] { @@ -37,7 +37,7 @@ pub fn join( // matching join key if val_l == val_r { out_lhs.push(left_idx + left_offset); - out_rhs.push(Some(right_idx)); + out_rhs.push(right_idx.into()); let current_idx = right_idx; loop { @@ -52,7 +52,7 @@ pub fn join( Some(&val_r) => { if val_l == val_r { out_lhs.push(left_idx + left_offset); - out_rhs.push(Some(right_idx)); + out_rhs.push(right_idx.into()); } else { // reset right index because the next lhs value can be the same right_idx = current_idx; @@ -67,7 +67,7 @@ pub fn join( // right is larger than left. if val_r > val_l { out_lhs.push(left_idx + left_offset); - out_rhs.push(None); + out_rhs.push(NullableIdxSize::null()); break; } // continue looping the right side @@ -76,7 +76,7 @@ pub fn join( // we depleted the right array None => { out_lhs.push(left_idx + left_offset); - out_rhs.push(None); + out_rhs.push(NullableIdxSize::null()); break; }, } @@ -98,14 +98,14 @@ mod test { let (l_idx, r_idx) = join(lhs, rhs, 0); let out_left = &[0, 1, 1, 2, 2, 3, 4, 5]; let out_right = &[ - Some(0), - Some(1), - Some(2), - Some(1), - Some(2), - None, - Some(3), - None, + 0.into(), + 1.into(), + 2.into(), + 1.into(), + 2.into(), + NullableIdxSize::null(), + 3.into(), + NullableIdxSize::null(), ]; assert_eq!(&l_idx, out_left); assert_eq!(&r_idx, out_right); @@ -128,21 +128,21 @@ mod test { assert_eq!( &r_idx, &[ - Some(0), - Some(1), - Some(0), - Some(1), - Some(2), - Some(3), - Some(4), - None, - Some(5), - Some(6), - Some(5), - Some(6), - Some(5), - Some(6), - None + 0.into(), + 1.into(), + 0.into(), + 1.into(), + 2.into(), + 3.into(), + 4.into(), + NullableIdxSize::null(), + 5.into(), + 6.into(), + 5.into(), + 6.into(), + 5.into(), + 6.into(), + NullableIdxSize::null(), ] ); @@ -153,16 +153,16 @@ mod test { assert_eq!( &r_idx, &[ - None, - None, - Some(1), - Some(2), - Some(2), - Some(2), - Some(2), - Some(3), - Some(4), - Some(4) + NullableIdxSize::null(), + NullableIdxSize::null(), + 1.into(), + 2.into(), + 2.into(), + 2.into(), + 2.into(), + 3.into(), + 4.into(), + 4.into() ] ); let lhs = &[0, 1, 2, 2, 3, 4, 4, 6, 6, 7]; @@ -172,20 +172,20 @@ mod test { assert_eq!( &r_idx, &[ - None, - None, - None, - None, - None, - Some(0), - Some(1), - Some(2), - Some(0), - Some(1), - Some(2), - None, - None, - None + NullableIdxSize::null(), + NullableIdxSize::null(), + NullableIdxSize::null(), + NullableIdxSize::null(), + NullableIdxSize::null(), + 0.into(), + 1.into(), + 2.into(), + 0.into(), + 1.into(), + 2.into(), + NullableIdxSize::null(), + NullableIdxSize::null(), + NullableIdxSize::null(), ] ) } diff --git a/crates/polars-arrow/src/legacy/kernels/sorted_join/mod.rs b/crates/polars-arrow/src/legacy/kernels/sorted_join/mod.rs index 467d7a6d9b71..5aea170f30d5 100644 --- a/crates/polars-arrow/src/legacy/kernels/sorted_join/mod.rs +++ b/crates/polars-arrow/src/legacy/kernels/sorted_join/mod.rs @@ -3,9 +3,9 @@ pub mod left; use std::fmt::Debug; -use polars_utils::IdxSize; +use polars_utils::{IdxSize, NullableIdxSize}; -type JoinOptIds = Vec>; +type JoinOptIds = Vec; type JoinIds = Vec; type LeftJoinIds = (JoinIds, JoinOptIds); type InnerJoinIds = (JoinIds, JoinIds); diff --git a/crates/polars-core/src/chunked_array/ops/gather.rs b/crates/polars-core/src/chunked_array/ops/gather.rs index 53c0ee5c3546..5be785086fd2 100644 --- a/crates/polars-core/src/chunked_array/ops/gather.rs +++ b/crates/polars-core/src/chunked_array/ops/gather.rs @@ -1,4 +1,5 @@ use arrow::bitmap::bitmask::BitMask; +use arrow::bitmap::Bitmap; use arrow::compute::take::take_unchecked; use polars_error::{polars_bail, polars_ensure}; use polars_utils::index::check_bounds; @@ -275,3 +276,15 @@ impl ChunkTakeUnchecked for StringChunked { self.as_binary().take_unchecked(indices).to_string() } } + +impl IdxCa { + pub fn with_nullable_idx T>(idx: &[NullableIdxSize], f: F) -> T { + let validity: Bitmap = idx.iter().map(|idx| !idx.is_null_idx()).collect_trusted(); + let idx = bytemuck::cast_slice::<_, IdxSize>(idx); + let arr = unsafe { arrow::ffi::mmap::slice(idx) }; + let arr = arr.with_validity_typed(Some(validity)); + let ca = IdxCa::with_chunk("", arr); + + f(&ca) + } +} diff --git a/crates/polars-lazy/src/physical_plan/expressions/window.rs b/crates/polars-lazy/src/physical_plan/expressions/window.rs index 5e60f43923f7..8cefbb4cde34 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/window.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/window.rs @@ -630,7 +630,7 @@ fn materialize_column(join_opt_ids: &ChunkJoinOptIds, out_column: &Series) -> Se match join_opt_ids { Either::Left(ids) => unsafe { - out_column.take_unchecked(&ids.iter().copied().collect_ca("")) + IdxCa::with_nullable_idx(ids, |idx| out_column.take_unchecked(idx)) }, Either::Right(ids) => unsafe { out_column.take_opt_chunked_unchecked(ids) }, } diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index b0467e1a5cc0..b0008b7afb7c 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -7,10 +7,10 @@ pub type InnerJoinIds = (JoinIds, JoinIds); #[cfg(feature = "chunked_ids")] pub(super) type ChunkJoinIds = Either, Vec>; #[cfg(feature = "chunked_ids")] -pub type ChunkJoinOptIds = Either>, Vec>; +pub type ChunkJoinOptIds = Either, Vec>; #[cfg(not(feature = "chunked_ids"))] -pub type ChunkJoinOptIds = Vec>; +pub type ChunkJoinOptIds = Vec; #[cfg(not(feature = "chunked_ids"))] pub type ChunkJoinIds = Vec; diff --git a/crates/polars-ops/src/frame/join/hash_join/mod.rs b/crates/polars-ops/src/frame/join/hash_join/mod.rs index a23667cba742..dfe4ae71c00b 100644 --- a/crates/polars-ops/src/frame/join/hash_join/mod.rs +++ b/crates/polars-ops/src/frame/join/hash_join/mod.rs @@ -110,7 +110,7 @@ pub trait JoinDispatch: IntoDf { let materialize_right = || { let right_idx = &*right_idx; - unsafe { other.take_unchecked(&right_idx.iter().copied().collect_ca("")) } + unsafe { IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) } }; let (df_left, df_right) = POOL.join(materialize_left, materialize_right); @@ -150,7 +150,7 @@ pub trait JoinDispatch: IntoDf { if let Some((offset, len)) = args.slice { right_idx = slice_slice(right_idx, offset, len); } - other.take_unchecked(&right_idx.iter().copied().collect_ca("")) + IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) }, ChunkJoinOptIds::Right(right_idx) => unsafe { let mut right_idx = &*right_idx; diff --git a/crates/polars-ops/src/frame/join/hash_join/multiple_keys.rs b/crates/polars-ops/src/frame/join/hash_join/multiple_keys.rs index 7e2c737da070..119973c9671e 100644 --- a/crates/polars-ops/src/frame/join/hash_join/multiple_keys.rs +++ b/crates/polars-ops/src/frame/join/hash_join/multiple_keys.rs @@ -321,12 +321,13 @@ pub fn _left_join_multiple_keys( Some((_, indexes_b)) => { result_idx_left .extend(std::iter::repeat(idx_a).take(indexes_b.len())); - result_idx_right.extend(indexes_b.iter().copied().map(Some)) + let indexes_b = bytemuck::cast_slice(indexes_b); + result_idx_right.extend_from_slice(indexes_b); }, // only left values, right = null None => { result_idx_left.push(idx_a); - result_idx_right.push(None); + result_idx_right.push(NullableIdxSize::null()); }, } idx_a += 1; diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs index e95550c3308c..91c4f0cd1008 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs @@ -13,11 +13,14 @@ unsafe fn apply_mapping(idx: Vec, chunk_mapping: &[ChunkId]) -> Vec>, chunk_mapping: &[ChunkId]) -> Vec { +unsafe fn apply_opt_mapping(idx: Vec, chunk_mapping: &[ChunkId]) -> Vec { idx.iter() - .map(|opt_idx| match opt_idx { - None => ChunkId::null(), - Some(idx) => *chunk_mapping.get_unchecked(*idx as usize), + .map(|opt_idx| { + if opt_idx.is_null_idx() { + ChunkId::null() + } else { + *chunk_mapping.get_unchecked(opt_idx.idx() as usize) + } }) .collect() } @@ -25,7 +28,7 @@ unsafe fn apply_opt_mapping(idx: Vec>, chunk_mapping: &[ChunkId] #[cfg(feature = "chunked_ids")] pub(super) fn finish_left_join_mappings( result_idx_left: Vec, - result_idx_right: Vec>, + result_idx_right: Vec, chunk_mapping_left: Option<&[ChunkId]>, chunk_mapping_right: Option<&[ChunkId]>, ) -> LeftJoinIds { @@ -46,7 +49,7 @@ pub(super) fn finish_left_join_mappings( #[cfg(not(feature = "chunked_ids"))] pub(super) fn finish_left_join_mappings( _result_idx_left: Vec, - _result_idx_right: Vec>, + _result_idx_right: Vec, _chunk_mapping_left: Option<&[ChunkId]>, _chunk_mapping_right: Option<&[ChunkId]>, ) -> LeftJoinIds { @@ -163,12 +166,12 @@ where // left and right matches Some(indexes_b) => { result_idx_left.extend(std::iter::repeat(idx_a).take(indexes_b.len())); - result_idx_right.extend(indexes_b.iter().copied().map(Some)) + result_idx_right.extend_from_slice(bytemuck::cast_slice(indexes_b)); }, // only left values, right = null None => { result_idx_left.push(idx_a); - result_idx_right.push(None); + result_idx_right.push(NullableIdxSize::null()); }, } }); diff --git a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs index 6d97ee4735f4..c19d6f5f9de9 100644 --- a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs +++ b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs @@ -11,7 +11,7 @@ use super::*; fn par_sorted_merge_left_impl( s_left: &ChunkedArray, s_right: &ChunkedArray, -) -> (Vec, Vec>) +) -> (Vec, Vec) where T: PolarsNumericType, { @@ -39,7 +39,7 @@ where pub(super) fn par_sorted_merge_left( s_left: &Series, s_right: &Series, -) -> (Vec, Vec>) { +) -> (Vec, Vec) { // Don't use bit_repr here. It messes up sortedness. debug_assert_eq!(s_left.dtype(), s_right.dtype()); let s_left = s_left.to_physical_repr(); @@ -153,7 +153,7 @@ pub(super) fn par_sorted_merge_inner_no_nulls( } #[cfg(feature = "performant")] -fn to_left_join_ids(left_idx: Vec, right_idx: Vec>) -> LeftJoinIds { +fn to_left_join_ids(left_idx: Vec, right_idx: Vec) -> LeftJoinIds { #[cfg(feature = "chunked_ids")] { (Either::Left(left_idx), Either::Left(right_idx)) @@ -332,8 +332,11 @@ pub(super) fn sort_or_hash_left( POOL.install(|| { right.par_iter_mut().for_each(|opt_idx| { - *opt_idx = - opt_idx.map(|idx| unsafe { *reverse_idx_map.get_unchecked(idx as usize) }); + if !opt_idx.is_null_idx() { + *opt_idx = + unsafe { *reverse_idx_map.get_unchecked(opt_idx.idx() as usize) } + .into(); + } }); }); diff --git a/crates/polars-utils/src/index.rs b/crates/polars-utils/src/index.rs index 2a35558839cd..e4fe1f3efd34 100644 --- a/crates/polars-utils/src/index.rs +++ b/crates/polars-utils/src/index.rs @@ -10,23 +10,62 @@ pub type IdxSize = u32; #[cfg(feature = "bigidx")] pub type IdxSize = u64; -pub type NullableIdxSize = IdxSize; +#[derive(Clone, Copy)] +#[repr(transparent)] +pub struct NullableIdxSize { + pub inner: IdxSize, +} + +impl PartialEq for NullableIdxSize { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } +} -pub trait IsNullIdx { - fn is_null(&self) -> bool; +impl Eq for NullableIdxSize {} - fn null() -> Self; +unsafe impl bytemuck::Zeroable for NullableIdxSize {} +unsafe impl bytemuck::AnyBitPattern for NullableIdxSize {} +unsafe impl bytemuck::NoUninit for NullableIdxSize {} + +impl Debug for NullableIdxSize { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.inner) + } } -impl IsNullIdx for IdxSize { +impl NullableIdxSize { #[inline(always)] - fn is_null(&self) -> bool { - *self == IdxSize::MAX + pub fn is_null_idx(&self) -> bool { + self.inner == IdxSize::MAX } #[inline(always)] - fn null() -> Self { - IdxSize::MAX + pub const fn null() -> Self { + Self { + inner: IdxSize::MAX, + } + } + + #[inline(always)] + pub fn idx(&self) -> IdxSize { + self.inner + } + + #[inline(always)] + pub fn to_opt(&self) -> Option { + if self.is_null_idx() { + None + } else { + Some(self.idx()) + } + } +} + +impl From for NullableIdxSize { + #[inline(always)] + fn from(value: IdxSize) -> Self { + Self { inner: value } } } diff --git a/crates/polars-utils/src/lib.rs b/crates/polars-utils/src/lib.rs index bde9d6187150..575571b62985 100644 --- a/crates/polars-utils/src/lib.rs +++ b/crates/polars-utils/src/lib.rs @@ -39,5 +39,5 @@ pub mod nulls; pub mod ord; pub mod partitioned; -pub use index::{IdxSize, IsNullIdx, NullableIdxSize}; +pub use index::{IdxSize, NullableIdxSize}; pub use io::open_file;