From a4cf96ff697dd0ac3489214239fc0b783b0472e3 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 23 Jan 2025 15:04:05 +0100 Subject: [PATCH] refactor(rust): Move sum kernel to polars-compute (#20867) --- Cargo.lock | 29 --- Cargo.toml | 1 - crates/polars-arrow/Cargo.toml | 9 +- .../polars-arrow/src/compute/aggregate/mod.rs | 8 - .../src/compute/aggregate/simd/mod.rs | 52 ----- .../src/compute/aggregate/simd/native.rs | 15 -- .../src/compute/aggregate/simd/packed.rs | 26 --- .../polars-arrow/src/compute/aggregate/sum.rs | 120 ----------- crates/polars-arrow/src/types/mod.rs | 4 - crates/polars-arrow/src/types/simd/mod.rs | 168 --------------- crates/polars-arrow/src/types/simd/native.rs | 13 -- crates/polars-arrow/src/types/simd/packed.rs | 203 ------------------ crates/polars-compute/src/lib.rs | 23 ++ crates/polars-compute/src/sum.rs | 160 ++++++++++++++ .../src/chunked_array/ops/aggregate/mod.rs | 20 +- crates/polars-core/src/datatypes/mod.rs | 3 +- .../sinks/group_by/aggregates/mean.rs | 14 +- .../sinks/group_by/aggregates/sum.rs | 14 +- .../tests/it/arrow/compute/aggregate/mod.rs | 1 - .../tests/it/arrow/compute/aggregate/sum.rs | 37 ---- 20 files changed, 202 insertions(+), 718 deletions(-) delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/mod.rs delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/native.rs delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/packed.rs delete mode 100644 crates/polars-arrow/src/compute/aggregate/sum.rs delete mode 100644 crates/polars-arrow/src/types/simd/mod.rs delete mode 100644 crates/polars-arrow/src/types/simd/native.rs delete mode 100644 crates/polars-arrow/src/types/simd/packed.rs create mode 100644 crates/polars-compute/src/sum.rs delete mode 100644 crates/polars/tests/it/arrow/compute/aggregate/sum.rs diff --git a/Cargo.lock b/Cargo.lock index 689f5fa9a32c..c8d5c79dc4da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2445,28 +2445,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "multiversion" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edb7f0ff51249dfda9ab96b5823695e15a052dc15074c9dbf3d118afaf2c201" -dependencies = [ - "multiversion-macros", - "target-features", -] - -[[package]] -name = "multiversion-macros" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b093064383341eb3271f42e381cb8f10a01459478446953953c75d24bd339fc0" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.94", - "target-features", -] - [[package]] name = "native-tls" version = "0.2.12" @@ -2988,7 +2966,6 @@ dependencies = [ "itoa", "itoap", "lz4", - "multiversion", "num-traits", "parking_lot", "polars-arrow-format", @@ -4713,12 +4690,6 @@ dependencies = [ "windows", ] -[[package]] -name = "target-features" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" - [[package]] name = "target-lexicon" version = "0.12.16" diff --git a/Cargo.toml b/Cargo.toml index 2163d52915a4..372c5aa626d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,6 @@ itoap = { version = "1", features = ["simd"] } libc = "0.2" memchr = "2.6" memmap = { package = "memmap2", version = "0.9" } -multiversion = "0.8" ndarray = { version = "0.16", default-features = false } num-traits = "0.2" numpy = "0.23" diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 4927eff8392f..eadb77956e79 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -61,9 +61,6 @@ avro-schema = { workspace = true, optional = true } # for division/remainder optimization at runtime strength_reduce = { workspace = true, optional = true } -# For instruction multiversioning -multiversion = { workspace = true, optional = true } - # Faster hashing ahash = { workspace = true } @@ -122,14 +119,13 @@ io_avro_compression = [ io_avro_async = ["avro-schema/async"] # the compute kernels. Disabling this significantly reduces compile time. -compute_aggregate = ["multiversion"] +compute_aggregate = [] compute_arithmetics_decimal = ["strength_reduce"] compute_arithmetics = ["strength_reduce", "compute_arithmetics_decimal"] compute_bitwise = [] compute_boolean = [] compute_boolean_kleene = [] compute_comparison = ["compute_boolean"] -compute_hash = ["multiversion"] compute_temporal = [] compute = [ "compute_aggregate", @@ -138,7 +134,6 @@ compute = [ "compute_boolean", "compute_boolean_kleene", "compute_comparison", - "compute_hash", "compute_temporal", ] serde = ["dep:serde", "polars-schema/serde", "polars-utils/serde"] @@ -161,4 +156,4 @@ features = ["full"] rustdoc-args = ["--cfg", "docsrs"] [package.metadata.cargo-all-features] -allowlist = ["compute", "compute_sort", "compute_hash", "compute_nullif"] +allowlist = ["compute", "compute_sort", "compute_nullif"] diff --git a/crates/polars-arrow/src/compute/aggregate/mod.rs b/crates/polars-arrow/src/compute/aggregate/mod.rs index 481194c1551c..879e3a09c6ed 100644 --- a/crates/polars-arrow/src/compute/aggregate/mod.rs +++ b/crates/polars-arrow/src/compute/aggregate/mod.rs @@ -1,10 +1,2 @@ -/// ! Contains different aggregation functions -#[cfg(feature = "compute_aggregate")] -mod sum; -#[cfg(feature = "compute_aggregate")] -pub use sum::*; - mod memory; pub use memory::*; -#[cfg(feature = "compute_aggregate")] -mod simd; diff --git a/crates/polars-arrow/src/compute/aggregate/simd/mod.rs b/crates/polars-arrow/src/compute/aggregate/simd/mod.rs deleted file mode 100644 index 010ba336fc37..000000000000 --- a/crates/polars-arrow/src/compute/aggregate/simd/mod.rs +++ /dev/null @@ -1,52 +0,0 @@ -use std::ops::Add; - -use super::Sum; -use crate::types::simd::{i128x8, NativeSimd}; - -macro_rules! simd_add { - ($simd:tt, $type:ty, $lanes:expr, $add:tt) => { - impl std::ops::AddAssign for $simd { - #[inline] - fn add_assign(&mut self, rhs: Self) { - for i in 0..$lanes { - self[i] = <$type>::$add(self[i], rhs[i]); - } - } - } - - impl std::ops::Add for $simd { - type Output = Self; - - #[inline] - fn add(self, rhs: Self) -> Self::Output { - let mut result = Self::default(); - for i in 0..$lanes { - result[i] = <$type>::$add(self[i], rhs[i]); - } - result - } - } - - impl Sum<$type> for $simd { - #[inline] - fn simd_sum(self) -> $type { - let mut reduced = <$type>::default(); - (0..<$simd>::LANES).for_each(|i| { - reduced += self[i]; - }); - reduced - } - } - }; -} - -// #[cfg(not(feature = "simd"))] -// pub(super) use simd_add; - -simd_add!(i128x8, i128, 8, add); - -#[cfg(not(feature = "simd"))] -mod native; - -#[cfg(feature = "simd")] -mod packed; diff --git a/crates/polars-arrow/src/compute/aggregate/simd/native.rs b/crates/polars-arrow/src/compute/aggregate/simd/native.rs deleted file mode 100644 index eb33878decbd..000000000000 --- a/crates/polars-arrow/src/compute/aggregate/simd/native.rs +++ /dev/null @@ -1,15 +0,0 @@ -use std::ops::Add; - -use super::super::sum::Sum; -use crate::types::simd::*; - -simd_add!(u8x64, u8, 64, wrapping_add); -simd_add!(u16x32, u16, 32, wrapping_add); -simd_add!(u32x16, u32, 16, wrapping_add); -simd_add!(u64x8, u64, 8, wrapping_add); -simd_add!(i8x64, i8, 64, wrapping_add); -simd_add!(i16x32, i16, 32, wrapping_add); -simd_add!(i32x16, i32, 16, wrapping_add); -simd_add!(i64x8, i64, 8, wrapping_add); -simd_add!(f32x16, f32, 16, add); -simd_add!(f64x8, f64, 8, add); diff --git a/crates/polars-arrow/src/compute/aggregate/simd/packed.rs b/crates/polars-arrow/src/compute/aggregate/simd/packed.rs deleted file mode 100644 index 2f1241993518..000000000000 --- a/crates/polars-arrow/src/compute/aggregate/simd/packed.rs +++ /dev/null @@ -1,26 +0,0 @@ -use std::simd::prelude::{SimdFloat as _, SimdInt as _, SimdUint as _}; - -use super::super::sum::Sum; -use crate::types::simd::*; - -macro_rules! simd_sum { - ($simd:tt, $type:ty, $sum:tt) => { - impl Sum<$type> for $simd { - #[inline] - fn simd_sum(self) -> $type { - self.$sum() - } - } - }; -} - -simd_sum!(f32x16, f32, reduce_sum); -simd_sum!(f64x8, f64, reduce_sum); -simd_sum!(u8x64, u8, reduce_sum); -simd_sum!(u16x32, u16, reduce_sum); -simd_sum!(u32x16, u32, reduce_sum); -simd_sum!(u64x8, u64, reduce_sum); -simd_sum!(i8x64, i8, reduce_sum); -simd_sum!(i16x32, i16, reduce_sum); -simd_sum!(i32x16, i32, reduce_sum); -simd_sum!(i64x8, i64, reduce_sum); diff --git a/crates/polars-arrow/src/compute/aggregate/sum.rs b/crates/polars-arrow/src/compute/aggregate/sum.rs deleted file mode 100644 index e2098d969e03..000000000000 --- a/crates/polars-arrow/src/compute/aggregate/sum.rs +++ /dev/null @@ -1,120 +0,0 @@ -use std::ops::Add; - -use multiversion::multiversion; -use polars_error::PolarsResult; - -use crate::array::{Array, PrimitiveArray}; -use crate::bitmap::utils::{BitChunkIterExact, BitChunksExact}; -use crate::bitmap::Bitmap; -use crate::datatypes::PhysicalType; -use crate::scalar::*; -use crate::types::simd::*; -use crate::types::NativeType; -use crate::with_match_primitive_type; - -/// Object that can reduce itself to a number. This is used in the context of SIMD to reduce -/// a MD (e.g. `[f32; 16]`) into a single number (`f32`). -pub trait Sum { - /// Reduces this element to a single value. - fn simd_sum(self) -> T; -} - -#[multiversion(targets = "simd")] -/// Compute the sum of a slice -pub fn sum_slice(values: &[T]) -> T -where - T: NativeType + Simd + Add + std::iter::Sum, - T::Simd: Sum + Add, -{ - let (head, simd_vals, tail) = T::Simd::align(values); - - let mut reduced = T::Simd::from_incomplete_chunk(&[], T::default()); - for chunk in simd_vals { - reduced = reduced + *chunk; - } - - reduced.simd_sum() + head.iter().copied().sum() + tail.iter().copied().sum() -} - -/// # Panics -/// iff `values.len() != bitmap.len()` or the operation overflows. -#[multiversion(targets = "simd")] -fn null_sum_impl(values: &[T], mut validity_masks: I) -> T -where - T: NativeType + Simd, - T::Simd: Add + Sum, - I: BitChunkIterExact<<::Simd as NativeSimd>::Chunk>, -{ - let mut chunks = values.chunks_exact(T::Simd::LANES); - - let sum = chunks.by_ref().zip(validity_masks.by_ref()).fold( - T::Simd::default(), - |acc, (chunk, validity_chunk)| { - let chunk = T::Simd::from_chunk(chunk); - let mask = ::Mask::from_chunk(validity_chunk); - let selected = chunk.select(mask, T::Simd::default()); - acc + selected - }, - ); - - let remainder = T::Simd::from_incomplete_chunk(chunks.remainder(), T::default()); - let mask = ::Mask::from_chunk(validity_masks.remainder()); - let remainder = remainder.select(mask, T::Simd::default()); - let reduced = sum + remainder; - - reduced.simd_sum() -} - -/// # Panics -/// iff `values.len() != bitmap.len()` or the operation overflows. -fn null_sum(values: &[T], bitmap: &Bitmap) -> T -where - T: NativeType + Simd, - T::Simd: Add + Sum, -{ - let (slice, offset, length) = bitmap.as_slice(); - if offset == 0 { - let validity_masks = BitChunksExact::<::Chunk>::new(slice, length); - null_sum_impl(values, validity_masks) - } else { - let validity_masks = bitmap.chunks::<::Chunk>(); - null_sum_impl(values, validity_masks) - } -} - -/// Returns the sum of values in the array. -/// -/// Returns `None` if the array is empty or only contains null values. -pub fn sum_primitive(array: &PrimitiveArray) -> Option -where - T: NativeType + Simd + Add + std::iter::Sum, - T::Simd: Add + Sum, -{ - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - match array.validity() { - None => Some(sum_slice(array.values())), - Some(bitmap) => Some(null_sum(array.values(), bitmap)), - } -} - -/// Returns the sum of all elements in `array` as a [`Scalar`] of the same physical -/// and logical types as `array`. -/// # Error -/// Errors iff the operation is not supported. -pub fn sum(array: &dyn Array) -> PolarsResult> { - Ok(match array.dtype().to_physical_type() { - PhysicalType::Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { - let dtype = array.dtype().clone(); - let array = array.as_any().downcast_ref().unwrap(); - Box::new(PrimitiveScalar::new(dtype, sum_primitive::<$T>(array))) - }), - _ => { - unimplemented!() - }, - }) -} diff --git a/crates/polars-arrow/src/types/mod.rs b/crates/polars-arrow/src/types/mod.rs index c6f653a32311..7a22102edc16 100644 --- a/crates/polars-arrow/src/types/mod.rs +++ b/crates/polars-arrow/src/types/mod.rs @@ -16,16 +16,12 @@ //! represent chunks of bits (e.g. 8 bits via `u8`, 16 via `u16`), and [`BitChunkIter`], //! that can be used to iterate over bitmaps in [`BitChunk`]s according to //! Arrow's definition of bitmaps. -//! -//! Finally, this module contains traits used to compile code based on [`NativeType`] optimized -//! for SIMD, at [`mod@simd`]. mod aligned_bytes; pub use aligned_bytes::*; mod bit_chunk; pub use bit_chunk::{BitChunk, BitChunkIter, BitChunkOnes}; mod index; -pub mod simd; pub use index::*; mod native; pub use native::*; diff --git a/crates/polars-arrow/src/types/simd/mod.rs b/crates/polars-arrow/src/types/simd/mod.rs deleted file mode 100644 index 2666abe2ba2c..000000000000 --- a/crates/polars-arrow/src/types/simd/mod.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! Contains traits and implementations of multi-data used in SIMD. -//! The actual representation is driven by the feature flag `"simd"`, which, if set, -//! uses [`std::simd`]. -use super::{days_ms, f16, i256, months_days_ns, BitChunk, BitChunkIter, NativeType}; - -/// Describes the ability to convert itself from a [`BitChunk`]. -pub trait FromMaskChunk { - /// Convert itself from a slice. - fn from_chunk(v: T) -> Self; -} - -/// A struct that lends itself well to be compiled leveraging SIMD -/// # Safety -/// The `NativeType` and the `NativeSimd` must have possible a matching alignment. -/// e.g. slicing `&[NativeType]` by `align_of()` must be properly aligned/safe. -pub unsafe trait NativeSimd: Sized + Default + Copy { - /// Number of lanes - const LANES: usize; - /// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`. - type Native: NativeType; - /// The type holding bits for masks. - type Chunk: BitChunk; - /// Type used for masking. - type Mask: FromMaskChunk; - - /// Sets values to `default` based on `mask`. - fn select(self, mask: Self::Mask, default: Self) -> Self; - - /// Convert itself from a slice. - /// # Panics - /// * iff `v.len()` != `T::LANES` - fn from_chunk(v: &[Self::Native]) -> Self; - - /// creates a new Self from `v` by populating items from `v` up to its length. - /// Items from `v` at positions larger than the number of lanes are ignored; - /// remaining items are populated with `remaining`. - fn from_incomplete_chunk(v: &[Self::Native], remaining: Self::Native) -> Self; - - /// Returns a tuple of 3 items whose middle item is itself, and the remaining - /// are the head and tail of the un-aligned parts. - fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]); -} - -/// Trait implemented by some [`NativeType`] that have a SIMD representation. -pub trait Simd: NativeType { - /// The SIMD type associated with this trait. - /// This type supports SIMD operations - type Simd: NativeSimd; -} - -#[cfg(not(feature = "simd"))] -mod native; -#[cfg(not(feature = "simd"))] -pub use native::*; -#[cfg(feature = "simd")] -mod packed; -#[cfg(feature = "simd")] -pub use packed::*; - -macro_rules! native_simd { - ($name:tt, $type:ty, $lanes:expr, $mask:ty) => { - /// Multi-Data correspondence of the native type - #[allow(non_camel_case_types)] - #[derive(Copy, Clone)] - pub struct $name(pub [$type; $lanes]); - - unsafe impl NativeSimd for $name { - const LANES: usize = $lanes; - type Native = $type; - type Chunk = $mask; - type Mask = $mask; - - #[inline] - fn select(self, mask: $mask, default: Self) -> Self { - let mut reduced = default; - let iter = BitChunkIter::new(mask, Self::LANES); - for (i, b) in (0..Self::LANES).zip(iter) { - reduced[i] = if b { self[i] } else { reduced[i] }; - } - reduced - } - - #[inline] - fn from_chunk(v: &[$type]) -> Self { - ($name)(v.try_into().unwrap()) - } - - #[inline] - fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self { - let mut a = [remaining; $lanes]; - a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b); - Self(a) - } - - #[inline] - fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]) { - unsafe { values.align_to::() } - } - } - - impl std::ops::Index for $name { - type Output = $type; - - #[inline] - fn index(&self, index: usize) -> &Self::Output { - &self.0[index] - } - } - - impl std::ops::IndexMut for $name { - #[inline] - fn index_mut(&mut self, index: usize) -> &mut Self::Output { - &mut self.0[index] - } - } - - impl Default for $name { - #[inline] - fn default() -> Self { - ($name)([<$type>::default(); $lanes]) - } - } - }; -} - -#[cfg(not(feature = "simd"))] -pub(super) use native_simd; - -// Types do not have specific intrinsics and thus SIMD can't be specialized. -// Therefore, we can declare their MD representation as `[$t; 8]` irrespectively -// of how they are represented in the different channels. -native_simd!(f16x32, f16, 32, u32); -native_simd!(days_msx8, days_ms, 8, u8); -native_simd!(months_days_nsx8, months_days_ns, 8, u8); -native_simd!(i128x8, i128, 8, u8); -native_simd!(i256x8, i256, 8, u8); - -// In the native implementation, a mask is 1 bit wide, as per AVX512. -impl FromMaskChunk for T { - #[inline] - fn from_chunk(v: T) -> Self { - v - } -} - -macro_rules! native { - ($type:ty, $simd:ty) => { - impl Simd for $type { - type Simd = $simd; - } - }; -} - -native!(u8, u8x64); -native!(u16, u16x32); -native!(u32, u32x16); -native!(u64, u64x8); -native!(i8, i8x64); -native!(i16, i16x32); -native!(i32, i32x16); -native!(i64, i64x8); -native!(f16, f16x32); -native!(f32, f32x16); -native!(f64, f64x8); -native!(i128, i128x8); -native!(i256, i256x8); -native!(days_ms, days_msx8); -native!(months_days_ns, months_days_nsx8); diff --git a/crates/polars-arrow/src/types/simd/native.rs b/crates/polars-arrow/src/types/simd/native.rs deleted file mode 100644 index f0cb5436f4f3..000000000000 --- a/crates/polars-arrow/src/types/simd/native.rs +++ /dev/null @@ -1,13 +0,0 @@ -use super::*; - -native_simd!(u8x64, u8, 64, u64); -native_simd!(u16x32, u16, 32, u32); -native_simd!(u32x16, u32, 16, u16); -native_simd!(u64x8, u64, 8, u8); -native_simd!(i8x64, i8, 64, u64); -native_simd!(i16x32, i16, 32, u32); -native_simd!(i32x16, i32, 16, u16); -native_simd!(i64x8, i64, 8, u8); -native_simd!(f16x32, f16, 32, u32); -native_simd!(f32x16, f32, 16, u16); -native_simd!(f64x8, f64, 8, u8); diff --git a/crates/polars-arrow/src/types/simd/packed.rs b/crates/polars-arrow/src/types/simd/packed.rs deleted file mode 100644 index 3d380f96ff55..000000000000 --- a/crates/polars-arrow/src/types/simd/packed.rs +++ /dev/null @@ -1,203 +0,0 @@ -pub use std::simd::prelude::{ - f32x16, f32x8, f64x8, i16x32, i16x8, i32x16, i32x8, i64x8, i8x64, i8x8, mask32x16 as m32x16, - mask64x8 as m64x8, mask8x64 as m8x64, u16x32, u16x8, u32x16, u32x8, u64x8, u8x64, u8x8, - SimdPartialEq, -}; - -/// Vector of 32 16-bit masks -#[allow(non_camel_case_types)] -pub type m16x32 = std::simd::Mask; - -use super::*; - -macro_rules! simd { - ($name:tt, $type:ty, $lanes:expr, $chunk:ty, $mask:tt) => { - unsafe impl NativeSimd for $name { - const LANES: usize = $lanes; - type Native = $type; - type Chunk = $chunk; - type Mask = $mask; - - #[inline] - fn select(self, mask: $mask, default: Self) -> Self { - mask.select(self, default) - } - - #[inline] - fn from_chunk(v: &[$type]) -> Self { - <$name>::from_slice(v) - } - - #[inline] - fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self { - let mut a = [remaining; $lanes]; - a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b); - <$name>::from_chunk(a.as_ref()) - } - - #[inline] - fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]) { - unsafe { values.align_to::() } - } - } - }; -} - -simd!(u8x64, u8, 64, u64, m8x64); -simd!(u16x32, u16, 32, u32, m16x32); -simd!(u32x16, u32, 16, u16, m32x16); -simd!(u64x8, u64, 8, u8, m64x8); -simd!(i8x64, i8, 64, u64, m8x64); -simd!(i16x32, i16, 32, u32, m16x32); -simd!(i32x16, i32, 16, u16, m32x16); -simd!(i64x8, i64, 8, u8, m64x8); -simd!(f32x16, f32, 16, u16, m32x16); -simd!(f64x8, f64, 8, u8, m64x8); - -macro_rules! chunk_macro { - ($type:ty, $chunk:ty, $simd:ty, $mask:tt, $m:expr) => { - impl FromMaskChunk<$chunk> for $mask { - #[inline] - fn from_chunk(chunk: $chunk) -> Self { - ($m)(chunk) - } - } - }; -} - -chunk_macro!(u8, u64, u8x64, m8x64, from_chunk_u64); -chunk_macro!(u16, u32, u16x32, m16x32, from_chunk_u32); -chunk_macro!(u32, u16, u32x16, m32x16, from_chunk_u16); -chunk_macro!(u64, u8, u64x8, m64x8, from_chunk_u8); - -#[inline] -fn from_chunk_u8(chunk: u8) -> m64x8 { - let idx = u64x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]); - let vecmask = u64x8::splat(chunk as u64); - - (idx & vecmask).simd_eq(idx) -} - -#[inline] -fn from_chunk_u16(chunk: u16) -> m32x16 { - let idx = u32x16::from_array([ - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, - ]); - let vecmask = u32x16::splat(chunk as u32); - - (idx & vecmask).simd_eq(idx) -} - -#[inline] -fn from_chunk_u32(chunk: u32) -> m16x32 { - let idx = u16x32::from_array([ - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 1, 2, 4, 8, - 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, - ]); - let left = u16x32::from_chunk(&[ - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]); - let right = u16x32::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, - 1024, 2048, 4096, 8192, 16384, 32768, - ]); - - let a = chunk.to_le_bytes(); - let a1 = u16::from_le_bytes([a[0], a[1]]); - let a2 = u16::from_le_bytes([a[2], a[3]]); - - let vecmask1 = u16x32::splat(a1); - let vecmask2 = u16x32::splat(a2); - - (idx & left & vecmask1).simd_eq(idx) | (idx & right & vecmask2).simd_eq(idx) -} - -#[inline] -fn from_chunk_u64(chunk: u64) -> m8x64 { - let idx = u8x64::from_array([ - 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, - 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, - 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, - ]); - let idxs = [ - u8x64::from_chunk(&[ - 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, - 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, - 0, 0, 0, 0, 0, 0, 0, 0, - ]), - u8x64::from_chunk(&[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, - 4, 8, 16, 32, 64, 128, - ]), - ]; - - let a = chunk.to_ne_bytes(); - - let mut result = m8x64::default(); - for i in 0..8 { - result |= (idxs[i] & u8x64::splat(a[i])).simd_eq(idx) - } - - result -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_basic1() { - let a = 0b00000000000000000000000000010001u32; - let a = from_chunk_u32(a); - for i in 0..8 { - assert_eq!(a.test(i), i % 4 == 0) - } - for i in 8..32 { - assert!(!a.test(i)) - } - } - - #[test] - fn test_basic2() { - let a = 0b0000000000000000000000000000000000000001000000010000000100000001u64; - let a = from_chunk_u64(a); - for i in 0..32 { - assert_eq!(a.test(i), i % 8 == 0) - } - for i in 32..64 { - assert!(!a.test(i)) - } - } -} diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs index 9c42dcb1e38e..53d1339a9a10 100644 --- a/crates/polars-compute/src/lib.rs +++ b/crates/polars-compute/src/lib.rs @@ -26,6 +26,7 @@ pub mod if_then_else; pub mod min_max; pub mod propagate_dictionary; pub mod size; +pub mod sum; pub mod unique; pub mod var_cov; @@ -39,3 +40,25 @@ impl NotSimdPrimitive for T {} impl NotSimdPrimitive for u128 {} #[cfg(feature = "simd")] impl NotSimdPrimitive for i128 {} + +// Trait to allow blanket impl for all SIMD types when simd is enabled. +#[cfg(feature = "simd")] +mod _simd_primitive { + use std::simd::SimdElement; + pub trait SimdPrimitive: SimdElement {} + impl SimdPrimitive for u8 {} + impl SimdPrimitive for u16 {} + impl SimdPrimitive for u32 {} + impl SimdPrimitive for u64 {} + impl SimdPrimitive for usize {} + impl SimdPrimitive for i8 {} + impl SimdPrimitive for i16 {} + impl SimdPrimitive for i32 {} + impl SimdPrimitive for i64 {} + impl SimdPrimitive for isize {} + impl SimdPrimitive for f32 {} + impl SimdPrimitive for f64 {} +} + +#[cfg(feature = "simd")] +pub use _simd_primitive::SimdPrimitive; diff --git a/crates/polars-compute/src/sum.rs b/crates/polars-compute/src/sum.rs new file mode 100644 index 000000000000..1179fe0366bf --- /dev/null +++ b/crates/polars-compute/src/sum.rs @@ -0,0 +1,160 @@ +use std::ops::Add; +#[cfg(feature = "simd")] +use std::simd::prelude::*; + +use arrow::array::{Array, PrimitiveArray}; +use arrow::bitmap::bitmask::BitMask; +use arrow::types::NativeType; +use num_traits::Zero; + +macro_rules! wrapping_impl { + ($trait_name:ident, $method:ident, $t:ty) => { + impl $trait_name for $t { + #[inline(always)] + fn wrapping_add(&self, v: &Self) -> Self { + <$t>::$method(*self, *v) + } + } + }; +} + +/// Performs addition that wraps around on overflow. +/// +/// Differs from num::WrappingAdd in that this is also implemented for floats. +pub trait WrappingAdd: Sized { + /// Wrapping (modular) addition. Computes `self + other`, wrapping around at + /// the boundary of the type. + fn wrapping_add(&self, v: &Self) -> Self; +} + +wrapping_impl!(WrappingAdd, wrapping_add, u8); +wrapping_impl!(WrappingAdd, wrapping_add, u16); +wrapping_impl!(WrappingAdd, wrapping_add, u32); +wrapping_impl!(WrappingAdd, wrapping_add, u64); +wrapping_impl!(WrappingAdd, wrapping_add, usize); +wrapping_impl!(WrappingAdd, wrapping_add, u128); + +wrapping_impl!(WrappingAdd, wrapping_add, i8); +wrapping_impl!(WrappingAdd, wrapping_add, i16); +wrapping_impl!(WrappingAdd, wrapping_add, i32); +wrapping_impl!(WrappingAdd, wrapping_add, i64); +wrapping_impl!(WrappingAdd, wrapping_add, isize); +wrapping_impl!(WrappingAdd, wrapping_add, i128); + +wrapping_impl!(WrappingAdd, add, f32); +wrapping_impl!(WrappingAdd, add, f64); + +#[cfg(feature = "simd")] +const STRIPE: usize = 16; + +fn wrapping_sum_with_mask_scalar(vals: &[T], mask: &BitMask) -> T { + assert!(vals.len() == mask.len()); + vals.iter() + .enumerate() + .map(|(i, x)| { + // No filter but rather select of 0 for cmov opt. + if mask.get(i) { + *x + } else { + T::zero() + } + }) + .fold(T::zero(), |a, b| a.wrapping_add(&b)) +} + +#[cfg(not(feature = "simd"))] +impl WrappingSum for T +where + T: NativeType + WrappingAdd + Zero, +{ + fn wrapping_sum(vals: &[Self]) -> Self { + vals.iter() + .copied() + .fold(T::zero(), |a, b| a.wrapping_add(&b)) + } + + fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self { + wrapping_sum_with_mask_scalar(vals, mask) + } +} + +#[cfg(feature = "simd")] +impl WrappingSum for T +where + T: NativeType + WrappingAdd + Zero + crate::SimdPrimitive, +{ + fn wrapping_sum(vals: &[Self]) -> Self { + vals.iter() + .copied() + .fold(T::zero(), |a, b| a.wrapping_add(&b)) + } + + fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self { + assert!(vals.len() == mask.len()); + let remainder = vals.len() % STRIPE; + let (rest, main) = vals.split_at(remainder); + let (rest_mask, main_mask) = mask.split_at(remainder); + let zero: Simd = Simd::default(); + + let vsum = main + .chunks_exact(STRIPE) + .enumerate() + .map(|(i, a)| { + let m: Mask<_, STRIPE> = main_mask.get_simd(i * STRIPE); + m.select(Simd::from_slice(a), zero) + }) + .fold(zero, |a, b| { + let a = a.to_array(); + let b = b.to_array(); + Simd::from_array(std::array::from_fn(|i| a[i].wrapping_add(&b[i]))) + }); + + let mainsum = vsum + .to_array() + .into_iter() + .fold(T::zero(), |a, b| a.wrapping_add(&b)); + + // TODO: faster remainder. + let restsum = wrapping_sum_with_mask_scalar(rest, &rest_mask); + mainsum.wrapping_add(&restsum) + } +} + +#[cfg(feature = "simd")] +impl WrappingSum for u128 { + fn wrapping_sum(vals: &[Self]) -> Self { + vals.iter().copied().fold(0, |a, b| a.wrapping_add(b)) + } + + fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self { + wrapping_sum_with_mask_scalar(vals, mask) + } +} + +#[cfg(feature = "simd")] +impl WrappingSum for i128 { + fn wrapping_sum(vals: &[Self]) -> Self { + vals.iter().copied().fold(0, |a, b| a.wrapping_add(b)) + } + + fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self { + wrapping_sum_with_mask_scalar(vals, mask) + } +} + +pub trait WrappingSum: Sized { + fn wrapping_sum(vals: &[Self]) -> Self; + fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self; +} + +pub fn wrapping_sum_arr(arr: &PrimitiveArray) -> T +where + T: NativeType + WrappingSum, +{ + let validity = arr.validity().filter(|_| arr.null_count() > 0); + if let Some(mask) = validity { + WrappingSum::wrapping_sum_with_validity(arr.values(), &BitMask::from_bitmap(mask)) + } else { + WrappingSum::wrapping_sum(arr.values()) + } +} diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index ef5fd105203d..9829d6aa0832 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -2,14 +2,11 @@ mod quantile; mod var; -use std::ops::Add; - -use arrow::compute; -use arrow::types::simd::Simd; use arrow::types::NativeType; use num_traits::{Float, One, ToPrimitive, Zero}; use polars_compute::float_sum; use polars_compute::min_max::MinMaxKernel; +use polars_compute::sum::{wrapping_sum_arr, WrappingSum}; use polars_utils::min_max::MinMax; use polars_utils::sync::SyncPtr; pub use quantile::*; @@ -45,8 +42,7 @@ pub trait ChunkAggSeries { fn sum(array: &PrimitiveArray) -> T where - T: NumericNative + NativeType, - ::Simd: Add::Simd> + compute::aggregate::Sum, + T: NumericNative + NativeType + WrappingSum, { if array.null_count() == array.len() { return T::default(); @@ -69,16 +65,15 @@ where } } } else { - compute::aggregate::sum_primitive(array).unwrap_or(T::zero()) + wrapping_sum_arr(array) } } impl ChunkAgg for ChunkedArray where T: PolarsNumericType, + T::Native: WrappingSum, PrimitiveArray: for<'a> MinMaxKernel = T::Native>, - ::Simd: - Add::Simd> + compute::aggregate::Sum, { fn sum(&self) -> Option { Some( @@ -270,9 +265,8 @@ impl BooleanChunked { impl ChunkAggSeries for ChunkedArray where T: PolarsNumericType, + T::Native: WrappingSum, PrimitiveArray: for<'a> MinMaxKernel = T::Native>, - ::Simd: - Add::Simd> + compute::aggregate::Sum, ChunkedArray: IntoSeries, { fn sum_reduce(&self) -> Scalar { @@ -345,9 +339,7 @@ impl VarAggSeries for Float64Chunked { impl QuantileAggSeries for ChunkedArray where T: PolarsIntegerType, - T::Native: Ord, - ::Simd: - Add::Simd> + compute::aggregate::Sum, + T::Native: Ord + WrappingSum, { fn quantile_reduce(&self, quantile: f64, method: QuantileMethod) -> PolarsResult { let v = self.quantile(quantile, method)?; diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index 4c4c726037eb..1d6196e167d7 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -30,7 +30,6 @@ pub use arrow::datatypes::reshape::*; #[cfg(feature = "dtype-categorical")] use arrow::datatypes::IntegerType; pub use arrow::datatypes::{ArrowDataType, TimeUnit as ArrowTimeUnit}; -use arrow::types::simd::Simd; use arrow::types::NativeType; use bytemuck::Zeroable; pub use dtype::*; @@ -331,7 +330,7 @@ pub trait NumericNative: + NumCast + Zero + One - + Simd + // + Simd // + Simd8 + std::iter::Sum + Add diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs index 8b1374c504d3..0be644ed28ff 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs @@ -2,12 +2,10 @@ use std::any::Any; use std::ops::Add; use arrow::array::{Array, PrimitiveArray}; -use arrow::compute::aggregate::Sum; -use arrow::types::simd::Simd; +use polars_compute::sum::{wrapping_sum_arr, WrappingSum}; use polars_core::export::arrow::datatypes::PrimitiveType; use polars_core::export::num::NumCast; use polars_core::prelude::*; -use polars_core::utils::arrow::compute::aggregate::sum_primitive; use super::*; @@ -28,8 +26,7 @@ impl MeanAgg { impl AggregateFn for MeanAgg where K::PolarsType: PolarsNumericType, - K: NumericNative + Add, - ::Simd: Add::Simd> + Sum, + K: NumericNative + Add + WrappingSum, { fn has_physical_agg(&self) -> bool { true @@ -83,16 +80,15 @@ where .downcast_ref::>() .unwrap_unchecked() }; - match (sum_primitive(arr), self.sum) { - (Some(val), Some(sum)) => { + match (wrapping_sum_arr(arr), self.sum) { + (val, Some(sum)) => { self.sum = Some(sum + val); self.count += (arr.len() - arr.null_count()) as IdxSize; }, - (Some(val), None) => { + (val, None) => { self.sum = Some(val); self.count += (arr.len() - arr.null_count()) as IdxSize; }, - _ => {}, } } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs index 202ec3969dd5..a1c42029b332 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs @@ -2,11 +2,9 @@ use std::any::Any; use std::ops::Add; use arrow::array::PrimitiveArray; -use arrow::compute::aggregate::Sum; -use arrow::types::simd::Simd; +use polars_compute::sum::{wrapping_sum_arr, WrappingSum}; use polars_core::export::num::NumCast; use polars_core::prelude::*; -use polars_core::utils::arrow::compute::aggregate::sum_primitive; use super::*; @@ -23,8 +21,7 @@ impl SumAgg { impl AggregateFn for SumAgg where K::PolarsType: PolarsNumericType, - K: NumericNative + Add, - ::Simd: Add::Simd> + Sum, + K: NumericNative + Add + WrappingSum, { fn has_physical_agg(&self) -> bool { true @@ -61,14 +58,13 @@ where .downcast_ref::>() .unwrap_unchecked() }; - match (sum_primitive(arr), self.sum) { - (Some(val), Some(sum)) => { + match (wrapping_sum_arr(arr), self.sum) { + (val, Some(sum)) => { self.sum = Some(sum + val); }, - (Some(val), None) => { + (val, None) => { self.sum = Some(val); }, - _ => {}, } } diff --git a/crates/polars/tests/it/arrow/compute/aggregate/mod.rs b/crates/polars/tests/it/arrow/compute/aggregate/mod.rs index d7de8a8c37c5..2363540356a2 100644 --- a/crates/polars/tests/it/arrow/compute/aggregate/mod.rs +++ b/crates/polars/tests/it/arrow/compute/aggregate/mod.rs @@ -1,2 +1 @@ mod memory; -mod sum; diff --git a/crates/polars/tests/it/arrow/compute/aggregate/sum.rs b/crates/polars/tests/it/arrow/compute/aggregate/sum.rs deleted file mode 100644 index 011f75aad356..000000000000 --- a/crates/polars/tests/it/arrow/compute/aggregate/sum.rs +++ /dev/null @@ -1,37 +0,0 @@ -use arrow::array::*; -use arrow::compute::aggregate::{sum, sum_primitive}; -use arrow::datatypes::ArrowDataType; -use arrow::scalar::{PrimitiveScalar, Scalar}; - -#[test] -fn test_primitive_array_sum() { - let a = Int32Array::from_slice([1, 2, 3, 4, 5]); - assert_eq!( - &PrimitiveScalar::::from(Some(15)) as &dyn Scalar, - sum(&a).unwrap().as_ref() - ); - - let a = a.to(ArrowDataType::Date32); - assert_eq!( - &PrimitiveScalar::::from(Some(15)).to(ArrowDataType::Date32) as &dyn Scalar, - sum(&a).unwrap().as_ref() - ); -} - -#[test] -fn test_primitive_array_float_sum() { - let a = Float64Array::from_slice([1.1f64, 2.2, 3.3, 4.4, 5.5]); - assert!((16.5 - sum_primitive(&a).unwrap()).abs() < f64::EPSILON); -} - -#[test] -fn test_primitive_array_sum_with_nulls() { - let a = Int32Array::from(&[None, Some(2), Some(3), None, Some(5)]); - assert_eq!(10, sum_primitive(&a).unwrap()); -} - -#[test] -fn test_primitive_array_sum_all_nulls() { - let a = Int32Array::from(&[None, None, None]); - assert_eq!(None, sum_primitive(&a)); -}