From a4cf96ff697dd0ac3489214239fc0b783b0472e3 Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Thu, 23 Jan 2025 15:04:05 +0100
Subject: [PATCH] refactor(rust): Move sum kernel to polars-compute (#20867)

---
 Cargo.lock                                    |  29 ---
 Cargo.toml                                    |   1 -
 crates/polars-arrow/Cargo.toml                |   9 +-
 .../polars-arrow/src/compute/aggregate/mod.rs |   8 -
 .../src/compute/aggregate/simd/mod.rs         |  52 -----
 .../src/compute/aggregate/simd/native.rs      |  15 --
 .../src/compute/aggregate/simd/packed.rs      |  26 ---
 .../polars-arrow/src/compute/aggregate/sum.rs | 120 -----------
 crates/polars-arrow/src/types/mod.rs          |   4 -
 crates/polars-arrow/src/types/simd/mod.rs     | 168 ---------------
 crates/polars-arrow/src/types/simd/native.rs  |  13 --
 crates/polars-arrow/src/types/simd/packed.rs  | 203 ------------------
 crates/polars-compute/src/lib.rs              |  23 ++
 crates/polars-compute/src/sum.rs              | 160 ++++++++++++++
 .../src/chunked_array/ops/aggregate/mod.rs    |  20 +-
 crates/polars-core/src/datatypes/mod.rs       |   3 +-
 .../sinks/group_by/aggregates/mean.rs         |  14 +-
 .../sinks/group_by/aggregates/sum.rs          |  14 +-
 .../tests/it/arrow/compute/aggregate/mod.rs   |   1 -
 .../tests/it/arrow/compute/aggregate/sum.rs   |  37 ----
 20 files changed, 202 insertions(+), 718 deletions(-)
 delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/mod.rs
 delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/native.rs
 delete mode 100644 crates/polars-arrow/src/compute/aggregate/simd/packed.rs
 delete mode 100644 crates/polars-arrow/src/compute/aggregate/sum.rs
 delete mode 100644 crates/polars-arrow/src/types/simd/mod.rs
 delete mode 100644 crates/polars-arrow/src/types/simd/native.rs
 delete mode 100644 crates/polars-arrow/src/types/simd/packed.rs
 create mode 100644 crates/polars-compute/src/sum.rs
 delete mode 100644 crates/polars/tests/it/arrow/compute/aggregate/sum.rs

diff --git a/Cargo.lock b/Cargo.lock
index 689f5fa9a32c..c8d5c79dc4da 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2445,28 +2445,6 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "multiversion"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7edb7f0ff51249dfda9ab96b5823695e15a052dc15074c9dbf3d118afaf2c201"
-dependencies = [
- "multiversion-macros",
- "target-features",
-]
-
-[[package]]
-name = "multiversion-macros"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b093064383341eb3271f42e381cb8f10a01459478446953953c75d24bd339fc0"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.94",
- "target-features",
-]
-
 [[package]]
 name = "native-tls"
 version = "0.2.12"
@@ -2988,7 +2966,6 @@ dependencies = [
  "itoa",
  "itoap",
  "lz4",
- "multiversion",
  "num-traits",
  "parking_lot",
  "polars-arrow-format",
@@ -4713,12 +4690,6 @@ dependencies = [
  "windows",
 ]
 
-[[package]]
-name = "target-features"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5"
-
 [[package]]
 name = "target-lexicon"
 version = "0.12.16"
diff --git a/Cargo.toml b/Cargo.toml
index 2163d52915a4..372c5aa626d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,7 +53,6 @@ itoap = { version = "1", features = ["simd"] }
 libc = "0.2"
 memchr = "2.6"
 memmap = { package = "memmap2", version = "0.9" }
-multiversion = "0.8"
 ndarray = { version = "0.16", default-features = false }
 num-traits = "0.2"
 numpy = "0.23"
diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml
index 4927eff8392f..eadb77956e79 100644
--- a/crates/polars-arrow/Cargo.toml
+++ b/crates/polars-arrow/Cargo.toml
@@ -61,9 +61,6 @@ avro-schema = { workspace = true, optional = true }
 # for division/remainder optimization at runtime
 strength_reduce = { workspace = true, optional = true }
 
-# For instruction multiversioning
-multiversion = { workspace = true, optional = true }
-
 # Faster hashing
 ahash = { workspace = true }
 
@@ -122,14 +119,13 @@ io_avro_compression = [
 io_avro_async = ["avro-schema/async"]
 
 # the compute kernels. Disabling this significantly reduces compile time.
-compute_aggregate = ["multiversion"]
+compute_aggregate = []
 compute_arithmetics_decimal = ["strength_reduce"]
 compute_arithmetics = ["strength_reduce", "compute_arithmetics_decimal"]
 compute_bitwise = []
 compute_boolean = []
 compute_boolean_kleene = []
 compute_comparison = ["compute_boolean"]
-compute_hash = ["multiversion"]
 compute_temporal = []
 compute = [
   "compute_aggregate",
@@ -138,7 +134,6 @@ compute = [
   "compute_boolean",
   "compute_boolean_kleene",
   "compute_comparison",
-  "compute_hash",
   "compute_temporal",
 ]
 serde = ["dep:serde", "polars-schema/serde", "polars-utils/serde"]
@@ -161,4 +156,4 @@ features = ["full"]
 rustdoc-args = ["--cfg", "docsrs"]
 
 [package.metadata.cargo-all-features]
-allowlist = ["compute", "compute_sort", "compute_hash", "compute_nullif"]
+allowlist = ["compute", "compute_sort", "compute_nullif"]
diff --git a/crates/polars-arrow/src/compute/aggregate/mod.rs b/crates/polars-arrow/src/compute/aggregate/mod.rs
index 481194c1551c..879e3a09c6ed 100644
--- a/crates/polars-arrow/src/compute/aggregate/mod.rs
+++ b/crates/polars-arrow/src/compute/aggregate/mod.rs
@@ -1,10 +1,2 @@
-/// ! Contains different aggregation functions
-#[cfg(feature = "compute_aggregate")]
-mod sum;
-#[cfg(feature = "compute_aggregate")]
-pub use sum::*;
-
 mod memory;
 pub use memory::*;
-#[cfg(feature = "compute_aggregate")]
-mod simd;
diff --git a/crates/polars-arrow/src/compute/aggregate/simd/mod.rs b/crates/polars-arrow/src/compute/aggregate/simd/mod.rs
deleted file mode 100644
index 010ba336fc37..000000000000
--- a/crates/polars-arrow/src/compute/aggregate/simd/mod.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-use std::ops::Add;
-
-use super::Sum;
-use crate::types::simd::{i128x8, NativeSimd};
-
-macro_rules! simd_add {
-    ($simd:tt, $type:ty, $lanes:expr, $add:tt) => {
-        impl std::ops::AddAssign for $simd {
-            #[inline]
-            fn add_assign(&mut self, rhs: Self) {
-                for i in 0..$lanes {
-                    self[i] = <$type>::$add(self[i], rhs[i]);
-                }
-            }
-        }
-
-        impl std::ops::Add for $simd {
-            type Output = Self;
-
-            #[inline]
-            fn add(self, rhs: Self) -> Self::Output {
-                let mut result = Self::default();
-                for i in 0..$lanes {
-                    result[i] = <$type>::$add(self[i], rhs[i]);
-                }
-                result
-            }
-        }
-
-        impl Sum<$type> for $simd {
-            #[inline]
-            fn simd_sum(self) -> $type {
-                let mut reduced = <$type>::default();
-                (0..<$simd>::LANES).for_each(|i| {
-                    reduced += self[i];
-                });
-                reduced
-            }
-        }
-    };
-}
-
-// #[cfg(not(feature = "simd"))]
-// pub(super) use simd_add;
-
-simd_add!(i128x8, i128, 8, add);
-
-#[cfg(not(feature = "simd"))]
-mod native;
-
-#[cfg(feature = "simd")]
-mod packed;
diff --git a/crates/polars-arrow/src/compute/aggregate/simd/native.rs b/crates/polars-arrow/src/compute/aggregate/simd/native.rs
deleted file mode 100644
index eb33878decbd..000000000000
--- a/crates/polars-arrow/src/compute/aggregate/simd/native.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-use std::ops::Add;
-
-use super::super::sum::Sum;
-use crate::types::simd::*;
-
-simd_add!(u8x64, u8, 64, wrapping_add);
-simd_add!(u16x32, u16, 32, wrapping_add);
-simd_add!(u32x16, u32, 16, wrapping_add);
-simd_add!(u64x8, u64, 8, wrapping_add);
-simd_add!(i8x64, i8, 64, wrapping_add);
-simd_add!(i16x32, i16, 32, wrapping_add);
-simd_add!(i32x16, i32, 16, wrapping_add);
-simd_add!(i64x8, i64, 8, wrapping_add);
-simd_add!(f32x16, f32, 16, add);
-simd_add!(f64x8, f64, 8, add);
diff --git a/crates/polars-arrow/src/compute/aggregate/simd/packed.rs b/crates/polars-arrow/src/compute/aggregate/simd/packed.rs
deleted file mode 100644
index 2f1241993518..000000000000
--- a/crates/polars-arrow/src/compute/aggregate/simd/packed.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-use std::simd::prelude::{SimdFloat as _, SimdInt as _, SimdUint as _};
-
-use super::super::sum::Sum;
-use crate::types::simd::*;
-
-macro_rules! simd_sum {
-    ($simd:tt, $type:ty, $sum:tt) => {
-        impl Sum<$type> for $simd {
-            #[inline]
-            fn simd_sum(self) -> $type {
-                self.$sum()
-            }
-        }
-    };
-}
-
-simd_sum!(f32x16, f32, reduce_sum);
-simd_sum!(f64x8, f64, reduce_sum);
-simd_sum!(u8x64, u8, reduce_sum);
-simd_sum!(u16x32, u16, reduce_sum);
-simd_sum!(u32x16, u32, reduce_sum);
-simd_sum!(u64x8, u64, reduce_sum);
-simd_sum!(i8x64, i8, reduce_sum);
-simd_sum!(i16x32, i16, reduce_sum);
-simd_sum!(i32x16, i32, reduce_sum);
-simd_sum!(i64x8, i64, reduce_sum);
diff --git a/crates/polars-arrow/src/compute/aggregate/sum.rs b/crates/polars-arrow/src/compute/aggregate/sum.rs
deleted file mode 100644
index e2098d969e03..000000000000
--- a/crates/polars-arrow/src/compute/aggregate/sum.rs
+++ /dev/null
@@ -1,120 +0,0 @@
-use std::ops::Add;
-
-use multiversion::multiversion;
-use polars_error::PolarsResult;
-
-use crate::array::{Array, PrimitiveArray};
-use crate::bitmap::utils::{BitChunkIterExact, BitChunksExact};
-use crate::bitmap::Bitmap;
-use crate::datatypes::PhysicalType;
-use crate::scalar::*;
-use crate::types::simd::*;
-use crate::types::NativeType;
-use crate::with_match_primitive_type;
-
-/// Object that can reduce itself to a number. This is used in the context of SIMD to reduce
-/// a MD (e.g. `[f32; 16]`) into a single number (`f32`).
-pub trait Sum<T> {
-    /// Reduces this element to a single value.
-    fn simd_sum(self) -> T;
-}
-
-#[multiversion(targets = "simd")]
-/// Compute the sum of a slice
-pub fn sum_slice<T>(values: &[T]) -> T
-where
-    T: NativeType + Simd + Add<Output = T> + std::iter::Sum<T>,
-    T::Simd: Sum<T> + Add<Output = T::Simd>,
-{
-    let (head, simd_vals, tail) = T::Simd::align(values);
-
-    let mut reduced = T::Simd::from_incomplete_chunk(&[], T::default());
-    for chunk in simd_vals {
-        reduced = reduced + *chunk;
-    }
-
-    reduced.simd_sum() + head.iter().copied().sum() + tail.iter().copied().sum()
-}
-
-/// # Panics
-/// iff `values.len() != bitmap.len()` or the operation overflows.
-#[multiversion(targets = "simd")]
-fn null_sum_impl<T, I>(values: &[T], mut validity_masks: I) -> T
-where
-    T: NativeType + Simd,
-    T::Simd: Add<Output = T::Simd> + Sum<T>,
-    I: BitChunkIterExact<<<T as Simd>::Simd as NativeSimd>::Chunk>,
-{
-    let mut chunks = values.chunks_exact(T::Simd::LANES);
-
-    let sum = chunks.by_ref().zip(validity_masks.by_ref()).fold(
-        T::Simd::default(),
-        |acc, (chunk, validity_chunk)| {
-            let chunk = T::Simd::from_chunk(chunk);
-            let mask = <T::Simd as NativeSimd>::Mask::from_chunk(validity_chunk);
-            let selected = chunk.select(mask, T::Simd::default());
-            acc + selected
-        },
-    );
-
-    let remainder = T::Simd::from_incomplete_chunk(chunks.remainder(), T::default());
-    let mask = <T::Simd as NativeSimd>::Mask::from_chunk(validity_masks.remainder());
-    let remainder = remainder.select(mask, T::Simd::default());
-    let reduced = sum + remainder;
-
-    reduced.simd_sum()
-}
-
-/// # Panics
-/// iff `values.len() != bitmap.len()` or the operation overflows.
-fn null_sum<T>(values: &[T], bitmap: &Bitmap) -> T
-where
-    T: NativeType + Simd,
-    T::Simd: Add<Output = T::Simd> + Sum<T>,
-{
-    let (slice, offset, length) = bitmap.as_slice();
-    if offset == 0 {
-        let validity_masks = BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(slice, length);
-        null_sum_impl(values, validity_masks)
-    } else {
-        let validity_masks = bitmap.chunks::<<T::Simd as NativeSimd>::Chunk>();
-        null_sum_impl(values, validity_masks)
-    }
-}
-
-/// Returns the sum of values in the array.
-///
-/// Returns `None` if the array is empty or only contains null values.
-pub fn sum_primitive<T>(array: &PrimitiveArray<T>) -> Option<T>
-where
-    T: NativeType + Simd + Add<Output = T> + std::iter::Sum<T>,
-    T::Simd: Add<Output = T::Simd> + Sum<T>,
-{
-    let null_count = array.null_count();
-
-    if null_count == array.len() {
-        return None;
-    }
-
-    match array.validity() {
-        None => Some(sum_slice(array.values())),
-        Some(bitmap) => Some(null_sum(array.values(), bitmap)),
-    }
-}
-
-/// Returns the sum of all elements in `array` as a [`Scalar`] of the same physical
-/// and logical types as `array`.
-/// # Error
-/// Errors iff the operation is not supported.
-pub fn sum(array: &dyn Array) -> PolarsResult<Box<dyn Scalar>> {
-    Ok(match array.dtype().to_physical_type() {
-        PhysicalType::Primitive(primitive) => with_match_primitive_type!(primitive, |$T| {
-            let dtype = array.dtype().clone();
-            let array = array.as_any().downcast_ref().unwrap();
-            Box::new(PrimitiveScalar::new(dtype, sum_primitive::<$T>(array)))
-        }),
-        _ => {
-            unimplemented!()
-        },
-    })
-}
diff --git a/crates/polars-arrow/src/types/mod.rs b/crates/polars-arrow/src/types/mod.rs
index c6f653a32311..7a22102edc16 100644
--- a/crates/polars-arrow/src/types/mod.rs
+++ b/crates/polars-arrow/src/types/mod.rs
@@ -16,16 +16,12 @@
 //! represent chunks of bits (e.g. 8 bits via `u8`, 16 via `u16`), and [`BitChunkIter`],
 //! that can be used to iterate over bitmaps in [`BitChunk`]s according to
 //! Arrow's definition of bitmaps.
-//!
-//! Finally, this module contains traits used to compile code based on [`NativeType`] optimized
-//! for SIMD, at [`mod@simd`].
 
 mod aligned_bytes;
 pub use aligned_bytes::*;
 mod bit_chunk;
 pub use bit_chunk::{BitChunk, BitChunkIter, BitChunkOnes};
 mod index;
-pub mod simd;
 pub use index::*;
 mod native;
 pub use native::*;
diff --git a/crates/polars-arrow/src/types/simd/mod.rs b/crates/polars-arrow/src/types/simd/mod.rs
deleted file mode 100644
index 2666abe2ba2c..000000000000
--- a/crates/polars-arrow/src/types/simd/mod.rs
+++ /dev/null
@@ -1,168 +0,0 @@
-//! Contains traits and implementations of multi-data used in SIMD.
-//! The actual representation is driven by the feature flag `"simd"`, which, if set,
-//! uses [`std::simd`].
-use super::{days_ms, f16, i256, months_days_ns, BitChunk, BitChunkIter, NativeType};
-
-/// Describes the ability to convert itself from a [`BitChunk`].
-pub trait FromMaskChunk<T> {
-    /// Convert itself from a slice.
-    fn from_chunk(v: T) -> Self;
-}
-
-/// A struct that lends itself well to be compiled leveraging SIMD
-/// # Safety
-/// The `NativeType` and the `NativeSimd` must have possible a matching alignment.
-/// e.g. slicing `&[NativeType]` by `align_of<NativeSimd>()` must be properly aligned/safe.
-pub unsafe trait NativeSimd: Sized + Default + Copy {
-    /// Number of lanes
-    const LANES: usize;
-    /// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`.
-    type Native: NativeType;
-    /// The type holding bits for masks.
-    type Chunk: BitChunk;
-    /// Type used for masking.
-    type Mask: FromMaskChunk<Self::Chunk>;
-
-    /// Sets values to `default` based on `mask`.
-    fn select(self, mask: Self::Mask, default: Self) -> Self;
-
-    /// Convert itself from a slice.
-    /// # Panics
-    /// * iff `v.len()` != `T::LANES`
-    fn from_chunk(v: &[Self::Native]) -> Self;
-
-    /// creates a new Self from `v` by populating items from `v` up to its length.
-    /// Items from `v` at positions larger than the number of lanes are ignored;
-    /// remaining items are populated with `remaining`.
-    fn from_incomplete_chunk(v: &[Self::Native], remaining: Self::Native) -> Self;
-
-    /// Returns a tuple of 3 items whose middle item is itself, and the remaining
-    /// are the head and tail of the un-aligned parts.
-    fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]);
-}
-
-/// Trait implemented by some [`NativeType`] that have a SIMD representation.
-pub trait Simd: NativeType {
-    /// The SIMD type associated with this trait.
-    /// This type supports SIMD operations
-    type Simd: NativeSimd<Native = Self>;
-}
-
-#[cfg(not(feature = "simd"))]
-mod native;
-#[cfg(not(feature = "simd"))]
-pub use native::*;
-#[cfg(feature = "simd")]
-mod packed;
-#[cfg(feature = "simd")]
-pub use packed::*;
-
-macro_rules! native_simd {
-    ($name:tt, $type:ty, $lanes:expr, $mask:ty) => {
-        /// Multi-Data correspondence of the native type
-        #[allow(non_camel_case_types)]
-        #[derive(Copy, Clone)]
-        pub struct $name(pub [$type; $lanes]);
-
-        unsafe impl NativeSimd for $name {
-            const LANES: usize = $lanes;
-            type Native = $type;
-            type Chunk = $mask;
-            type Mask = $mask;
-
-            #[inline]
-            fn select(self, mask: $mask, default: Self) -> Self {
-                let mut reduced = default;
-                let iter = BitChunkIter::new(mask, Self::LANES);
-                for (i, b) in (0..Self::LANES).zip(iter) {
-                    reduced[i] = if b { self[i] } else { reduced[i] };
-                }
-                reduced
-            }
-
-            #[inline]
-            fn from_chunk(v: &[$type]) -> Self {
-                ($name)(v.try_into().unwrap())
-            }
-
-            #[inline]
-            fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
-                let mut a = [remaining; $lanes];
-                a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
-                Self(a)
-            }
-
-            #[inline]
-            fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]) {
-                unsafe { values.align_to::<Self>() }
-            }
-        }
-
-        impl std::ops::Index<usize> for $name {
-            type Output = $type;
-
-            #[inline]
-            fn index(&self, index: usize) -> &Self::Output {
-                &self.0[index]
-            }
-        }
-
-        impl std::ops::IndexMut<usize> for $name {
-            #[inline]
-            fn index_mut(&mut self, index: usize) -> &mut Self::Output {
-                &mut self.0[index]
-            }
-        }
-
-        impl Default for $name {
-            #[inline]
-            fn default() -> Self {
-                ($name)([<$type>::default(); $lanes])
-            }
-        }
-    };
-}
-
-#[cfg(not(feature = "simd"))]
-pub(super) use native_simd;
-
-// Types do not have specific intrinsics and thus SIMD can't be specialized.
-// Therefore, we can declare their MD representation as `[$t; 8]` irrespectively
-// of how they are represented in the different channels.
-native_simd!(f16x32, f16, 32, u32);
-native_simd!(days_msx8, days_ms, 8, u8);
-native_simd!(months_days_nsx8, months_days_ns, 8, u8);
-native_simd!(i128x8, i128, 8, u8);
-native_simd!(i256x8, i256, 8, u8);
-
-// In the native implementation, a mask is 1 bit wide, as per AVX512.
-impl<T: BitChunk> FromMaskChunk<T> for T {
-    #[inline]
-    fn from_chunk(v: T) -> Self {
-        v
-    }
-}
-
-macro_rules! native {
-    ($type:ty, $simd:ty) => {
-        impl Simd for $type {
-            type Simd = $simd;
-        }
-    };
-}
-
-native!(u8, u8x64);
-native!(u16, u16x32);
-native!(u32, u32x16);
-native!(u64, u64x8);
-native!(i8, i8x64);
-native!(i16, i16x32);
-native!(i32, i32x16);
-native!(i64, i64x8);
-native!(f16, f16x32);
-native!(f32, f32x16);
-native!(f64, f64x8);
-native!(i128, i128x8);
-native!(i256, i256x8);
-native!(days_ms, days_msx8);
-native!(months_days_ns, months_days_nsx8);
diff --git a/crates/polars-arrow/src/types/simd/native.rs b/crates/polars-arrow/src/types/simd/native.rs
deleted file mode 100644
index f0cb5436f4f3..000000000000
--- a/crates/polars-arrow/src/types/simd/native.rs
+++ /dev/null
@@ -1,13 +0,0 @@
-use super::*;
-
-native_simd!(u8x64, u8, 64, u64);
-native_simd!(u16x32, u16, 32, u32);
-native_simd!(u32x16, u32, 16, u16);
-native_simd!(u64x8, u64, 8, u8);
-native_simd!(i8x64, i8, 64, u64);
-native_simd!(i16x32, i16, 32, u32);
-native_simd!(i32x16, i32, 16, u16);
-native_simd!(i64x8, i64, 8, u8);
-native_simd!(f16x32, f16, 32, u32);
-native_simd!(f32x16, f32, 16, u16);
-native_simd!(f64x8, f64, 8, u8);
diff --git a/crates/polars-arrow/src/types/simd/packed.rs b/crates/polars-arrow/src/types/simd/packed.rs
deleted file mode 100644
index 3d380f96ff55..000000000000
--- a/crates/polars-arrow/src/types/simd/packed.rs
+++ /dev/null
@@ -1,203 +0,0 @@
-pub use std::simd::prelude::{
-    f32x16, f32x8, f64x8, i16x32, i16x8, i32x16, i32x8, i64x8, i8x64, i8x8, mask32x16 as m32x16,
-    mask64x8 as m64x8, mask8x64 as m8x64, u16x32, u16x8, u32x16, u32x8, u64x8, u8x64, u8x8,
-    SimdPartialEq,
-};
-
-/// Vector of 32 16-bit masks
-#[allow(non_camel_case_types)]
-pub type m16x32 = std::simd::Mask<i16, 32>;
-
-use super::*;
-
-macro_rules! simd {
-    ($name:tt, $type:ty, $lanes:expr, $chunk:ty, $mask:tt) => {
-        unsafe impl NativeSimd for $name {
-            const LANES: usize = $lanes;
-            type Native = $type;
-            type Chunk = $chunk;
-            type Mask = $mask;
-
-            #[inline]
-            fn select(self, mask: $mask, default: Self) -> Self {
-                mask.select(self, default)
-            }
-
-            #[inline]
-            fn from_chunk(v: &[$type]) -> Self {
-                <$name>::from_slice(v)
-            }
-
-            #[inline]
-            fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
-                let mut a = [remaining; $lanes];
-                a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
-                <$name>::from_chunk(a.as_ref())
-            }
-
-            #[inline]
-            fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]) {
-                unsafe { values.align_to::<Self>() }
-            }
-        }
-    };
-}
-
-simd!(u8x64, u8, 64, u64, m8x64);
-simd!(u16x32, u16, 32, u32, m16x32);
-simd!(u32x16, u32, 16, u16, m32x16);
-simd!(u64x8, u64, 8, u8, m64x8);
-simd!(i8x64, i8, 64, u64, m8x64);
-simd!(i16x32, i16, 32, u32, m16x32);
-simd!(i32x16, i32, 16, u16, m32x16);
-simd!(i64x8, i64, 8, u8, m64x8);
-simd!(f32x16, f32, 16, u16, m32x16);
-simd!(f64x8, f64, 8, u8, m64x8);
-
-macro_rules! chunk_macro {
-    ($type:ty, $chunk:ty, $simd:ty, $mask:tt, $m:expr) => {
-        impl FromMaskChunk<$chunk> for $mask {
-            #[inline]
-            fn from_chunk(chunk: $chunk) -> Self {
-                ($m)(chunk)
-            }
-        }
-    };
-}
-
-chunk_macro!(u8, u64, u8x64, m8x64, from_chunk_u64);
-chunk_macro!(u16, u32, u16x32, m16x32, from_chunk_u32);
-chunk_macro!(u32, u16, u32x16, m32x16, from_chunk_u16);
-chunk_macro!(u64, u8, u64x8, m64x8, from_chunk_u8);
-
-#[inline]
-fn from_chunk_u8(chunk: u8) -> m64x8 {
-    let idx = u64x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]);
-    let vecmask = u64x8::splat(chunk as u64);
-
-    (idx & vecmask).simd_eq(idx)
-}
-
-#[inline]
-fn from_chunk_u16(chunk: u16) -> m32x16 {
-    let idx = u32x16::from_array([
-        1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    ]);
-    let vecmask = u32x16::splat(chunk as u32);
-
-    (idx & vecmask).simd_eq(idx)
-}
-
-#[inline]
-fn from_chunk_u32(chunk: u32) -> m16x32 {
-    let idx = u16x32::from_array([
-        1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 1, 2, 4, 8,
-        16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    ]);
-    let left = u16x32::from_chunk(&[
-        1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    ]);
-    let right = u16x32::from_chunk(&[
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
-        1024, 2048, 4096, 8192, 16384, 32768,
-    ]);
-
-    let a = chunk.to_le_bytes();
-    let a1 = u16::from_le_bytes([a[0], a[1]]);
-    let a2 = u16::from_le_bytes([a[2], a[3]]);
-
-    let vecmask1 = u16x32::splat(a1);
-    let vecmask2 = u16x32::splat(a2);
-
-    (idx & left & vecmask1).simd_eq(idx) | (idx & right & vecmask2).simd_eq(idx)
-}
-
-#[inline]
-fn from_chunk_u64(chunk: u64) -> m8x64 {
-    let idx = u8x64::from_array([
-        1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1,
-        2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
-        4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
-    ]);
-    let idxs = [
-        u8x64::from_chunk(&[
-            1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16,
-            32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128,
-            0, 0, 0, 0, 0, 0, 0, 0,
-        ]),
-        u8x64::from_chunk(&[
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
-            4, 8, 16, 32, 64, 128,
-        ]),
-    ];
-
-    let a = chunk.to_ne_bytes();
-
-    let mut result = m8x64::default();
-    for i in 0..8 {
-        result |= (idxs[i] & u8x64::splat(a[i])).simd_eq(idx)
-    }
-
-    result
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_basic1() {
-        let a = 0b00000000000000000000000000010001u32;
-        let a = from_chunk_u32(a);
-        for i in 0..8 {
-            assert_eq!(a.test(i), i % 4 == 0)
-        }
-        for i in 8..32 {
-            assert!(!a.test(i))
-        }
-    }
-
-    #[test]
-    fn test_basic2() {
-        let a = 0b0000000000000000000000000000000000000001000000010000000100000001u64;
-        let a = from_chunk_u64(a);
-        for i in 0..32 {
-            assert_eq!(a.test(i), i % 8 == 0)
-        }
-        for i in 32..64 {
-            assert!(!a.test(i))
-        }
-    }
-}
diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs
index 9c42dcb1e38e..53d1339a9a10 100644
--- a/crates/polars-compute/src/lib.rs
+++ b/crates/polars-compute/src/lib.rs
@@ -26,6 +26,7 @@ pub mod if_then_else;
 pub mod min_max;
 pub mod propagate_dictionary;
 pub mod size;
+pub mod sum;
 pub mod unique;
 pub mod var_cov;
 
@@ -39,3 +40,25 @@ impl<T: NativeType> NotSimdPrimitive for T {}
 impl NotSimdPrimitive for u128 {}
 #[cfg(feature = "simd")]
 impl NotSimdPrimitive for i128 {}
+
+// Trait to allow blanket impl for all SIMD types when simd is enabled.
+#[cfg(feature = "simd")]
+mod _simd_primitive {
+    use std::simd::SimdElement;
+    pub trait SimdPrimitive: SimdElement {}
+    impl SimdPrimitive for u8 {}
+    impl SimdPrimitive for u16 {}
+    impl SimdPrimitive for u32 {}
+    impl SimdPrimitive for u64 {}
+    impl SimdPrimitive for usize {}
+    impl SimdPrimitive for i8 {}
+    impl SimdPrimitive for i16 {}
+    impl SimdPrimitive for i32 {}
+    impl SimdPrimitive for i64 {}
+    impl SimdPrimitive for isize {}
+    impl SimdPrimitive for f32 {}
+    impl SimdPrimitive for f64 {}
+}
+
+#[cfg(feature = "simd")]
+pub use _simd_primitive::SimdPrimitive;
diff --git a/crates/polars-compute/src/sum.rs b/crates/polars-compute/src/sum.rs
new file mode 100644
index 000000000000..1179fe0366bf
--- /dev/null
+++ b/crates/polars-compute/src/sum.rs
@@ -0,0 +1,160 @@
+use std::ops::Add;
+#[cfg(feature = "simd")]
+use std::simd::prelude::*;
+
+use arrow::array::{Array, PrimitiveArray};
+use arrow::bitmap::bitmask::BitMask;
+use arrow::types::NativeType;
+use num_traits::Zero;
+
+macro_rules! wrapping_impl {
+    ($trait_name:ident, $method:ident, $t:ty) => {
+        impl $trait_name for $t {
+            #[inline(always)]
+            fn wrapping_add(&self, v: &Self) -> Self {
+                <$t>::$method(*self, *v)
+            }
+        }
+    };
+}
+
+/// Performs addition that wraps around on overflow.
+///
+/// Differs from num::WrappingAdd in that this is also implemented for floats.
+pub trait WrappingAdd: Sized {
+    /// Wrapping (modular) addition. Computes `self + other`, wrapping around at
+    /// the boundary of the type.
+    fn wrapping_add(&self, v: &Self) -> Self;
+}
+
+wrapping_impl!(WrappingAdd, wrapping_add, u8);
+wrapping_impl!(WrappingAdd, wrapping_add, u16);
+wrapping_impl!(WrappingAdd, wrapping_add, u32);
+wrapping_impl!(WrappingAdd, wrapping_add, u64);
+wrapping_impl!(WrappingAdd, wrapping_add, usize);
+wrapping_impl!(WrappingAdd, wrapping_add, u128);
+
+wrapping_impl!(WrappingAdd, wrapping_add, i8);
+wrapping_impl!(WrappingAdd, wrapping_add, i16);
+wrapping_impl!(WrappingAdd, wrapping_add, i32);
+wrapping_impl!(WrappingAdd, wrapping_add, i64);
+wrapping_impl!(WrappingAdd, wrapping_add, isize);
+wrapping_impl!(WrappingAdd, wrapping_add, i128);
+
+wrapping_impl!(WrappingAdd, add, f32);
+wrapping_impl!(WrappingAdd, add, f64);
+
+#[cfg(feature = "simd")]
+const STRIPE: usize = 16;
+
+fn wrapping_sum_with_mask_scalar<T: Zero + WrappingAdd + Copy>(vals: &[T], mask: &BitMask) -> T {
+    assert!(vals.len() == mask.len());
+    vals.iter()
+        .enumerate()
+        .map(|(i, x)| {
+            // No filter but rather select of 0 for cmov opt.
+            if mask.get(i) {
+                *x
+            } else {
+                T::zero()
+            }
+        })
+        .fold(T::zero(), |a, b| a.wrapping_add(&b))
+}
+
+#[cfg(not(feature = "simd"))]
+impl<T> WrappingSum for T
+where
+    T: NativeType + WrappingAdd + Zero,
+{
+    fn wrapping_sum(vals: &[Self]) -> Self {
+        vals.iter()
+            .copied()
+            .fold(T::zero(), |a, b| a.wrapping_add(&b))
+    }
+
+    fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self {
+        wrapping_sum_with_mask_scalar(vals, mask)
+    }
+}
+
+#[cfg(feature = "simd")]
+impl<T> WrappingSum for T
+where
+    T: NativeType + WrappingAdd + Zero + crate::SimdPrimitive,
+{
+    fn wrapping_sum(vals: &[Self]) -> Self {
+        vals.iter()
+            .copied()
+            .fold(T::zero(), |a, b| a.wrapping_add(&b))
+    }
+
+    fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self {
+        assert!(vals.len() == mask.len());
+        let remainder = vals.len() % STRIPE;
+        let (rest, main) = vals.split_at(remainder);
+        let (rest_mask, main_mask) = mask.split_at(remainder);
+        let zero: Simd<T, STRIPE> = Simd::default();
+
+        let vsum = main
+            .chunks_exact(STRIPE)
+            .enumerate()
+            .map(|(i, a)| {
+                let m: Mask<_, STRIPE> = main_mask.get_simd(i * STRIPE);
+                m.select(Simd::from_slice(a), zero)
+            })
+            .fold(zero, |a, b| {
+                let a = a.to_array();
+                let b = b.to_array();
+                Simd::from_array(std::array::from_fn(|i| a[i].wrapping_add(&b[i])))
+            });
+
+        let mainsum = vsum
+            .to_array()
+            .into_iter()
+            .fold(T::zero(), |a, b| a.wrapping_add(&b));
+
+        // TODO: faster remainder.
+        let restsum = wrapping_sum_with_mask_scalar(rest, &rest_mask);
+        mainsum.wrapping_add(&restsum)
+    }
+}
+
+#[cfg(feature = "simd")]
+impl WrappingSum for u128 {
+    fn wrapping_sum(vals: &[Self]) -> Self {
+        vals.iter().copied().fold(0, |a, b| a.wrapping_add(b))
+    }
+
+    fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self {
+        wrapping_sum_with_mask_scalar(vals, mask)
+    }
+}
+
+#[cfg(feature = "simd")]
+impl WrappingSum for i128 {
+    fn wrapping_sum(vals: &[Self]) -> Self {
+        vals.iter().copied().fold(0, |a, b| a.wrapping_add(b))
+    }
+
+    fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self {
+        wrapping_sum_with_mask_scalar(vals, mask)
+    }
+}
+
+pub trait WrappingSum: Sized {
+    fn wrapping_sum(vals: &[Self]) -> Self;
+    fn wrapping_sum_with_validity(vals: &[Self], mask: &BitMask) -> Self;
+}
+
+pub fn wrapping_sum_arr<T>(arr: &PrimitiveArray<T>) -> T
+where
+    T: NativeType + WrappingSum,
+{
+    let validity = arr.validity().filter(|_| arr.null_count() > 0);
+    if let Some(mask) = validity {
+        WrappingSum::wrapping_sum_with_validity(arr.values(), &BitMask::from_bitmap(mask))
+    } else {
+        WrappingSum::wrapping_sum(arr.values())
+    }
+}
diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs
index ef5fd105203d..9829d6aa0832 100644
--- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs
+++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs
@@ -2,14 +2,11 @@
 mod quantile;
 mod var;
 
-use std::ops::Add;
-
-use arrow::compute;
-use arrow::types::simd::Simd;
 use arrow::types::NativeType;
 use num_traits::{Float, One, ToPrimitive, Zero};
 use polars_compute::float_sum;
 use polars_compute::min_max::MinMaxKernel;
+use polars_compute::sum::{wrapping_sum_arr, WrappingSum};
 use polars_utils::min_max::MinMax;
 use polars_utils::sync::SyncPtr;
 pub use quantile::*;
@@ -45,8 +42,7 @@ pub trait ChunkAggSeries {
 
 fn sum<T>(array: &PrimitiveArray<T>) -> T
 where
-    T: NumericNative + NativeType,
-    <T as Simd>::Simd: Add<Output = <T as Simd>::Simd> + compute::aggregate::Sum<T>,
+    T: NumericNative + NativeType + WrappingSum,
 {
     if array.null_count() == array.len() {
         return T::default();
@@ -69,16 +65,15 @@ where
             }
         }
     } else {
-        compute::aggregate::sum_primitive(array).unwrap_or(T::zero())
+        wrapping_sum_arr(array)
     }
 }
 
 impl<T> ChunkAgg<T::Native> for ChunkedArray<T>
 where
     T: PolarsNumericType,
+    T::Native: WrappingSum,
     PrimitiveArray<T::Native>: for<'a> MinMaxKernel<Scalar<'a> = T::Native>,
-    <T::Native as Simd>::Simd:
-        Add<Output = <T::Native as Simd>::Simd> + compute::aggregate::Sum<T::Native>,
 {
     fn sum(&self) -> Option<T::Native> {
         Some(
@@ -270,9 +265,8 @@ impl BooleanChunked {
 impl<T> ChunkAggSeries for ChunkedArray<T>
 where
     T: PolarsNumericType,
+    T::Native: WrappingSum,
     PrimitiveArray<T::Native>: for<'a> MinMaxKernel<Scalar<'a> = T::Native>,
-    <T::Native as Simd>::Simd:
-        Add<Output = <T::Native as Simd>::Simd> + compute::aggregate::Sum<T::Native>,
     ChunkedArray<T>: IntoSeries,
 {
     fn sum_reduce(&self) -> Scalar {
@@ -345,9 +339,7 @@ impl VarAggSeries for Float64Chunked {
 impl<T> QuantileAggSeries for ChunkedArray<T>
 where
     T: PolarsIntegerType,
-    T::Native: Ord,
-    <T::Native as Simd>::Simd:
-        Add<Output = <T::Native as Simd>::Simd> + compute::aggregate::Sum<T::Native>,
+    T::Native: Ord + WrappingSum,
 {
     fn quantile_reduce(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<Scalar> {
         let v = self.quantile(quantile, method)?;
diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs
index 4c4c726037eb..1d6196e167d7 100644
--- a/crates/polars-core/src/datatypes/mod.rs
+++ b/crates/polars-core/src/datatypes/mod.rs
@@ -30,7 +30,6 @@ pub use arrow::datatypes::reshape::*;
 #[cfg(feature = "dtype-categorical")]
 use arrow::datatypes::IntegerType;
 pub use arrow::datatypes::{ArrowDataType, TimeUnit as ArrowTimeUnit};
-use arrow::types::simd::Simd;
 use arrow::types::NativeType;
 use bytemuck::Zeroable;
 pub use dtype::*;
@@ -331,7 +330,7 @@ pub trait NumericNative:
     + NumCast
     + Zero
     + One
-    + Simd
+    // + Simd
     // + Simd8
     + std::iter::Sum<Self>
     + Add<Output = Self>
diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs
index 8b1374c504d3..0be644ed28ff 100644
--- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs
+++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs
@@ -2,12 +2,10 @@ use std::any::Any;
 use std::ops::Add;
 
 use arrow::array::{Array, PrimitiveArray};
-use arrow::compute::aggregate::Sum;
-use arrow::types::simd::Simd;
+use polars_compute::sum::{wrapping_sum_arr, WrappingSum};
 use polars_core::export::arrow::datatypes::PrimitiveType;
 use polars_core::export::num::NumCast;
 use polars_core::prelude::*;
-use polars_core::utils::arrow::compute::aggregate::sum_primitive;
 
 use super::*;
 
@@ -28,8 +26,7 @@ impl<K: NumericNative> MeanAgg<K> {
 impl<K> AggregateFn for MeanAgg<K>
 where
     K::PolarsType: PolarsNumericType,
-    K: NumericNative + Add<Output = K>,
-    <K as Simd>::Simd: Add<Output = <K as Simd>::Simd> + Sum<K>,
+    K: NumericNative + Add<Output = K> + WrappingSum,
 {
     fn has_physical_agg(&self) -> bool {
         true
@@ -83,16 +80,15 @@ where
                 .downcast_ref::<PrimitiveArray<K>>()
                 .unwrap_unchecked()
         };
-        match (sum_primitive(arr), self.sum) {
-            (Some(val), Some(sum)) => {
+        match (wrapping_sum_arr(arr), self.sum) {
+            (val, Some(sum)) => {
                 self.sum = Some(sum + val);
                 self.count += (arr.len() - arr.null_count()) as IdxSize;
             },
-            (Some(val), None) => {
+            (val, None) => {
                 self.sum = Some(val);
                 self.count += (arr.len() - arr.null_count()) as IdxSize;
             },
-            _ => {},
         }
     }
 
diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs
index 202ec3969dd5..a1c42029b332 100644
--- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs
+++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs
@@ -2,11 +2,9 @@ use std::any::Any;
 use std::ops::Add;
 
 use arrow::array::PrimitiveArray;
-use arrow::compute::aggregate::Sum;
-use arrow::types::simd::Simd;
+use polars_compute::sum::{wrapping_sum_arr, WrappingSum};
 use polars_core::export::num::NumCast;
 use polars_core::prelude::*;
-use polars_core::utils::arrow::compute::aggregate::sum_primitive;
 
 use super::*;
 
@@ -23,8 +21,7 @@ impl<K: NumericNative> SumAgg<K> {
 impl<K> AggregateFn for SumAgg<K>
 where
     K::PolarsType: PolarsNumericType,
-    K: NumericNative + Add<Output = K>,
-    <K as Simd>::Simd: Add<Output = <K as Simd>::Simd> + Sum<K>,
+    K: NumericNative + Add<Output = K> + WrappingSum,
 {
     fn has_physical_agg(&self) -> bool {
         true
@@ -61,14 +58,13 @@ where
                 .downcast_ref::<PrimitiveArray<K>>()
                 .unwrap_unchecked()
         };
-        match (sum_primitive(arr), self.sum) {
-            (Some(val), Some(sum)) => {
+        match (wrapping_sum_arr(arr), self.sum) {
+            (val, Some(sum)) => {
                 self.sum = Some(sum + val);
             },
-            (Some(val), None) => {
+            (val, None) => {
                 self.sum = Some(val);
             },
-            _ => {},
         }
     }
 
diff --git a/crates/polars/tests/it/arrow/compute/aggregate/mod.rs b/crates/polars/tests/it/arrow/compute/aggregate/mod.rs
index d7de8a8c37c5..2363540356a2 100644
--- a/crates/polars/tests/it/arrow/compute/aggregate/mod.rs
+++ b/crates/polars/tests/it/arrow/compute/aggregate/mod.rs
@@ -1,2 +1 @@
 mod memory;
-mod sum;
diff --git a/crates/polars/tests/it/arrow/compute/aggregate/sum.rs b/crates/polars/tests/it/arrow/compute/aggregate/sum.rs
deleted file mode 100644
index 011f75aad356..000000000000
--- a/crates/polars/tests/it/arrow/compute/aggregate/sum.rs
+++ /dev/null
@@ -1,37 +0,0 @@
-use arrow::array::*;
-use arrow::compute::aggregate::{sum, sum_primitive};
-use arrow::datatypes::ArrowDataType;
-use arrow::scalar::{PrimitiveScalar, Scalar};
-
-#[test]
-fn test_primitive_array_sum() {
-    let a = Int32Array::from_slice([1, 2, 3, 4, 5]);
-    assert_eq!(
-        &PrimitiveScalar::<i32>::from(Some(15)) as &dyn Scalar,
-        sum(&a).unwrap().as_ref()
-    );
-
-    let a = a.to(ArrowDataType::Date32);
-    assert_eq!(
-        &PrimitiveScalar::<i32>::from(Some(15)).to(ArrowDataType::Date32) as &dyn Scalar,
-        sum(&a).unwrap().as_ref()
-    );
-}
-
-#[test]
-fn test_primitive_array_float_sum() {
-    let a = Float64Array::from_slice([1.1f64, 2.2, 3.3, 4.4, 5.5]);
-    assert!((16.5 - sum_primitive(&a).unwrap()).abs() < f64::EPSILON);
-}
-
-#[test]
-fn test_primitive_array_sum_with_nulls() {
-    let a = Int32Array::from(&[None, Some(2), Some(3), None, Some(5)]);
-    assert_eq!(10, sum_primitive(&a).unwrap());
-}
-
-#[test]
-fn test_primitive_array_sum_all_nulls() {
-    let a = Int32Array::from(&[None, None, None]);
-    assert_eq!(None, sum_primitive(&a));
-}