From df3a63906c44b23de7065d60c20bf99e2571ccc8 Mon Sep 17 00:00:00 2001
From: miguel raz
Date: Fri, 4 Jun 2021 14:24:47 -0500
Subject: [PATCH 1/7] add dot_product example

---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 crates/core_simd/examples/dot_product.rs

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
new file mode 100644
index 0000000000000..812b0b23eebff
--- /dev/null
+++ b/crates/core_simd/examples/dot_product.rs
@@ -0,0 +1,31 @@
+// Code taken from the `packed_simd` crate
+// Run this code with `cargo test --example dot_product`
+#![feature(array_chunks)]
+use core_simd::*;
+
+pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| (a * b).horizontal_sum())
+        .sum()
+}
+
+fn main() {
+    // Empty main to make cargo happy
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn test() {
+        use super::*;
+        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
+
+        assert_eq!(0.0, dot_prod(&a, &b));
+    }
+}

From c08a4d1f10473bfbdddf3d2eefc40e1194a633a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Sat, 26 Mar 2022 14:04:37 -0600
Subject: [PATCH 2/7] add more basic dot products and comments, README

---
 crates/core_simd/examples/README.md      | 19 ++++++++++++++++
 crates/core_simd/examples/dot_product.rs | 29 +++++++++++++++++++++---
 2 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 crates/core_simd/examples/README.md

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
new file mode 100644
index 0000000000000..b37dffa8eaab3
--- /dev/null
+++ b/crates/core_simd/examples/README.md
@@ -0,0 +1,19 @@
+### `stdsimd` examples
+
+This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate.
+
+The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and how to iterate on a design for performance.
+
+Run the tests with the command
+
+```
+cargo test --example dot_product
+```
+
+and the benchmarks via the command
+
+```
+cargo run --example --benchmark ???
+```
+
+and measure the timings on your local system.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 812b0b23eebff..3e415fc4471dc 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -3,7 +3,27 @@
 #![feature(array_chunks)]
 use core_simd::*;
 
-pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+/// This is your barebones dot product implementation:
+/// Take 2 vectors, multiply them element wise and *then*
+/// add up the result. In the next example we will see if there
+/// is any difference to adding as we go along multiplying.
+pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    a.iter()
+        .zip(b.iter())
+        .map(|a, b| a * b)
+        .sum()
+}
+
+pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    a.iter()
+        .zip(b.iter())
+        .fold(0.0, |a, b| a * b)
+}
+
+pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     // TODO handle remainder when a.len() % 4 != 0
@@ -21,11 +41,14 @@ fn main() {
 #[cfg(test)]
 mod tests {
     #[test]
-    fn test() {
+    fn smoke_test() {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
 
-        assert_eq!(0.0, dot_prod(&a, &b));
+        assert_eq!(0.0, dot_prod_0(&a, &b));
+        assert_eq!(0.0, dot_prod_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_0(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_1(&a, &b));
     }
 }

From 4615805ec2ce44c37792df3b5b179a795f57542b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Sat, 26 Mar 2022 16:10:25 -0600
Subject: [PATCH 3/7] add remainder dot_product and cleanup

cleanup dot_product and README.md
---
 crates/core_simd/examples/README.md      |   8 +-
 crates/core_simd/examples/dot_product.rs | 106 ++++++++++++++++++++---
 2 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
index b37dffa8eaab3..82747f1b5a6f9 100644
--- a/crates/core_simd/examples/README.md
+++ b/crates/core_simd/examples/README.md
@@ -10,10 +10,4 @@ Run the tests with the command
 cargo test --example dot_product
 ```
 
-and the benchmarks via the command
-
-```
-cargo run --example --benchmark ???
-```
-
-and measure the timings on your local system.
+and verify the code for `dot_product.rs` on your machine.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 3e415fc4471dc..ed210192e2a4b 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -1,39 +1,113 @@
 // Code taken from the `packed_simd` crate
 // Run this code with `cargo test --example dot_product`
+//use std::iter::zip;
+
 #![feature(array_chunks)]
+#![feature(slice_as_chunks)]
+// Add these imports to use the stdsimd library
+#![feature(portable_simd)]
 use core_simd::*;
 
-/// This is your barebones dot product implementation:
-/// Take 2 vectors, multiply them element wise and *then*
-/// add up the result. In the next example we will see if there
-/// is any difference to adding as we go along multiplying.
+// This is your barebones dot product implementation:
+// Take 2 vectors, multiply them element wise and *then*
+// go along the resulting array and add up the result.
+// In the next example we will see if there
+// is any difference to adding and multiplying in tandem.
 pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
-    a.iter()
-        .zip(b.iter())
-        .map(|a, b| a * b)
-        .sum()
+    a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
 }
 
+// When dealing with SIMD, it is very important to think about the amount
+// of data movement and when it happens. We're going over simple computation examples here, and yet
+// it is not trivial to understand what may or may not contribute to performance
+// changes. Eventually, you will need tools to inspect the generated assembly and confirm your
+// hypothesis and benchmarks - we will mention them later on.
+// With the use of `fold`, we're doing a multiplication,
+// and then adding it to the sum, one element from both vectors at a time.
 pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
     a.iter()
         .zip(b.iter())
-        .fold(0.0, |a, b| a * b)
+        .fold(0.0, |a, zipped| a + zipped.0 * zipped.1)
 }
 
+// We now move on to the SIMD implementations: notice the following constructs:
+// `array_chunks::<4>`: mapping this over the vector will let us construct SIMD vectors
+// `f32x4::from_array`: construct the SIMD vector from a slice
+// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
+// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
+// and then adds those with `sum()`. This is suboptimal.
+// TODO: ASCII diagrams
 pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
-
     // TODO handle remainder when a.len() % 4 != 0
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .map(|(a, b)| (a * b).horizontal_sum())
+        .map(|(a, b)| (a * b).reduce_sum())
         .sum()
 }
 
+// There are some simple ways to improve the previous code:
+// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into
+//    so that there is only one `sum()` reduction when the last `f32x4` has been processed.
+// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduction
+//    happen in the same step.
+// If the arrays are large, minimizing the data shuffling will lead to great perf.
+// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4
+// can become a problem.
+pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
+        .reduce_sum()
+}
+
+// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are
+// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for.
+use std_float::StdFloat;
+pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    let mut res = f32x4::splat(0.0);
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .for_each(|(a, b)| {
+            res = a.mul_add(b, res);
+        });
+    res.reduce_sum()
+}
+
+// Finally, we will write the same operation but handling the loop remainder.
+const LANES: usize = 4;
+pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    let (a_extra, a_chunks) = a.as_rchunks();
+    let (b_extra, b_chunks) = b.as_rchunks();
+
+    // These are always true, but for emphasis:
+    assert_eq!(a_chunks.len(), b_chunks.len());
+    assert_eq!(a_extra.len(), b_extra.len());
+
+    let mut sums = [0.0; LANES];
+    for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
+        *d = x * y;
+    }
+
+    let mut sums = f32x4::from_array(sums);
+    std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
+        sums += f32x4::from_array(*x) * f32x4::from_array(*y);
+    });
+
+    sums.reduce_sum()
+}
 fn main() {
     // Empty main to make cargo happy
 }
@@ -45,10 +119,18 @@ mod tests {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
-
+        let x: Vec<f32> = [0.5; 1003].to_vec();
+        let y: Vec<f32> = [2.0; 1003].to_vec();
+
+        // Basic check
         assert_eq!(0.0, dot_prod_0(&a, &b));
         assert_eq!(0.0, dot_prod_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_0(&a, &b));
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_2(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+
+        // We can handle vectors that are non-multiples of 4
+        assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
     }
 }

From 4ddfd2f3f8c547fa7c42a0f9a5979665262a30c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Tue, 29 Mar 2022 16:52:54 -0600
Subject: [PATCH 4/7] non allocating fold simd

allocating fold with std::ops::Add::add
---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index ed210192e2a4b..75d628ee39223 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -108,6 +108,37 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
 
     sums.reduce_sum()
 }
+
+// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
+// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
+// next example.
+pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
+    let mut sum = a
+        .array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| a * b)
+        .fold(f32x4::splat(0.0), std::ops::Add::add)
+        .reduce_sum();
+    let remain = a.len() - (a.len() % 4);
+    sum += a[remain..]
+        .iter()
+        .zip(&b[remain..])
+        .map(|(a, b)| a * b)
+        .sum::<f32>();
+    sum
+}
+
+// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
+// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
+pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .reduce_sum()
+}
+
 fn main() {
     // Empty main to make cargo happy
 }

From aeac9ed37339c463a6a155b12135b7f167611e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Tue, 29 Mar 2022 17:36:47 -0600
Subject: [PATCH 5/7] proper mul_add arg order, added tests

---
 crates/core_simd/examples/dot_product.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 75d628ee39223..84824c2e5c4a9 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -135,7 +135,7 @@ pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc))
         .reduce_sum()
 }
 
@@ -160,6 +160,8 @@ mod tests {
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_2(&a, &b));
         assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_4(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_5(&a, &b));
 
         // We can handle vectors that are non-multiples of 4
         assert_eq!(1003.0, dot_prod_simd_3(&x, &y));

From 64247a327d30a2d5fe7ad3d98f527bff1cc8fb85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Wed, 30 Mar 2022 17:45:59 -0600
Subject: [PATCH 6/7] add _scalar names for dot_product examples

---
 crates/core_simd/examples/dot_product.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 84824c2e5c4a9..936741a2ceb63 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -13,7 +13,7 @@ use core_simd::*;
 // go along the resulting array and add up the result.
 // In the next example we will see if there
 // is any difference to adding and multiplying in tandem.
-pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
@@ -26,7 +26,7 @@ pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
 // hypothesis and benchmarks - we will mention them later on.
 // With the use of `fold`, we're doing a multiplication,
 // and then adding it to the sum, one element from both vectors at a time.
-pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
-pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 { +pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); a.iter() .zip(b.iter()) @@ -154,8 +154,8 @@ mod tests { let y: Vec = [2.0; 1003].to_vec(); // Basic check - assert_eq!(0.0, dot_prod_0(&a, &b)); - assert_eq!(0.0, dot_prod_1(&a, &b)); + assert_eq!(0.0, dot_prod_scalar_0(&a, &b)); + assert_eq!(0.0, dot_prod_scalar_1(&a, &b)); assert_eq!(0.0, dot_prod_simd_0(&a, &b)); assert_eq!(0.0, dot_prod_simd_1(&a, &b)); assert_eq!(0.0, dot_prod_simd_2(&a, &b)); From da3bd6d3a04f84ebc7fc6314f2e1f8a74e379018 Mon Sep 17 00:00:00 2001 From: The Atelier Date: Sat, 3 Dec 2022 18:40:07 -0800 Subject: [PATCH 7/7] Update dot_product example import --- crates/core_simd/examples/dot_product.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs index 936741a2ceb63..391f08f55a07a 100644 --- a/crates/core_simd/examples/dot_product.rs +++ b/crates/core_simd/examples/dot_product.rs @@ -6,7 +6,7 @@ #![feature(slice_as_chunks)] // Add these imports to use the stdsimd library #![feature(portable_simd)] -use core_simd::*; +use core_simd::simd::*; // This is your barebones dot product implementation: // Take 2 vectors, multiply them element wise and *then*