From df3a63906c44b23de7065d60c20bf99e2571ccc8 Mon Sep 17 00:00:00 2001
From: miguel raz
Date: Fri, 4 Jun 2021 14:24:47 -0500
Subject: [PATCH 1/7] add dot_product example

---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 crates/core_simd/examples/dot_product.rs

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
new file mode 100644
index 0000000000000..812b0b23eebff
--- /dev/null
+++ b/crates/core_simd/examples/dot_product.rs
@@ -0,0 +1,31 @@
+// Code taken from the `packed_simd` crate
+// Run this code with `cargo test --example dot_product`
+#![feature(array_chunks)]
+use core_simd::*;
+
+pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| (a * b).horizontal_sum())
+        .sum()
+}
+
+fn main() {
+    // Empty main to make cargo happy
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn test() {
+        use super::*;
+        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
+
+        assert_eq!(0.0, dot_prod(&a, &b));
+    }
+}

From c08a4d1f10473bfbdddf3d2eefc40e1194a633a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Sat, 26 Mar 2022 14:04:37 -0600
Subject: [PATCH 2/7] add more basic dot products and comments, README

---
 crates/core_simd/examples/README.md      | 19 ++++++++++++++++
 crates/core_simd/examples/dot_product.rs | 29 +++++++++++++++++++++---
 2 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 crates/core_simd/examples/README.md

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
new file mode 100644
index 0000000000000..b37dffa8eaab3
--- /dev/null
+++ b/crates/core_simd/examples/README.md
@@ -0,0 +1,19 @@
+### `stdsimd` examples
+
+This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate.
+
+The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and how to iterate on a design for performance.
+
+Run the tests with the command
+
+```
+cargo test --example dot_product
+```
+
+and the benchmarks via the command
+
+```
+cargo run --example --benchmark ???
+```
+
+and measure the timings on your local system.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 812b0b23eebff..3e415fc4471dc 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -3,7 +3,27 @@
 #![feature(array_chunks)]
 use core_simd::*;
 
-pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+/// This is your barebones dot product implementation:
+/// Take 2 vectors, multiply them element wise and *then*
+/// add up the result. In the next example we will see if there
+/// is any difference to adding as we go along multiplying.
+pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    a.iter()
+        .zip(b.iter())
+        .map(|a, b| a * b)
+        .sum()
+}
+
+pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    a.iter()
+        .zip(b.iter())
+        .fold(0.0, |a, b| a * b)
+}
+
+pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     // TODO handle remainder when a.len() % 4 != 0
@@ -21,11 +41,14 @@ fn main() {
 #[cfg(test)]
 mod tests {
     #[test]
-    fn test() {
+    fn smoke_test() {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
 
-        assert_eq!(0.0, dot_prod(&a, &b));
+        assert_eq!(0.0, dot_prod_0(&a, &b));
+        assert_eq!(0.0, dot_prod_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_0(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_1(&a, &b));
     }
 }

From 4615805ec2ce44c37792df3b5b179a795f57542b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Sat, 26 Mar 2022 16:10:25 -0600
Subject: [PATCH 3/7] add remainder dot_product and cleanup

cleanup dot_product and README.md
---
 crates/core_simd/examples/README.md      |   8 +-
 crates/core_simd/examples/dot_product.rs | 106 ++++++++++++++++++++---
 2 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
index b37dffa8eaab3..82747f1b5a6f9 100644
--- a/crates/core_simd/examples/README.md
+++ b/crates/core_simd/examples/README.md
@@ -10,10 +10,4 @@ Run the tests with the command
 cargo test --example dot_product
 ```
 
-and the benchmarks via the command
-
-```
-cargo run --example --benchmark ???
-```
-
-and measure the timings on your local system.
+and verify the code for `dot_product.rs` on your machine.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 3e415fc4471dc..ed210192e2a4b 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -1,39 +1,113 @@
 // Code taken from the `packed_simd` crate
 // Run this code with `cargo test --example dot_product`
+//use std::iter::zip;
+
 #![feature(array_chunks)]
+#![feature(slice_as_chunks)]
+// Add these imports to use the stdsimd library
+#![feature(portable_simd)]
 use core_simd::*;
 
-/// This is your barebones dot product implementation:
-/// Take 2 vectors, multiply them element wise and *then*
-/// add up the result. In the next example we will see if there
-/// is any difference to adding as we go along multiplying.
+// This is your barebones dot product implementation:
+// Take 2 vectors, multiply them element wise and *then*
+// go along the resulting array and add up the result.
+// In the next example we will see if there
+// is any difference to adding and multiplying in tandem.
 pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
-    a.iter()
-        .zip(b.iter())
-        .map(|a, b| a * b)
-        .sum()
+    a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
 }
 
+// When dealing with SIMD, it is very important to think about the amount
+// of data movement and when it happens. We're going over simple computation examples here, and yet
+// it is not trivial to understand what may or may not contribute to performance
+// changes. Eventually, you will need tools to inspect the generated assembly and confirm your
+// hypothesis and benchmarks - we will mention them later on.
+// With the use of `fold`, we're doing a multiplication,
+// and then adding it to the sum, one element from both vectors at a time.
 pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
     a.iter()
         .zip(b.iter())
-        .fold(0.0, |a, b| a * b)
+        .fold(0.0, |a, zipped| a + zipped.0 * zipped.1)
 }
 
+// We now move on to the SIMD implementations: notice the following constructs:
+// `array_chunks::<4>`: mapping this over the vector will let us construct SIMD vectors
+// `f32x4::from_array`: construct the SIMD vector from a slice
+// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
+// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
+// and then adds those with `sum()`. This is suboptimal.
+// TODO: ASCII diagrams
 pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
-
     // TODO handle remainder when a.len() % 4 != 0
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .map(|(a, b)| (a * b).horizontal_sum())
+        .map(|(a, b)| (a * b).reduce_sum())
         .sum()
 }
 
+// There are some simple ways to improve the previous code:
+// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into
+//    so that there is only one `sum()` reduction when the last `f32x4` has been processed.
+// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduction
+//    happen in the same step.
+// If the arrays are large, minimizing the data shuffling will lead to great perf.
+// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4
+// can become a problem.
+pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
+        .reduce_sum()
+}
+
+// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are
+// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for.
+use std_float::StdFloat;
+pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    let mut res = f32x4::splat(0.0);
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .for_each(|(a, b)| {
+            res = a.mul_add(b, res);
+        });
+    res.reduce_sum()
+}
+
+// Finally, we will write the same operation but handling the loop remainder.
+const LANES: usize = 4;
+pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    let (a_extra, a_chunks) = a.as_rchunks();
+    let (b_extra, b_chunks) = b.as_rchunks();
+
+    // These are always true, but for emphasis:
+    assert_eq!(a_chunks.len(), b_chunks.len());
+    assert_eq!(a_extra.len(), b_extra.len());
+
+    let mut sums = [0.0; LANES];
+    for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
+        *d = x * y;
+    }
+
+    let mut sums = f32x4::from_array(sums);
+    std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
+        sums += f32x4::from_array(*x) * f32x4::from_array(*y);
+    });
+
+    sums.reduce_sum()
+}
 fn main() {
     // Empty main to make cargo happy
 }
@@ -45,10 +119,18 @@ mod tests {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
-
+        let x: Vec<f32> = [0.5; 1003].to_vec();
+        let y: Vec<f32> = [2.0; 1003].to_vec();
+
+        // Basic check
         assert_eq!(0.0, dot_prod_0(&a, &b));
         assert_eq!(0.0, dot_prod_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_0(&a, &b));
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_2(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+
+        // We can handle vectors that are non-multiples of 4
+        assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
     }
 }

From 4ddfd2f3f8c547fa7c42a0f9a5979665262a30c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Tue, 29 Mar 2022 16:52:54 -0600
Subject: [PATCH 4/7] non allocating fold simd

allocating fold with std::ops::Add::add
---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index ed210192e2a4b..75d628ee39223 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -108,6 +108,37 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
 
     sums.reduce_sum()
 }
+
+// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
+// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
+// next example.
+pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
+    let mut sum = a
+        .array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| a * b)
+        .fold(f32x4::splat(0.0), std::ops::Add::add)
+        .reduce_sum();
+    let remain = a.len() - (a.len() % 4);
+    sum += a[remain..]
+        .iter()
+        .zip(&b[remain..])
+        .map(|(a, b)| a * b)
+        .sum::<f32>();
+    sum
+}
+
+// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
+// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
+pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .reduce_sum()
+}
+
 fn main() {
     // Empty main to make cargo happy
 }

From aeac9ed37339c463a6a155b12135b7f167611e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Tue, 29 Mar 2022 17:36:47 -0600
Subject: [PATCH 5/7] proper mul_add arg order, added tests

---
 crates/core_simd/examples/dot_product.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 75d628ee39223..84824c2e5c4a9 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -135,7 +135,7 @@ pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc))
         .reduce_sum()
 }
 
@@ -160,6 +160,8 @@ mod tests {
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_2(&a, &b));
         assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_4(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_5(&a, &b));
 
         // We can handle vectors that are non-multiples of 4
         assert_eq!(1003.0, dot_prod_simd_3(&x, &y));

From 64247a327d30a2d5fe7ad3d98f527bff1cc8fb85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
Date: Wed, 30 Mar 2022 17:45:59 -0600
Subject: [PATCH 6/7] add _scalar names for dot_product examples

---
 crates/core_simd/examples/dot_product.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 84824c2e5c4a9..936741a2ceb63 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -13,7 +13,7 @@ use core_simd::*;
 // go along the resulting array and add up the result.
 // In the next example we will see if there
 // is any difference to adding and multiplying in tandem.
-pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
@@ -26,7 +26,7 @@ pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
 // hypothesis and benchmarks - we will mention them later on.
 // With the use of `fold`, we're doing a multiplication,
 // and then adding it to the sum, one element from both vectors at a time.
-pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
-pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 { +pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); a.iter() .zip(b.iter()) @@ -154,8 +154,8 @@ mod tests { let y: Vec = [2.0; 1003].to_vec(); // Basic check - assert_eq!(0.0, dot_prod_0(&a, &b)); - assert_eq!(0.0, dot_prod_1(&a, &b)); + assert_eq!(0.0, dot_prod_scalar_0(&a, &b)); + assert_eq!(0.0, dot_prod_scalar_1(&a, &b)); assert_eq!(0.0, dot_prod_simd_0(&a, &b)); assert_eq!(0.0, dot_prod_simd_1(&a, &b)); assert_eq!(0.0, dot_prod_simd_2(&a, &b)); From da3bd6d3a04f84ebc7fc6314f2e1f8a74e379018 Mon Sep 17 00:00:00 2001 From: The Atelier Date: Sat, 3 Dec 2022 18:40:07 -0800 Subject: [PATCH 7/7] Update dot_product example import --- crates/core_simd/examples/dot_product.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs index 936741a2ceb63..391f08f55a07a 100644 --- a/crates/core_simd/examples/dot_product.rs +++ b/crates/core_simd/examples/dot_product.rs @@ -6,7 +6,7 @@ #![feature(slice_as_chunks)] // Add these imports to use the stdsimd library #![feature(portable_simd)] -use core_simd::*; +use core_simd::simd::*; // This is your barebones dot product implementation: // Take 2 vectors, multiply them element wise and *then*