[faxpy] added unrolled kernel for axpy to improve performance

Performance achieved for kernels: 1) axpy_4096 : 41.9% 2) dotp_4096 : 48.3% 3) matmul_64x64x64 : 97.8%
pulp-platform · Jan 3, 2025 · 61f4164 · 61f4164
1 parent 2bc3350
commit 61f4164
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 1 deletion.
diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
@@ -45,6 +45,54 @@ void faxpy_v64b(const double a, const double *x, const double *y,
   } while (avl > 0);
 }
 
+// Unrolled 64-bit AXPY: y = a * x + y
+//
+// Processes up to two m8 register groups (v0/v8 and v16/v24) per loop
+// iteration, so that the loads of the second chunk are issued before the
+// first chunk is stored.
+//   a   : scalar multiplier
+//   x   : input vector, avl doubles
+//   y   : input/output vector, avl doubles (the results are stored back
+//         through this pointer by the vse64.v instructions)
+//   avl : number of elements to process
+void faxpy_v64b_unrl(const double a, const double *x, const double *y,
+                unsigned int avl) {
+  unsigned int vl0; // Vector length of the first chunk (v0/v8)
+  unsigned int vl1; // Vector length of the second chunk (v16/v24), 0 if none
+  unsigned int vl;  // Scratch destination when re-programming vl
+
+  // Stripmine and accumulate up to two partial vectors per iteration
+  do {
+    // Start of the second chunk in y; only meaningful when vl1 > 0
+    const double *y1 = y;
+
+    // Set the vl for the first chunk
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl0) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle64.v v0, (%0)" ::"r"(x));
+    asm volatile("vle64.v v8, (%0)" ::"r"(y));
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+    avl -= vl0;
+
+    vl1 = 0;
+    if (avl > 0) {
+      // Set the vl for the second chunk
+      asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl1) : "r"(avl));
+
+      // The second chunk starts vl0 elements after the first one. Using vl1
+      // here would be wrong whenever the second chunk is a shorter tail.
+      const double *x1 = x + vl0;
+      y1 = y + vl0;
+
+      // Load vectors
+      asm volatile("vle64.v v16, (%0)" ::"r"(x1));
+      asm volatile("vle64.v v24, (%0)" ::"r"(y1));
+
+      // Multiply-accumulate
+      asm volatile("vfmacc.vf v24, %0, v16" ::"f"(a));
+      avl -= vl1;
+
+      // Tail iteration only: the active vl is now vl1, but v8 holds vl0
+      // results. Restore vl0 so the store below writes all of them.
+      // vl0 never exceeds VLMAX, so this vsetvli yields exactly vl0.
+      if (vl1 != vl0) {
+        asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(vl0));
+      }
+    }
+
+    // Store results of the first chunk
+    asm volatile("vse64.v v8, (%0)" ::"r"(y));
+
+    if (vl1 > 0) {
+      // Tail iteration only: switch back to the second chunk's length
+      if (vl1 != vl0) {
+        asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(vl1));
+      }
+
+      // Store results of the second chunk
+      asm volatile("vse64.v v24, (%0)" ::"r"(y1));
+    }
+
+    // Bump pointers past both chunks
+    x += vl0 + vl1;
+    y += vl0 + vl1;
+
+  } while (avl > 0);
+}
+
+
 // 32-bit AXPY: y = a * x + y
 void faxpy_v32b(const float a, const float *x, const float *y,
                 unsigned int avl) {

diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
@@ -21,6 +21,8 @@
 
 inline void faxpy_v64b(const double a, const double *x, const double *y,
                        unsigned int avl) __attribute__((always_inline));
+inline void faxpy_v64b_unrl(const double a, const double *x, const double *y,
+                        unsigned int avl) __attribute__((always_inline)); // Unrolled 64-bit AXPY: y = a * x + y
 inline void faxpy_v32b(const float a, const float *x, const float *y,
                        unsigned int avl) __attribute__((always_inline));
 inline void faxpy_v16b(const _Float16 a, const _Float16 *x, const _Float16 *y,

diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c
@@ -83,7 +83,7 @@ int main() {
     timer = benchmark_get_cycle();
 
   // Call AXPY
-  faxpy_v64b(*a, x_int, y_int, dim_core);
+  faxpy_v64b_unrl(*a, x_int, y_int, dim_core);
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();