From 61f4164ca326027fc49aa77a3bbea1604db187b6 Mon Sep 17 00:00:00 2001
From: Navaneeth-Kunhi Purayil
Date: Fri, 3 Jan 2025 15:44:51 +0100
Subject: [PATCH] [faxpy] Add unrolled AXPY kernel to improve performance

Performance achieved for the kernels:
1) axpy_4096       : 41.9%
2) dotp_4096       : 48.3%
3) matmul_64x64x64 : 97.8%
---
 sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c | 48 ++++++++++++++++++++++
 sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h |  2 +
 sw/spatzBenchmarks/dp-faxpy/main.c         |  2 +-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
index a49ef62a..635e838f 100644
--- a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
+++ b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
@@ -45,6 +45,54 @@ void faxpy_v64b(const double a, const double *x, const double *y,
   } while (avl > 0);
 }
 
+// Unrolled 64-bit AXPY: y = a * x + y
+void faxpy_v64b_unrl(const double a, const double *x, const double *y,
+                     unsigned int avl) {
+  unsigned int vl0, vl1;
+
+  // Stripmine two vector register groups per iteration
+  do {
+    // Set the vl for the first chunk
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl0) : "r"(avl));
+
+    // Load the first chunk
+    asm volatile("vle64.v v0, (%0)" ::"r"(x));
+    asm volatile("vle64.v v8, (%0)" ::"r"(y));
+
+    // Multiply-accumulate the first chunk
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+    avl -= vl0;
+    vl1 = 0;
+    if (avl > 0) {
+      // Set the vl for the second chunk
+      asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl1) : "r"(avl));
+
+      // Load the second chunk, one first-chunk vl ahead
+      asm volatile("vle64.v v16, (%0)" ::"r"(x + vl0));
+      asm volatile("vle64.v v24, (%0)" ::"r"(y + vl0));
+
+      // Multiply-accumulate the second chunk
+      asm volatile("vfmacc.vf v24, %0, v16" ::"f"(a));
+      avl -= vl1;
+      // Restore the first chunk's vl before its store
+      asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(vl0));
+    }
+
+    // Store the first chunk
+    asm volatile("vse64.v v8, (%0)" ::"r"(y));
+    if (vl1 > 0) {
+      // Store the second chunk with its own vl
+      asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(vl1));
+      asm volatile("vse64.v v24, (%0)" ::"r"(y + vl0));
+    }
+
+    // Bump pointers past both chunks
+    x += vl0 + vl1;
+    y += vl0 + vl1;
+
+  } while (avl > 0);
+}
+
 // 32-bit AXPY: y = a * x + y
 void faxpy_v32b(const float a, const float *x, const float *y,
                 unsigned int avl) {
diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
index e5625878..97e5a357 100644
--- a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
+++ b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
@@ -21,6 +21,8 @@
 inline void faxpy_v64b(const double a, const double *x, const double *y,
                        unsigned int avl) __attribute__((always_inline));
+inline void faxpy_v64b_unrl(const double a, const double *x, const double *y,
+                            unsigned int avl) __attribute__((always_inline));
 inline void faxpy_v32b(const float a, const float *x, const float *y,
                        unsigned int avl) __attribute__((always_inline));
 inline void faxpy_v16b(const _Float16 a, const _Float16 *x, const _Float16 *y,
                        unsigned int avl) __attribute__((always_inline));
diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c
index 7e4539ea..9ec8bb3a 100644
--- a/sw/spatzBenchmarks/dp-faxpy/main.c
+++ b/sw/spatzBenchmarks/dp-faxpy/main.c
@@ -83,7 +83,7 @@ int main() {
   timer = benchmark_get_cycle();
 
   // Call AXPY
-  faxpy_v64b(*a, x_int, y_int, dim_core);
+  faxpy_v64b_unrl(*a, x_int, y_int, dim_core);
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
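
Note on the unrolling scheme: each iteration of faxpy_v64b_unrl stripmines two
chunks of up to VLMAX elements, issuing the second chunk's loads and
multiply-accumulate before the first chunk's store so that compute and memory
traffic can overlap. The sketch below is an illustrative scalar model of that
two-chunk pattern, not code from this patch; the helper name faxpy_ref_unrolled
and the fixed CHUNK constant (a stand-in for the hardware vl returned by
vsetvli) are assumptions for illustration only.

#include <stddef.h>

/* Hypothetical stand-in for VLMAX at e64/m8; the real value is HW-dependent. */
#define CHUNK 64

/* Scalar model of the two-chunk stripmining used by the unrolled kernel. */
static void faxpy_ref_unrolled(double a, const double *x, double *y,
                               size_t avl) {
  while (avl > 0) {
    /* First chunk: up to CHUNK elements. */
    size_t vl0 = avl < CHUNK ? avl : CHUNK;
    for (size_t i = 0; i < vl0; ++i)
      y[i] += a * x[i];
    avl -= vl0;

    /* Second chunk, handled in the same loop iteration (the "unroll"). */
    size_t vl1 = avl < CHUNK ? avl : CHUNK;
    for (size_t i = 0; i < vl1; ++i)
      y[vl0 + i] += a * x[vl0 + i];
    avl -= vl1;

    /* Advance past both chunks. */
    x += vl0 + vl1;
    y += vl0 + vl1;
  }
}

A model like this can also serve as a golden reference when checking the vector
kernel's output element by element.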