Skip to content

Commit

Permalink
[faxpy] added unrolled kernel for axpy to improve performance
Browse files Browse the repository at this point in the history
Performance achieved for kernels:
1) axpy_4096 : 41.9%
2) dotp_4096 : 48.3%
3) matmul_64x64x64 : 97.8%
  • Loading branch information
Navaneeth-KunhiPurayil committed Jan 3, 2025
1 parent 2bc3350 commit 61f4164
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 1 deletion.
48 changes: 48 additions & 0 deletions sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,54 @@ void faxpy_v64b(const double a, const double *x, const double *y,
} while (avl > 0);
}

// Unrolled 64-bit AXPY: y = a * x + y
//
// Strip-mines the input with e64/LMUL=8 and handles up to two vector chunks
// per loop iteration. The loads and the multiply-accumulate of the second
// chunk are issued before the store of the first chunk.
//
//   a   : scalar multiplier
//   x   : input vector (avl elements)
//   y   : input/output vector (avl elements). It is const-qualified to match
//         the prototype, but the result is stored through it by the inline
//         assembly below.
//   avl : number of elements to process; 0 is a no-op
//
// The vector registers v0-v31 are used without being listed as clobbers.
void faxpy_v64b_unrl(const double a, const double *x, const double *y,
                     unsigned int avl) {
  while (avl > 0) {
    unsigned int vl0;     // Length of the first chunk
    unsigned int vl1 = 0; // Length of the second chunk (0 if there is none)
    unsigned int vl;      // Scratch output when re-establishing a length

    // First chunk: set the vl, load, multiply-accumulate
    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl0) : "r"(avl));
    asm volatile("vle64.v v0, (%0)" ::"r"(x) : "memory");
    asm volatile("vle64.v v8, (%0)" ::"r"(y) : "memory");
    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
    avl -= vl0;

    // The second chunk starts right behind the first one, i.e. vl0 elements
    // further, independently of how long the second chunk itself is.
    const double *x1 = x + vl0;
    const double *y1 = y + vl0;

    if (avl > 0) {
      // Second chunk: set the vl, load, multiply-accumulate
      asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl1) : "r"(avl));
      asm volatile("vle64.v v16, (%0)" ::"r"(x1) : "memory");
      asm volatile("vle64.v v24, (%0)" ::"r"(y1) : "memory");
      asm volatile("vfmacc.vf v24, %0, v16" ::"f"(a));
      avl -= vl1;

      // The active vl is now vl1. If the second chunk is a shorter tail, the
      // first store must run with vl0 again, otherwise it would be truncated.
      // vl0 <= VLMAX, so requesting it as AVL yields exactly vl0.
      if (vl1 != vl0) {
        asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(vl0));
      }
    }

    // Store the results of the first chunk
    asm volatile("vse64.v v8, (%0)" ::"r"(y) : "memory");

    if (vl1 > 0) {
      // Switch back to the length of the second chunk before storing it
      if (vl1 != vl0) {
        asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(vl1));
      }

      // Store the results of the second chunk
      asm volatile("vse64.v v24, (%0)" ::"r"(y1) : "memory");
    }

    // Bump pointers past both chunks
    x = x1 + vl1;
    y = y1 + vl1;
  }
}

// 32-bit AXPY: y = a * x + y
void faxpy_v32b(const float a, const float *x, const float *y,
unsigned int avl) {
Expand Down
2 changes: 2 additions & 0 deletions sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

// AXPY kernel on double-precision operands: scalar a, vectors x and y,
// avl elements. Always inlined into the caller.
inline void faxpy_v64b(const double a, const double *x, const double *y,
unsigned int avl) __attribute__((always_inline));
// Unrolled variant of the double-precision AXPY kernel (y = a * x + y).
// NOTE(review): y is const-qualified here, although the kernel stores its
// result through y with inline assembly - confirm this is intended.
inline void faxpy_v64b_unrl(const double a, const double *x, const double *y,
unsigned int avl) __attribute__((always_inline));
// 32-bit AXPY on single-precision operands: y = a * x + y over avl elements.
inline void faxpy_v32b(const float a, const float *x, const float *y,
unsigned int avl) __attribute__((always_inline));
inline void faxpy_v16b(const _Float16 a, const _Float16 *x, const _Float16 *y,
Expand Down
2 changes: 1 addition & 1 deletion sw/spatzBenchmarks/dp-faxpy/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ int main() {
timer = benchmark_get_cycle();

// Call AXPY
faxpy_v64b(*a, x_int, y_int, dim_core);
faxpy_v64b_unrl(*a, x_int, y_int, dim_core);

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down

0 comments on commit 61f4164

Please sign in to comment.