[neon] Modify neon sgemv_fp16
- Previously, sgemv_fp16 depended on two conditions:
	1. the column or row length had to be divisible by 8
	2. all arithmetic was done purely in fp16 (which could raise accuracy issues)
- With this commit, sgemv is expected to:
	1. support any column length (with adaptive-compute optimization)
	2. use a temporary fp32 array to keep cumulative rounding error in check for large-scale Tensors (a minimal sketch of the idea follows below)
	3. accelerate fp32-to-fp16 copies (and vice versa) with neon to improve runtime performance
- some trivial typo fixes are also included
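
Below is a minimal C++/neon sketch of the approach described above, for the non-transposed, unit-stride case with a row-major A. It is not the nntrainer kernel: the function name sgemv_fp16_fp32acc is hypothetical, and instead of the temporary fp32 output array mentioned in item 2 it simply keeps each row's accumulator in fp32 registers. It assumes an AArch64 compiler with __fp16 support.

  // Sketch only: y = alpha * A * x + beta * y with fp16 storage,
  // fp32 accumulation, and a scalar tail so any column length N works.
  #include <arm_neon.h>

  void sgemv_fp16_fp32acc(const __fp16 *A, const __fp16 *X, __fp16 *Y,
                          unsigned int M, unsigned int N, float alpha,
                          float beta) {
    for (unsigned int i = 0; i < M; ++i) {
      const __fp16 *row = A + i * N;
      float32x4_t acc = vdupq_n_f32(0.f); // fp32 accumulator limits rounding error
      unsigned int j = 0;
      for (; j + 8 <= N; j += 8) {        // vectorized body: 8 fp16 lanes per step
        float16x8_t a = vld1q_f16(row + j);
        float16x8_t x = vld1q_f16(X + j);
        // widen each fp16 half to fp32 before the multiply-accumulate
        acc = vfmaq_f32(acc, vcvt_f32_f16(vget_low_f16(a)),
                        vcvt_f32_f16(vget_low_f16(x)));
        acc = vfmaq_f32(acc, vcvt_f32_f16(vget_high_f16(a)),
                        vcvt_f32_f16(vget_high_f16(x)));
      }
      float sum = vaddvq_f32(acc);
      for (; j < N; ++j)                  // scalar tail: N need not be divisible by 8
        sum += static_cast<float>(row[j]) * static_cast<float>(X[j]);
      Y[i] = static_cast<__fp16>(alpha * sum + beta * static_cast<float>(Y[i]));
    }
  }

The same vcvt_f32_f16 / vcvt_f16_f32 widening and narrowing intrinsics are also the usual way to vectorize bulk fp16/fp32 copies of the kind item 3 refers to.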

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed Sep 26, 2023
1 parent 9149a55 commit 0fc814a
Showing 6 changed files with 434 additions and 1,379 deletions.
18 changes: 6 additions & 12 deletions nntrainer/tensor/blas_interface.cpp
@@ -103,28 +103,20 @@ static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
 
   if (TransA == CblasTrans) {
 #ifdef USE__FP16
-    if (incX == 1 && incY == 1 && (N % 16 == 0 || N % 8 == 0)) {
-
-      nntrainer::neon::sgemv_transpose_neon_fp16(A, X, Y, M, N, alpha, beta);
-    } else {
-      sgemv_loop_fp16(i, j, N, M);
-    }
+    nntrainer::neon::sgemv_transpose_neon_fp16(A, X, Y, M, N, alpha, beta);
 #else
     sgemv_loop_fp16(i, j, N, M);
 #endif
   } else {
 #ifdef USE__FP16
-    if (incX == 1 && incY == 1 && (N % 16 == 0 || N % 8 == 0)) {
-      nntrainer::neon::sgemv_neon_fp16(A, X, Y, M, N, alpha, beta);
-    } else {
-      sgemv_loop_fp16(j, i, M, N);
-    }
+    nntrainer::neon::sgemv_neon_fp16(A, X, Y, M, N, alpha, beta);
 #else
     sgemv_loop_fp16(j, i, M, N);
 #endif
   }
 }
 
+
 static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
                        const unsigned int incX, const _FP16 *Y,
                        const unsigned int incY) {
@@ -175,7 +167,7 @@ static void scopy_INT4(const unsigned int N, const uint8_t *X, const int incX,
 
 #ifdef USE__FP16
   if (incX == 1 && incY == 1) {
-    nntrainer::neon::scopy_neon_int4(N, X, Y);
+    nntrainer::neon::scopy_neon_int4_to_fp16(N, X, Y);
   } else {
     throw std::invalid_argument(
       "Error: incX == 1 && incY == 1 is supported only");
@@ -188,6 +180,7 @@ static void scopy_INT4(const unsigned int N, const uint8_t *X, const int incX,
 #endif
 }
 
+
 static void ewvm_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y,
                       _FP16 *Z) {
 #ifdef USE__FP16
@@ -342,6 +335,7 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
   sgemv_FP16(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
 }
 
+
 unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) {
   /// @todo isamax_FP16 for BLAS_NUM_THREADS
   return isamax_FP16(N, X, incX);
6 changes: 4 additions & 2 deletions nntrainer/tensor/blas_interface.h
@@ -55,14 +55,16 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const float alpha, const _FP16 *A, const unsigned int lda,
           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
           const unsigned int ldc);

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const _FP16 *A,
           const unsigned int lda, const _FP16 *X, const int incX,
           const float beta, _FP16 *Y, const int incY);

void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);

void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);

unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX);
#endif

(diffs for the remaining 4 changed files are not shown here)
