[neon] Modify neon sgemv_fp16
- Previously, sgemv_fp16 depended on two conditions:
	1. the column or row length had to be divisible by 8
	2. all arithmetic was done purely in fp16 (which could raise accuracy issues)
- With this commit, sgemv is expected to:
	1. support any column length (with adaptive-compute optimization)
	2. use a temporary fp32 array to keep cumulative rounding error in check for large-scale Tensors (a minimal sketch of the idea follows below)
	3. accelerate fp32-to-fp16 copies (and vice versa) with neon to improve runtime performance
- some trivial typo fixes are also included
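
Below is a minimal C++/neon sketch of the approach described above, for the non-transposed, unit-stride case with a row-major A. It is not the nntrainer kernel: the function name sgemv_fp16_fp32acc is hypothetical, and instead of the temporary fp32 output array mentioned in item 2 it simply keeps each row's accumulator in fp32 registers. It assumes an AArch64 compiler with __fp16 support.

  // Sketch only: y = alpha * A * x + beta * y with fp16 storage,
  // fp32 accumulation, and a scalar tail so any column length N works.
  #include <arm_neon.h>

  void sgemv_fp16_fp32acc(const __fp16 *A, const __fp16 *X, __fp16 *Y,
                          unsigned int M, unsigned int N, float alpha,
                          float beta) {
    for (unsigned int i = 0; i < M; ++i) {
      const __fp16 *row = A + i * N;
      float32x4_t acc = vdupq_n_f32(0.f); // fp32 accumulator limits rounding error
      unsigned int j = 0;
      for (; j + 8 <= N; j += 8) {        // vectorized body: 8 fp16 lanes per step
        float16x8_t a = vld1q_f16(row + j);
        float16x8_t x = vld1q_f16(X + j);
        // widen each fp16 half to fp32 before the multiply-accumulate
        acc = vfmaq_f32(acc, vcvt_f32_f16(vget_low_f16(a)),
                        vcvt_f32_f16(vget_low_f16(x)));
        acc = vfmaq_f32(acc, vcvt_f32_f16(vget_high_f16(a)),
                        vcvt_f32_f16(vget_high_f16(x)));
      }
      float sum = vaddvq_f32(acc);
      for (; j < N; ++j)                  // scalar tail: N need not be divisible by 8
        sum += static_cast<float>(row[j]) * static_cast<float>(X[j]);
      Y[i] = static_cast<__fp16>(alpha * sum + beta * static_cast<float>(Y[i]));
    }
  }

The same vcvt_f32_f16 / vcvt_f16_f32 widening and narrowing intrinsics are also the usual way to vectorize bulk fp16/fp32 copies of the kind item 3 refers to.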

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed Sep 26, 2023
1 parent 9149a55 commit 0fc814a
Showing 6 changed files with 434 additions and 1,379 deletions.
18 changes: 6 additions & 12 deletions nntrainer/tensor/blas_interface.cpp
@@ -103,28 +103,20 @@ static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
 
   if (TransA == CblasTrans) {
 #ifdef USE__FP16
-    if (incX == 1 && incY == 1 && (N % 16 == 0 || N % 8 == 0)) {
-
-      nntrainer::neon::sgemv_transpose_neon_fp16(A, X, Y, M, N, alpha, beta);
-    } else {
-      sgemv_loop_fp16(i, j, N, M);
-    }
+    nntrainer::neon::sgemv_transpose_neon_fp16(A, X, Y, M, N, alpha, beta);
 #else
     sgemv_loop_fp16(i, j, N, M);
 #endif
   } else {
 #ifdef USE__FP16
-    if (incX == 1 && incY == 1 && (N % 16 == 0 || N % 8 == 0)) {
-      nntrainer::neon::sgemv_neon_fp16(A, X, Y, M, N, alpha, beta);
-    } else {
-      sgemv_loop_fp16(j, i, M, N);
-    }
+    nntrainer::neon::sgemv_neon_fp16(A, X, Y, M, N, alpha, beta);
 #else
     sgemv_loop_fp16(j, i, M, N);
 #endif
   }
 }
 
+
 static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
                        const unsigned int incX, const _FP16 *Y,
                        const unsigned int incY) {
@@ -175,7 +167,7 @@ static void scopy_INT4(const unsigned int N, const uint8_t *X, const int incX,
 
 #ifdef USE__FP16
   if (incX == 1 && incY == 1) {
-    nntrainer::neon::scopy_neon_int4(N, X, Y);
+    nntrainer::neon::scopy_neon_int4_to_fp16(N, X, Y);
   } else {
     throw std::invalid_argument(
       "Error: incX == 1 && incY == 1 is supported only");
@@ -188,6 +180,7 @@ static void scopy_INT4(const unsigned int N, const uint8_t *X, const int incX,
 #endif
 }
 
+
 static void ewvm_FP16(const unsigned int N, const _FP16 *X, const _FP16 *Y,
                       _FP16 *Z) {
 #ifdef USE__FP16
@@ -342,6 +335,7 @@ void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
   sgemv_FP16(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
 }
 
+
 unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) {
   /// @todo isamax_FP16 for BLAS_NUM_THREADS
   return isamax_FP16(N, X, incX);
6 changes: 4 additions & 2 deletions nntrainer/tensor/blas_interface.h
@@ -55,14 +55,16 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const float alpha, const _FP16 *A, const unsigned int lda,
           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
           const unsigned int ldc);

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const _FP16 *A,
           const unsigned int lda, const _FP16 *X, const int incX,
           const float beta, _FP16 *Y, const int incY);

void ewvm(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);

void ewva(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z);

unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX);
#endif

(diffs for the remaining 4 changed files are not shown here)
