Skip to content

Commit

Permalink
[ BLAS ] Implement explicit scopy function for AVX2
Browse files Browse the repository at this point in the history
- An occasional segmentation fault in scopy_cblas was detected by the mixed-precision model test.
- Implement a hand-written scopy function with AVX2 to replace it.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed May 22, 2024
1 parent be6c1ee commit b6dce9d
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 4 deletions.
36 changes: 36 additions & 0 deletions nntrainer/tensor/blas_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,40 @@ bool isValid(const size_t N, const float *input) {
return true;
}

/**
 * @brief Copy N floats from input to output (BLAS scopy semantics).
 *
 * When both increments are 1 the copy is done with unaligned 256-bit AVX
 * loads/stores (16 floats, then 8 floats per iteration) followed by a scalar
 * tail. For any other increment the elements are copied one by one, honouring
 * the strides: output[i * incY] = input[i * incX].
 *
 * @note The increments are unsigned, so BLAS-style negative increments cannot
 *       be expressed through this interface.
 *
 * @param[in]  N      number of elements to copy; N == 0 is a no-op
 * @param[in]  input  source vector, must not be null
 * @param[in]  incX   index increment between consecutive source elements
 * @param[out] output destination vector, must not be null
 * @param[in]  incY   index increment between consecutive destination elements
 */
void scopy_avx2(size_t N, const float *input, unsigned int incX, float *output,
                unsigned int incY) {
  assert(input != nullptr);
  assert(output != nullptr);

  // An empty copy is valid for scopy; there is simply nothing to do.
  if (N == 0) {
    return;
  }

  // Strided copies cannot use contiguous vector loads/stores; fall back to the
  // element-wise form so that incX / incY are actually respected.
  if (incX != 1 || incY != 1) {
    for (size_t i = 0; i < N; ++i) {
      output[i * incY] = input[i * incX];
    }
    return;
  }

  // Keep every index in size_t: N is size_t, and narrower counters would
  // truncate the loop bounds for very large vectors.
  const size_t N16 = N & ~static_cast<size_t>(15);
  const size_t N8 = N & ~static_cast<size_t>(7);
  size_t idx = 0;

  // Main loop: two 8-float vectors per iteration.
  for (; idx < N16; idx += 16) {
    const __m256 vec0 = _mm256_loadu_ps(input + idx);
    const __m256 vec1 = _mm256_loadu_ps(input + idx + 8);

    _mm256_storeu_ps(output + idx, vec0);
    _mm256_storeu_ps(output + idx + 8, vec1);
  }

  // At most one remaining full 8-float vector.
  for (; idx < N8; idx += 8) {
    const __m256 vec = _mm256_loadu_ps(input + idx);
    _mm256_storeu_ps(output + idx, vec);
  }

  // Scalar tail for the last (N % 8) elements.
  for (; idx < N; ++idx) {
    output[idx] = input[idx];
  }
}

} // namespace nntrainer::avx
12 changes: 12 additions & 0 deletions nntrainer/tensor/blas_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ bool isValid(const size_t N, const _Float16 *X);
*/
bool isValid(const size_t N, const float *X);

/**
 * @brief Hand-written scopy (float vector copy) using AVX2
 * @note scopy_cblas occasionally fails (segmentation fault). Thus an explicit
 * function is implemented to replace it.
 * @param[in] N length of the vector (number of elements to copy)
 * @param[in] input float * for Vector input (X)
 * @param[in] incX incremental index of input vector X
 * @param[out] output float * for Vector output (Y)
 * @param[in] incY incremental index of output vector Y
 */
void scopy_avx2(size_t N, const float *input, unsigned int incX, float *output,
unsigned int incY);
} // namespace nntrainer::avx

#endif /* __cplusplus */
Expand Down
10 changes: 6 additions & 4 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -842,10 +842,12 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
// cblas_scopy(N, (float*)(X), incX, (float*)(Y), incY);
// replace cblas scopy with raw temporary.
for (unsigned int i = 0; i < N; ++i)
Y[i * incY] = X[i * incX];
#ifdef USE_AVX
nntrainer::avx::scopy_avx2(N, X, incX, Y, incY);
#else
// leave for NEON
scopy_cblas(N, X, incX, Y, incY);
#endif
#else
scopy_raw(N, X, incX, Y, incY);
#endif
Expand Down

0 comments on commit b6dce9d

Please sign in to comment.