From 3a6ff347f6ae0b9169d3088728e4144aed3185d0 Mon Sep 17 00:00:00 2001 From: "jijoong.moon" Date: Wed, 8 May 2024 21:43:36 +0900 Subject: [PATCH] [tensor] build blas_avx on x86_64 without requiring enable-fp16 This patch decouples the AVX kernels from the enable-fp16 option. blas_avx.cpp/.h are now built for every x86_64 target, the FP16-only routines (vcvt_f16_f32, vcvt_f32_f16 and the half-precision hasNaN) are guarded with ENABLE_FP16, and blas_interface throws std::invalid_argument when an FP16 path is requested from a build without FP16 support. **Changes proposed in this PR:** - Build blas_avx.cpp / blas_avx.h unconditionally on x86_64 (meson.build) - Guard FP16-only AVX routines with ENABLE_FP16 Resolves: **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: jijoong.moon --- nntrainer/tensor/blas_avx.cpp | 36 +++++++++++++++-------------- nntrainer/tensor/blas_avx.h | 10 ++++---- nntrainer/tensor/blas_interface.cpp | 12 ++++++++-- nntrainer/tensor/meson.build | 9 +++++--- 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index fd8f3698a1..a51e8cc842 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); assert(input != NULL); @@ -114,7 +115,7 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } -bool hasNaN(const size_t N, const float *input) { +bool hasNaN(const size_t N, const _Float16 *input) { assert(N != 0); assert(input != NULL); @@ -123,18 +124,29 @@ bool hasNaN(const size_t N, const float *input) { // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = _mm256_loadu_ps(input); + const __m256 vec0 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + const __m256 vec1 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); + input += 16; + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); + if (temp) + return true; + + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + if (temp) return true; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = 
_mm256_loadu_ps(input); + const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); @@ -153,8 +165,9 @@ bool hasNaN(const size_t N, const float *input) { return false; } +#endif -bool hasNaN(const size_t N, const _Float16 *input) { +bool hasNaN(const size_t N, const float *input) { assert(N != 0); assert(input != NULL); @@ -163,29 +176,18 @@ bool hasNaN(const size_t N, const _Float16 *input) { // 16 single-precision check : ( X != X ) for (; N - idx >= 16; idx += 16) { - const __m256 vec0 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); - const __m256 vec1 = - _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8)); - + const __m256 vec0 = _mm256_loadu_ps(input); input += 16; - __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); - if (temp) - return true; - - __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); - temp = temp | _mm256_movemask_ps(res1); - if (temp) return true; } // 8 single-precision check : ( X != X ) for (; N - idx >= 8; idx += 8) { - const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + const __m256 vec = _mm256_loadu_ps(input); input += 8; __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); temp = temp | _mm256_movemask_ps(res); diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index 78ce6ef569..d25ded103f 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 /** * @brief Converts half-precision floating point values to single-precision * floating point values. 
@@ -44,19 +45,20 @@ void vcvt_f32_f16(size_t N, const float *input, void *output); * @brief check if the X has NaN value * @note it compare !(x==x) * @param[in] N length of the vector - * @param[in] X float * for Vector X + * @param[in] X half-precision * for Vector X * @param[out] true if it has NaN */ -bool hasNaN(const size_t N, const float *X); +bool hasNaN(const size_t N, const _Float16 *X); +#endif /** * @brief check if the X has NaN value * @note it compare !(x==x) * @param[in] N length of the vector - * @param[in] X half-precision * for Vector X + * @param[in] X float * for Vector X * @param[out] true if it has NaN */ -bool hasNaN(const size_t N, const _Float16 *X); +bool hasNaN(const size_t N, const float *X); } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index c84ea27b31..657486bbb8 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -1094,8 +1094,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, const void *X) { #ifdef USE_NEON if (d_type == ml::train::TensorDim::DataType::FP16) { -    const __fp16 *vec = (const __fp16 *)X; +#ifdef ENABLE_FP16 +  const _FP16 *vec = (const _FP16 *)X; return nntrainer::neon::hasNaN(N, vec); +#else +  throw std::invalid_argument("FP16 NEON path is not enabled"); +#endif } else if (d_type == ml::train::TensorDim::DataType::FP32) { const float *vec = (const float *)X; return nntrianer::neon::hasNaN(N, vec); @@ -1104,8 +1108,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type, } #else if (d_type == ml::train::TensorDim::DataType::FP16) { -    const _Float16 *vec = (const _Float16 *)X; +#ifdef ENABLE_FP16 +  const _FP16 *vec = (const _FP16 *)X; return nntrainer::avx::hasNaN(N, vec); +#else +  throw std::invalid_argument("FP16 AVX path is not enabled"); +#endif } else if (d_type == ml::train::TensorDim::DataType::FP32) { const float *vec = (const float *)X; return 
nntrainer::avx::hasNaN(N, vec); diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 0884dbd3b4..f3678790fd 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -44,6 +44,12 @@ cl_headers = [ arch = host_machine.cpu_family() + +if arch == 'x86_64' + tensor_sources += 'blas_avx.cpp' + tensor_headers += 'blas_avx.h' +endif + if get_option('enable-fp16') if arch == 'arm' error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.') @@ -55,9 +61,6 @@ if get_option('enable-fp16') nntrainer_inc += include_directories('hgemm') nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' endif - elif get_option('enable-avx') - tensor_sources += 'blas_avx.cpp' - tensor_headers += 'blas_avx.h' endif endif