Skip to content

Commit

Permalink
asdf
Browse files Browse the repository at this point in the history
Describe the commit content in detail (wrap lines at 80 columns).

**Changes proposed in this PR:**
- Added TOC generator for README.md

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
  • Loading branch information
jijoongmoon committed May 8, 2024
1 parent 4ecc13b commit 3a6ff34
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 26 deletions.
36 changes: 19 additions & 17 deletions nntrainer/tensor/blas_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
void vcvt_f16_f32(size_t N, const void *input, float *output) {
assert(N != 0);
assert(input != NULL);
Expand Down Expand Up @@ -114,7 +115,7 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
}
}

bool hasNaN(const size_t N, const float *input) {
bool hasNaN(const size_t N, const _Float16 *input) {
assert(N != 0);
assert(input != NULL);

Expand All @@ -123,18 +124,29 @@ bool hasNaN(const size_t N, const float *input) {

// 16 single-precision check : ( X != X )
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 = _mm256_loadu_ps(input);
const __m256 vec0 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec1 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8));

input += 16;

__m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);

if (temp)
return true;

__m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res1);

if (temp)
return true;
}

// 8 single-precision check : ( X != X )
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_loadu_ps(input);
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
input += 8;
__m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);
Expand All @@ -153,8 +165,9 @@ bool hasNaN(const size_t N, const float *input) {

return false;
}
#endif

bool hasNaN(const size_t N, const _Float16 *input) {
bool hasNaN(const size_t N, const float *input) {
assert(N != 0);
assert(input != NULL);

Expand All @@ -163,29 +176,18 @@ bool hasNaN(const size_t N, const _Float16 *input) {

// 16 single-precision check : ( X != X )
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec1 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8));

const __m256 vec0 = _mm256_loadu_ps(input);
input += 16;

__m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);

if (temp)
return true;

__m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res1);

if (temp)
return true;
}

// 8 single-precision check : ( X != X )
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec = _mm256_loadu_ps(input);
input += 8;
__m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);
Expand Down
10 changes: 6 additions & 4 deletions nntrainer/tensor/blas_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
Expand All @@ -44,19 +45,20 @@ void vcvt_f32_f16(size_t N, const float *input, void *output);
* @brief check if the X has NaN value
* @note it compare !(x==x)
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] X half-precision * for Vector X
* @param[out] true if it has NaN
*/
bool hasNaN(const size_t N, const float *X);
bool hasNaN(const size_t N, const _Float16 *X);
#endif

/**
* @brief check if the X has NaN value
* @note it compare !(x==x)
* @param[in] N length of the vector
* @param[in] X half-precision * for Vector X
* @param[in] X float * for Vector X
* @param[out] true if it has NaN
*/
bool hasNaN(const size_t N, const _Float16 *X);
bool hasNaN(const size_t N, const float *X);

} // namespace nntrainer::avx

Expand Down
12 changes: 10 additions & 2 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,8 +1094,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
const void *X) {
#ifdef USE_NEON
if (d_type == ml::train::TensorDim::DataType::FP16) {
const __fp16 *vec = (const __fp16 *)X;
#ifdef ENABLE_FP16
const _FP16 *vec = (const _FP16 *)X;
return nntrainer::neon::hasNaN(N, vec);
#else
throw std::invalid_argument("FP16 type NEON type is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP32) {
const float *vec = (const float *)X;
return nntrainer::neon::hasNaN(N, vec);
Expand All @@ -1104,8 +1108,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
}
#else
if (d_type == ml::train::TensorDim::DataType::FP16) {
const _Float16 *vec = (const _Float16 *)X;
#ifdef ENABLE_FP16
const _FP16 *vec = (const _FP16 *)X;
return nntrainer::avx::hasNaN(N, vec);
#else
throw std::invalid_argument("FP16 type AVX type is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP32) {
const float *vec = (const float *)X;
return nntrainer::avx::hasNaN(N, vec);
Expand Down
9 changes: 6 additions & 3 deletions nntrainer/tensor/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ cl_headers = [


arch = host_machine.cpu_family()

if arch == 'x86_64'
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif

if get_option('enable-fp16')
if arch == 'arm'
error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
Expand All @@ -55,9 +61,6 @@ if get_option('enable-fp16')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif
endif

Expand Down

0 comments on commit 3a6ff34

Please sign in to comment.