Skip to content

Commit

Permalink
asdf
Browse files Browse the repository at this point in the history
Describe the commit content in detail (wrap lines at 80 columns).

**Changes proposed in this PR:**
- Added TOC generator for README.md

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
  • Loading branch information
jijoongmoon committed May 8, 2024
1 parent 4ecc13b commit 3a6ff34
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 26 deletions.
36 changes: 19 additions & 17 deletions nntrainer/tensor/blas_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
void vcvt_f16_f32(size_t N, const void *input, float *output) {
assert(N != 0);
assert(input != NULL);
Expand Down Expand Up @@ -114,7 +115,7 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
}
}

bool hasNaN(const size_t N, const float *input) {
bool hasNaN(const size_t N, const _Float16 *input) {
assert(N != 0);
assert(input != NULL);

Expand All @@ -123,18 +124,29 @@ bool hasNaN(const size_t N, const float *input) {

// 16 single-precision check : ( X != X )
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 = _mm256_loadu_ps(input);
const __m256 vec0 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec1 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8));

input += 16;

__m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);

if (temp)
return true;

__m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res1);

if (temp)
return true;
}

// 8 single-precision check : ( X != X )
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_loadu_ps(input);
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
input += 8;
__m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);
Expand All @@ -153,8 +165,9 @@ bool hasNaN(const size_t N, const float *input) {

return false;
}
#endif

bool hasNaN(const size_t N, const _Float16 *input) {
bool hasNaN(const size_t N, const float *input) {
assert(N != 0);
assert(input != NULL);

Expand All @@ -163,29 +176,18 @@ bool hasNaN(const size_t N, const _Float16 *input) {

// 16 single-precision check : ( X != X )
for (; N - idx >= 16; idx += 16) {
const __m256 vec0 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec1 =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input + 8));

const __m256 vec0 = _mm256_loadu_ps(input);
input += 16;

__m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);

if (temp)
return true;

__m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res1);

if (temp)
return true;
}

// 8 single-precision check : ( X != X )
for (; N - idx >= 8; idx += 8) {
const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
const __m256 vec = _mm256_loadu_ps(input);
input += 8;
__m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
temp = temp | _mm256_movemask_ps(res);
Expand Down
10 changes: 6 additions & 4 deletions nntrainer/tensor/blas_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
Expand All @@ -44,19 +45,20 @@ void vcvt_f32_f16(size_t N, const float *input, void *output);
* @brief check if the X has NaN value
* @note it compare !(x==x)
* @param[in] N length of the vector
* @param[in] X float * for Vector X
* @param[in] X half-precision * for Vector X
* @param[out] true if it has NaN
*/
bool hasNaN(const size_t N, const float *X);
bool hasNaN(const size_t N, const _Float16 *X);
#endif

/**
* @brief check if the X has NaN value
* @note it compare !(x==x)
* @param[in] N length of the vector
* @param[in] X half-precision * for Vector X
* @param[in] X float * for Vector X
* @param[out] true if it has NaN
*/
bool hasNaN(const size_t N, const _Float16 *X);
bool hasNaN(const size_t N, const float *X);

} // namespace nntrainer::avx

Expand Down
12 changes: 10 additions & 2 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,8 +1094,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
const void *X) {
#ifdef USE_NEON
if (d_type == ml::train::TensorDim::DataType::FP16) {
const __fp16 *vec = (const __fp16 *)X;
#ifdef ENABLE_FP16
const _FP16 *vec = (const _FP16 *)X;
return nntrainer::neon::hasNaN(N, vec);
#else
throw std::invalid_argument("FP16 type NEON type is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP32) {
const float *vec = (const float *)X;
return nntrainer::neon::hasNaN(N, vec);
Expand All @@ -1104,8 +1108,12 @@ bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
}
#else
if (d_type == ml::train::TensorDim::DataType::FP16) {
const _Float16 *vec = (const _Float16 *)X;
#ifdef ENABLE_FP16
const _FP16 *vec = (const _FP16 *)X;
return nntrainer::avx::hasNaN(N, vec);
#else
throw std::invalid_argument("FP16 type AVX type is not enabled");
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP32) {
const float *vec = (const float *)X;
return nntrainer::avx::hasNaN(N, vec);
Expand Down
9 changes: 6 additions & 3 deletions nntrainer/tensor/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ cl_headers = [


arch = host_machine.cpu_family()

if arch == 'x86_64'
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif

if get_option('enable-fp16')
if arch == 'arm'
error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
Expand All @@ -55,9 +61,6 @@ if get_option('enable-fp16')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif
endif

Expand Down

0 comments on commit 3a6ff34

Please sign in to comment.