diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 8882a8eb75..d82b14ff5d 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -108,10 +108,25 @@ static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
 static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
                        const unsigned int incX, const _FP16 *Y,
                        const unsigned int incY) {
+
+  if (incX < 0 or incY < 0) // NOTE(review): incX/incY are unsigned -- always false
+    throw std::invalid_argument("Error: negative inc not supported");
+
   _FP16 ret = 0;
+
+#ifdef USE__FP16
+  if (incX == 1 && incY == 1) {
+    ret = nntrainer::neon::sdot_neon_fp16(N, X, Y);
+  } else {
+    for (unsigned int i = 0; i < N; ++i) {
+      ret += X[i * incX] * Y[i * incY];
+    }
+  }
+#else
   for (unsigned int i = 0; i < N; ++i) {
     ret += X[i * incX] * Y[i * incY];
   }
+#endif
   return ret;
 }
 
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 57d331cdd6..0a9094eb67 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -519,7 +519,8 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
   }
 }
 
-void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, __fp16 *Y) {
+void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
+                     __fp16 *Y) {
 
   const float16x8_t v_alphaX8 = vmovq_n_f16(alpha);
   const float16x4_t v_alphaX4 = vmov_n_f16(alpha);
@@ -527,7 +528,7 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
   unsigned int idx = 0;
 
   // processing batch of 8
-  for(; (N - idx) >= 8 ; idx += 8){
+  for (; (N - idx) >= 8; idx += 8) {
     float16x8_t x = vld1q_f16(&X[idx]);
     float16x8_t y = vld1q_f16(&Y[idx]);
 
@@ -537,7 +538,7 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
   }
 
   // processing remaining batch of 4
-  for(; (N - idx) >= 4 ; idx += 4){
+  for (; (N - idx) >= 4; idx += 4) {
     float16x4_t x = vld1_f16(&X[idx]);
     float16x4_t y = vld1_f16(&Y[idx]);
 
@@ -551,4 +552,52 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
     Y[idx] = Y[idx] + alpha * X[idx];
   }
 
+__fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y) {
+
+  float16x8_t accX8 = vmovq_n_f16(0);
+  float16x4_t accX4 = vmov_n_f16(0);
+
+  unsigned int idx = 0;
+  __fp16 ret = 0;
+
+  // processing batch of 8
+  for (; (N - idx) >= 8; idx += 8) {
+    float16x8_t x = vld1q_f16(&X[idx]);
+    float16x8_t y = vld1q_f16(&Y[idx]);
+
+    // x*y + accX8 -> accX8
+    accX8 = vfmaq_f16(accX8, x, y);
+  }
+
+  // check at least one batch of 8 is processed
+  if (N >= 8) {
+    __fp16 result[8];
+    vst1q_f16(result, accX8);
+    for (unsigned int i = 0; i < 8; i++)
+      ret += result[i];
+  }
+
+  // processing remaining batch of 4
+  for (; (N - idx) >= 4; idx += 4) {
+    float16x4_t x = vld1_f16(&X[idx]);
+    float16x4_t y = vld1_f16(&Y[idx]);
+
+    // x*y + accX4 -> accX4
+    accX4 = vfma_f16(accX4, x, y);
+  }
+
+  // check at least one batch of 4 is processed
+  if (N % 8 >= 4) {
+    __fp16 result[4];
+    vst1_f16(result, accX4);
+    ret += result[0] + result[1] + result[2] + result[3];
+  }
+
+  // processing remaining values
+  for (; idx < N; idx++)
+    ret += X[idx] * Y[idx];
+
+  return ret;
+}
+
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index c57340973d..649ea89c2a 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -83,7 +83,16 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
  * @param[in] X __fp16 * for Vector X
  * @param[in] Y __fp16 * for Vector Y
  */
-void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, __fp16 *Y);
+void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
+                     __fp16 *Y);
+
+/**
+ * @brief sdot computation with neon: sum of all X * Y
+ * @param[in] N number of elements in Y
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+__fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y);
 
 } // namespace nntrainer::neon
diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
index 5d4e71aad7..aa9544759e 100644
--- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
+++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
@@ -65,11 +65,65 @@ TEST(nntrainer_Tensor, add_i) {
   double cosSimNeon = cosine_similarity<__fp16>(
     input.getData<__fp16>(), input_fp32.getData(), input.size());
-
+
   EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
   EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
 }
 
+TEST(nntrainer_Tensor, dot) {
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  // conditions for fp16 sdot call:
+  // this->(batch * channel * height) = arg->(width) = 1;
+
+  size_t width = 23;
+
+  __fp16 a_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                     12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  nntrainer::Tensor input(
+    nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16), a_data);
+  __fp16 b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                     12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  nntrainer::Tensor input_2(
+    nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp16), b_data);
+
+  float a_data_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                         12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  nntrainer::Tensor input_fp32(
+    nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32), a_data_fp32);
+  float b_data_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
+                         12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  nntrainer::Tensor input_fp32_2(
+    nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp32), b_data_fp32);
+
+  nntrainer::Tensor result_neon;
+  nntrainer::Tensor result_fp32;
+
+  // NEON fp16
+  result_neon = input.dot(input_2, false, false);
+
+  // fp32
+  result_fp32 = input_fp32.dot(input_fp32_2, false, false);
+
+  float mseErrorNeon =
+    mse<__fp16>(result_neon.getData<__fp16>(), result_fp32.getData(),
+                result_neon.size());
+
+  double cosSimNeon =
+    cosine_similarity<__fp16>(result_neon.getData<__fp16>(),
+                              result_fp32.getData(), result_neon.size());
+
+  const float epsilon = 1e-4;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 GTEST_API_ int main(int argc, char **argv) {
 
   int result = -1;