Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ blas/neon ] Add NEON fp16 function for sdot #31

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,25 @@ static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
/**
 * @brief Dot product of two fp16 vectors with arbitrary strides.
 *
 * Computes sum_{i=0}^{N-1} X[i*incX] * Y[i*incY]. When both strides are 1
 * and USE__FP16 is defined, the NEON-accelerated kernel is used.
 *
 * @param[in] N    number of elements to accumulate
 * @param[in] X    pointer to vector X
 * @param[in] incX stride between consecutive elements of X
 * @param[in] Y    pointer to vector Y
 * @param[in] incY stride between consecutive elements of Y
 * @return accumulated dot product (accumulated in _FP16 precision)
 */
static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
                       const unsigned int incX, const _FP16 *Y,
                       const unsigned int incY) {
  // NOTE(review): the previous guard `if (incX < 0 or incY < 0) throw ...`
  // was dead code: incX/incY are unsigned, so the comparison is always false
  // (-Wtautological-compare). Negative strides cannot even be expressed with
  // this signature, so the guard is removed rather than kept as misleading
  // dead code. Behavior is unchanged (it could never throw).

  _FP16 ret = 0;

#ifdef USE__FP16
  if (incX == 1 && incY == 1) {
    // Contiguous fast path: hand-vectorized NEON kernel.
    ret = nntrainer::neon::sdot_neon_fp16(N, X, Y);
  } else {
    // Strided scalar fallback.
    for (unsigned int i = 0; i < N; ++i) {
      ret += X[i * incX] * Y[i * incY];
    }
  }
#else
  // Scalar path when fp16 NEON support is not compiled in.
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
#endif
  return ret;
}

Expand Down
55 changes: 52 additions & 3 deletions nntrainer/tensor/blas_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,15 +519,16 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
}
}

void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, __fp16 *Y) {
void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
__fp16 *Y) {

const float16x8_t v_alphaX8 = vmovq_n_f16(alpha);
const float16x4_t v_alphaX4 = vmov_n_f16(alpha);

unsigned int idx = 0;

// processing batch of 8
for(; (N - idx) >= 8 ; idx += 8){
for (; (N - idx) >= 8; idx += 8) {
float16x8_t x = vld1q_f16(&X[idx]);
float16x8_t y = vld1q_f16(&Y[idx]);

Expand All @@ -537,7 +538,7 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, _
}

// processing remaining batch of 4
for(; (N - idx) >= 4 ; idx += 4){
for (; (N - idx) >= 4; idx += 4) {
float16x4_t x = vld1_f16(&X[idx]);
float16x4_t y = vld1_f16(&Y[idx]);

Expand All @@ -551,4 +552,52 @@ void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, _
Y[idx] = Y[idx] + alpha * X[idx];
}

/**
 * @brief NEON dot product of two contiguous fp16 vectors: sum of X[i] * Y[i].
 *
 * Processes the input in batches of 8 lanes, then 4 lanes, then a scalar
 * tail, using fused multiply-accumulate for the vector batches.
 *
 * @param[in] N number of elements in X and Y
 * @param[in] X __fp16 * for Vector X (contiguous, at least N elements)
 * @param[in] Y __fp16 * for Vector Y (contiguous, at least N elements)
 * @return accumulated dot product in __fp16
 */
__fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y) {

  float16x8_t accX8 = vmovq_n_f16(0);
  float16x4_t accX4 = vmov_n_f16(0);

  unsigned int idx = 0;
  __fp16 ret = 0;

  // processing batch of 8
  for (; (N - idx) >= 8; idx += 8) {
    float16x8_t x = vld1q_f16(&X[idx]);
    float16x8_t y = vld1q_f16(&Y[idx]);

    // x*y + accX8 -> accX8 (fused multiply-accumulate)
    accX8 = vfmaq_f16(accX8, x, y);
  }

  // Reduce the 8-lane accumulator only if at least one batch of 8 ran.
  // BUG FIX: the original test `N - 8 >= 0` is a tautology for unsigned N
  // (it is always true, and for N < 8 it pointlessly reduced an all-zero
  // accumulator while triggering -Wtautological-compare). `N >= 8` is the
  // intended condition.
  if (N >= 8) {
    __fp16 result[8];
    vst1q_f16(result, accX8);
    for (unsigned int i = 0; i < 8; i++)
      ret += result[i];
  }

  // processing remaining batch of 4
  for (; (N - idx) >= 4; idx += 4) {
    float16x4_t x = vld1_f16(&X[idx]);
    float16x4_t y = vld1_f16(&Y[idx]);

    // x*y + accX4 -> accX4
    accX4 = vfma_f16(accX4, x, y);
  }

  // Reduce the 4-lane accumulator only if the batch-of-4 loop ran, i.e.
  // the remainder after the batches of 8 was at least 4.
  if (N % 8 >= 4) {
    __fp16 result[4];
    vst1_f16(result, accX4);
    ret += result[0] + result[1] + result[2] + result[3];
  }

  // processing remaining values (scalar tail, fewer than 4 elements)
  for (; idx < N; idx++)
    ret += X[idx] * Y[idx];

  return ret;
}

} // namespace nntrainer::neon
11 changes: 10 additions & 1 deletion nntrainer/tensor/blas_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,16 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
* @param[in] X __fp16 * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X, __fp16 *Y);
void saxpy_neon_fp16(const unsigned int N, const float alpha, const __fp16 *X,
__fp16 *Y);

/**
* @brief sdot computation with neon: sum of all X * Y
* @param[in] N number of elements in Y
* @param[in] X __fp16 * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
__fp16 sdot_neon_fp16(const unsigned int N, const __fp16 *X, const __fp16 *Y);

} // namespace nntrainer::neon

Expand Down
56 changes: 55 additions & 1 deletion test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,65 @@ TEST(nntrainer_Tensor, add_i) {

double cosSimNeon = cosine_similarity<__fp16>(
input.getData<__fp16>(), input_fp32.getData<float>(), input.size());

EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
EXPECT_IN_RANGE(cosSimNeon, 0.99, 1);
}

// Verifies Tensor::dot on fp16 tensors against an fp32 reference.
// Shapes are chosen so the fp16 path reduces to the sdot kernel:
// a (1,1,1,width) row vector dotted with a (1,1,width,1) column vector.
TEST(nntrainer_Tensor, dot) {

  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};

  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};

  // conditions for fp16 sdot call:
  // this->(batch * channel * height) = arg->(width) = 1;

  // 23 is deliberately not a multiple of 8 or 4, so the NEON kernel's
  // batch-of-8, batch-of-4, and scalar-tail paths are all exercised.
  size_t width = 23;

  __fp16 a_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
                     12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  nntrainer::Tensor input(
    nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16), a_data);
  __fp16 b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
                     12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  nntrainer::Tensor input_2(
    nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp16), b_data);

  // fp32 copies of the same data serve as the ground-truth reference.
  float a_data_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
                         12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  nntrainer::Tensor input_fp32(
    nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32), a_data_fp32);
  float b_data_fp32[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11,
                         12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  nntrainer::Tensor input_fp32_2(
    nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp32), b_data_fp32);

  nntrainer::Tensor result_neon;
  nntrainer::Tensor result_fp32;

  // NEON fp16
  result_neon = input.dot(input_2, false, false);

  // fp32
  result_fp32 = input_fp32.dot(input_fp32_2, false, false);

  // Compare fp16 result with the fp32 reference: mean squared error must be
  // near zero and cosine similarity near one.
  float mseErrorNeon =
    mse<__fp16>(result_neon.getData<__fp16>(), result_fp32.getData<float>(),
                result_neon.size());

  double cosSimNeon =
    cosine_similarity<__fp16>(result_neon.getData<__fp16>(),
                              result_fp32.getData<float>(), result_neon.size());

  const float epsilon = 1e-4;

  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
}

GTEST_API_ int main(int argc, char **argv) {
int result = -1;

Expand Down