diff --git a/Applications/AlexNet/jni/Android.mk b/Applications/AlexNet/jni/Android.mk index d2cbf8dce2..b3a5c8672e 100644 --- a/Applications/AlexNet/jni/Android.mk +++ b/Applications/AlexNet/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Android/NNDetector/app/src/main/jni/Android.mk b/Applications/Android/NNDetector/app/src/main/jni/Android.mk index 00fdb1ffc1..757a97ffbf 100644 --- a/Applications/Android/NNDetector/app/src/main/jni/Android.mk +++ b/Applications/Android/NNDetector/app/src/main/jni/Android.mk @@ -28,17 +28,15 @@ include $(CLEAR_VARS) NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/include/nntrainer SIMPLESHOT_DIR = . - LOCAL_ARM_NEON := true -LOCAL_CFLAGS += -std=c++17 -Ofast -mcpu=cortex-a53 -Ilz4-nougat/lib -LOCAL_LDFLAGS += -Llz4-nougat/lib/obj/local/$(TARGET_ARCH_ABI)/ -LOCAL_CXXFLAGS += -std=c++17 -frtti -fexceptions +LOCAL_CFLAGS += -std=c++17 -Ofast -mcpu=cortex-a53 -Ilz4-nougat/lib -DARM=1 +LOCAL_LDFLAGS += -Llz4-nougat/lib/obj/local/$(TARGET_ARCH_ABI)/ -DARM=1 +LOCAL_CXXFLAGS += -std=c++17 -frtti -fexceptions -fopenmp -static-openmp -DARM=1 LOCAL_CFLAGS += -pthread -fexceptions -fopenmp -static-openmp -LOCAL_LDFLAGS += -fexceptions -fopenmp -static-openmp LOCAL_MODULE_TAGS := optional LOCAL_ARM_MODE := arm LOCAL_MODULE := simpleshot_jni -LOCAL_LDLIBS := -llog -landroid -fopenmp -static-openmp -ljnigraphics +LOCAL_LDLIBS := -llog -landroid -fopenmp -static-openmp -ljnigraphics -DARM=1 LOCAL_SRC_FILES := simpleshot.cpp simpleshot_jni.cpp dataloader.cpp image.cpp LOCAL_SHARED_LIBRARIES := ccapi-nntrainer nntrainer diff --git a/Applications/Custom/LayerClient/jni/Android.mk b/Applications/Custom/LayerClient/jni/Android.mk index 56dd4286eb..af9021d7d6 100644 --- a/Applications/Custom/LayerClient/jni/Android.mk +++ b/Applications/Custom/LayerClient/jni/Android.mk @@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ ${ML_API_COMMON_INCLUDES} diff --git a/Applications/LLaMA/jni/Android.mk b/Applications/LLaMA/jni/Android.mk index f1a9c2f117..ddfab54ca4 100644 --- a/Applications/LLaMA/jni/Android.mk +++ b/Applications/LLaMA/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Layers/jni/Android.mk b/Applications/Layers/jni/Android.mk index bef36d8739..fea882c0ac 100644 --- a/Applications/Layers/jni/Android.mk +++ b/Applications/Layers/jni/Android.mk @@ -20,6 +20,9 
@@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Multi_input/jni/Android.mk b/Applications/Multi_input/jni/Android.mk index 93445867b4..207eadd0bc 100644 --- a/Applications/Multi_input/jni/Android.mk +++ b/Applications/Multi_input/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/PicoGPT/jni/Android.mk b/Applications/PicoGPT/jni/Android.mk index f7cc55d436..ce30e58868 100644 --- a/Applications/PicoGPT/jni/Android.mk +++ b/Applications/PicoGPT/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/ProductRatings/jni/Android.mk b/Applications/ProductRatings/jni/Android.mk index 7a475d643c..96bc87d134 100644 --- a/Applications/ProductRatings/jni/Android.mk +++ b/Applications/ProductRatings/jni/Android.mk @@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer/include \ $(NNTRAINER_ROOT)/nntrainer/layers \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/graph \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor diff --git a/Applications/ReinforcementLearning/DeepQ/jni/Android.mk b/Applications/ReinforcementLearning/DeepQ/jni/Android.mk index 67173ad256..9d3cf4ddb5 100644 --- a/Applications/ReinforcementLearning/DeepQ/jni/Android.mk +++ b/Applications/ReinforcementLearning/DeepQ/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/Resnet/jni/Android.mk b/Applications/Resnet/jni/Android.mk index 1c807ec393..460fb8e5eb 100644 --- a/Applications/Resnet/jni/Android.mk +++ b/Applications/Resnet/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ 
$(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk b/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk index 22be25c1dc..271f5997b6 100644 --- a/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk +++ b/Applications/TransferLearning/CIFAR_Classification/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/TransferLearning/Draw_Classification/jni/Android.mk b/Applications/TransferLearning/Draw_Classification/jni/Android.mk index 9e933db23a..0f4ed02cf2 100644 --- a/Applications/TransferLearning/Draw_Classification/jni/Android.mk +++ b/Applications/TransferLearning/Draw_Classification/jni/Android.mk @@ -18,6 +18,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/models \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ diff --git a/Applications/VGG/jni/Android.mk b/Applications/VGG/jni/Android.mk index 76aa559a51..73de265251 100644 --- a/Applications/VGG/jni/Android.mk +++ b/Applications/VGG/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/YOLOv2/jni/Android.mk b/Applications/YOLOv2/jni/Android.mk index 9f0dfb7165..ad5d90e696 100644 --- a/Applications/YOLOv2/jni/Android.mk +++ b/Applications/YOLOv2/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \ + $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \ $(NNTRAINER_ROOT)/nntrainer/utils \ $(NNTRAINER_ROOT)/api \ $(NNTRAINER_ROOT)/api/ccapi/include \ diff --git a/Applications/YOLOv3/jni/Android.mk b/Applications/YOLOv3/jni/Android.mk index 115218f45c..0877c17cb3 100644 --- a/Applications/YOLOv3/jni/Android.mk +++ b/Applications/YOLOv3/jni/Android.mk @@ -20,6 +20,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/optimizers \ $(NNTRAINER_ROOT)/nntrainer/tensor \ + 
$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \
+                      $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \
+                      $(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/arm \
                       $(NNTRAINER_ROOT)/nntrainer/utils \
                       $(NNTRAINER_ROOT)/api \
                       $(NNTRAINER_ROOT)/api/ccapi/include \
diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install
index fea2a1b5b5..934f151862 100644
--- a/debian/nntrainer-dev.install
+++ b/debian/nntrainer-dev.install
@@ -14,7 +14,10 @@
 /usr/include/nntrainer/short_tensor.h
 /usr/include/nntrainer/float_tensor.h
 /usr/include/nntrainer/tensor_wrap_specs.h
-/usr/include/nntrainer/blas_interface.h
+/usr/include/nntrainer/fallback_internal.h
+/usr/include/nntrainer/cblas_interface.h
+/usr/include/nntrainer/x86_compute_backend.h
+/usr/include/nntrainer/cpu_backend.h
 /usr/include/nntrainer/var_grad.h
 /usr/include/nntrainer/weight.h
 # todo: update dataset headers
diff --git a/jni/meson.build b/jni/meson.build
index e552919beb..82dff1db8f 100644
--- a/jni/meson.build
+++ b/jni/meson.build
@@ -25,6 +25,12 @@
 and_conf.set('VERSION_MAJOR', nntrainer_version_split[0])
 and_conf.set('VERSION_MINOR', nntrainer_version_split[1])
 and_conf.set('VERSION_MICRO', nntrainer_version_split[2])
+arch = host_machine.cpu_family()
+and_conf.set('ARM', 1)
+if arch == 'arm'
+  and_conf.set('ARMV7', 1)
+endif
+
 if get_option('enable-capi').enabled()
   and_conf.set('MESON_CAPI_NNTRAINER_SRCS', ' '.join(capi_src))
   and_conf.set('MESON_CAPI_NNTRAINER_INCS', ' '.join(capi_inc_abs))
diff --git a/meson.build b/meson.build
index 98e2cae9f6..3123357533 100644
--- a/meson.build
+++ b/meson.build
@@ -68,9 +68,23 @@ warning_c_flags = [
   '-Wno-error=varargs'
 ]
 
+arch = host_machine.cpu_family()
+if arch == 'arm' or arch == 'aarch64' or get_option('platform') == 'android'
+  message('Build for ARM architecture')
+  extra_defines += '-DARM=1'
+  if arch == 'arm'
+    extra_defines += '-DARMV7=1'
+  endif
+elif arch == 'x86' or arch == 'x86_64'
+  message('Build for X86 architecture')
+  if get_option('enable-fp16')
+    add_project_arguments(['-march=native'], language: ['c','cpp'])
+    message('-march=native added for AVX hardware acceleration.')
+  endif
+  extra_defines += '-DX86=1'
+endif
 if get_option('enable-fp16')
-  arch = host_machine.cpu_family()
   if get_option('platform') == 'android'
     add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
     extra_defines += '-DENABLE_FP16=1'
@@ -110,11 +124,6 @@ if get_option('enable-fp16')
     if cc.version().version_compare('>=12.1.0')
       message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
-      if get_option('enable-avx')
-        extra_defines += '-DUSE_AVX=1'
-        add_project_arguments(['-march=native'], language: ['c','cpp'])
-        message('-march=native added for AVX hardware acceleration.')
-      endif
     else
       warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support.
This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
diff --git a/nntrainer/layers/acti_func.h b/nntrainer/layers/acti_func.h
index c6c3576414..930f46a797 100644
--- a/nntrainer/layers/acti_func.h
+++ b/nntrainer/layers/acti_func.h
@@ -16,8 +16,8 @@
 #define __ACTI_FUNC_H__
 
 #ifdef __cplusplus
-#include <blas_interface.h>
 #include
+#include <cpu_backend.h>
 
 namespace nntrainer {
diff --git a/nntrainer/layers/activation_layer.cpp b/nntrainer/layers/activation_layer.cpp
index 8fd59506cc..ec42bfbe57 100644
--- a/nntrainer/layers/activation_layer.cpp
+++ b/nntrainer/layers/activation_layer.cpp
@@ -20,8 +20,8 @@
 #include
 #include
-#include <blas_interface.h>
 #include
+#include <cpu_backend.h>
 #include
 #include
 #include
@@ -32,8 +32,7 @@ namespace nntrainer {
 
 ActivationLayer::ActivationLayer() :
-  Layer(),
-  activation_props(new PropTypes(props::Activation())) {
+  Layer(), activation_props(new PropTypes(props::Activation())) {
   acti_func.setActiFunc(ActivationType::ACT_NONE);
 }
diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp
index c059ae9caf..59a218e997 100644
--- a/nntrainer/layers/conv2d_layer.cpp
+++ b/nntrainer/layers/conv2d_layer.cpp
@@ -16,8 +16,8 @@
 #include
 #include
-#include <blas_interface.h>
 #include
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
deleted file mode 100644
index 91187c50cc..0000000000
--- a/nntrainer/tensor/blas_interface.cpp
+++ /dev/null
@@ -1,1130 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2020 Jijoong Moon
- *
- * @file blas_interface.cpp
- * @date 28 Aug 2020
- * @see https://github.com/nnstreamer/nntrainer
- * @author Jijoong Moon
- * @author Sungsik Kong
- * @bug No known bugs except for NYI items
- * @brief This is dummy header for blas support
- *
- */
-
-#include <blas_interface.h>
-#include
-
-#if (defined USE__FP16 && defined USE_NEON)
-#include
-#include
-#endif
-
-#if USE_AVX
-#include
-#endif
-
-#ifdef USE_BLAS
-extern "C" {
-#include <cblas.h>
-}
-#endif
-
-#include
-
-#define sgemv_loop(ci, cj, cM, cN)           \
-  do {                                       \
-    float y0;                                \
-    unsigned int i, j;                       \
-    for (ci = 0; ci != cM; ci++) {           \
-      y0 = Y[ci * incy] * beta;              \
-      for (cj = 0; cj != cN; cj++)           \
-        y0 += A[i + j * lda] * X[cj * incx]; \
-      Y[ci * incy] = y0;                     \
-    }                                        \
-  } while (0);
-
-#define hgemv_loop(ci, cj, cM, cN)                                      \
-  do {                                                                  \
-    float y0;                                                           \
-    unsigned int i, j;                                                  \
-    for (ci = 0; ci != cM; ci++) {                                      \
-      y0 = static_cast<float>(Y[ci * incy] * static_cast<_FP16>(beta)); \
-      for (cj = 0; cj != cN; cj++)                                      \
-        y0 += static_cast<float>(A[i + j * lda] * X[cj * incx]);        \
-      Y[ci * incy] = static_cast<_FP16>(y0);                            \
-    }                                                                   \
-  } while (0);
-
-#define haxpy_loop()                                                       \
-  do {                                                                     \
-    unsigned int i;                                                        \
-    for (i = 0; i < N; ++i)                                                \
-      Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX]; \
-  } while (0);
-
-#define hgemm_loop()                                        \
-  do {                                                      \
-    for (unsigned int m = 0; m < M; ++m) {                  \
-      for (unsigned int n = 0; n < N; ++n) {                \
-        float c = 0;                                        \
-        _FP16 c_old = C[m * ldc + n];                       \
-        for (unsigned int k = 0; k < K; ++k) {              \
-          _FP16 a, b;                                       \
-          a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); \
-          b = ((TransB) ?
B[n * ldb + k] : B[k * ldb + n]); \ - c += static_cast(a * b); \ - } \ - C[m * ldc + n] = static_cast<_FP16>(alpha * c); \ - if (beta != 0.0) \ - C[m * ldc + n] += static_cast<_FP16>(beta) * c_old; \ - } \ - } \ - } while (0); - -namespace nntrainer { - -template -static inline void transpose_fallback(unsigned int M, unsigned int N, - const T *src, unsigned int ld_src, T *dst, - unsigned int ld_dst) { - for (unsigned int i = 0; i < M; i++) { - for (unsigned int j = 0; j < N; j++) { - dst[i + j * ld_dst] = src[i * ld_src + j]; - } - } -} - -#ifdef ENABLE_FP16 -static void saxpy_FP16(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY) { - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not supported"); - -#if (defined USE__FP16 && USE_NEON) - // USE__FP16 is defined when platform is android - if (incX == 1 && incY == 1) { - nntrainer::neon::haxpy(N, alpha, X, Y); - } else { - haxpy_loop(); - } -#else - haxpy_loop(); -#endif -} - -static void sgemv_FP16(const unsigned int TStorageOrder, bool TransA, - const unsigned int M, const unsigned int N, - const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY) { -#if (defined USE__FP16 && USE_NEON) - if (TransA) { - nntrainer::neon::hgemv_transpose(A, X, Y, M, N, alpha, beta); - } else { - nntrainer::neon::hgemv(A, X, Y, M, N, alpha, beta); - } -#else - unsigned int lenX = - (TransA) ? 1 + (M - 1) * abs(incX) : 1 + (N - 1) * abs(incX); - unsigned int lenY = - (TransA) ? 1 + (N - 1) * abs(incY) : 1 + (M - 1) * abs(incY); - - float *A_ = new float[M * N]; - float *X_ = new float[lenX]; - float *Y_ = new float[lenY]; - - scopy(M * N, A, 1, A_, 1); - scopy(lenX, X, 1, X_, 1); - scopy(lenY, Y, 1, Y_, 1); - - sgemv(TStorageOrder, TransA, M, N, alpha, A_, lda, X_, incX, beta, Y_, incY); - - scopy(lenY, Y_, 1, Y, 1); - - delete[] A_; - delete[] X_; - delete[] Y_; -#endif -} - -static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X, - const unsigned int incX, const _FP16 *Y, - const unsigned int incY) { - - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not supported"); - - _FP16 ret = 0; - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - ret = nntrainer::neon::hdot(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } - } -#else - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } -#endif - return ret; -} - -static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX, - _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::hcopy(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; -#endif -} - -static void copy_float32_to_float16(const unsigned int N, const float *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_fp32_to_fp16(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#elif USE_AVX - if (incX == 1 && incY == 1) { - nntrainer::avx::vcvt_f32_f16(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i 
* incy] = static_cast<_FP16>(X[i * incx]); - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast<_FP16>(X[i * incx]); -#endif -} - -static void copy_float16_to_float32(const unsigned int N, const _FP16 *X, - const int incX, float *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_fp16_to_fp32(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; - } -#elif USE_AVX - if (incX == 1 && incY == 1) { - nntrainer::avx::vcvt_f16_f32(N, X, Y); - } else { - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast(X[i * incx]); - } -#else - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = static_cast(X[i * incx]); -#endif -} - -static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_int4_to_fp16(N, X, Y); - } else { - throw std::invalid_argument( - "Error: incX == 1 && incY == 1 is supported only"); - } -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -#endif -} - -static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && incY == 1) { - nntrainer::neon::copy_int8_to_fp16(N, X, Y); - } else { - throw std::invalid_argument( - "Error: incX == 1 && incY == 1 is supported only"); - } -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) { - unsigned int incx = abs(incX); - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1) { - nntrainer::neon::hscal(N, X, alpha); - } else { - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx]; - } -#else - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx]; -#endif -} - -static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) { - unsigned int incx = abs(incX); - _FP16 sum; - _FP16 tmp; -#if (defined USE__FP16 && USE_NEON) - if (incX == 1) { - sum = nntrainer::neon::hnrm2(N, X); - } else { - float sum32 = 0; - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum32 += tmp * tmp; - } - sum = static_cast<_FP16>(sqrt(sum32)); - } -#else - float sum32 = 0; - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum32 += tmp * tmp; - } - sum = static_cast<_FP16>(sqrt(sum32)); -#endif - return sum; -} - -static void sgemm_FP16(const unsigned int TStorageOrder, bool TransA, - bool TransB, const unsigned int M, const unsigned int N, - const unsigned int K, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *B, - const unsigned int ldb, const float beta, _FP16 *C, - const unsigned int ldc) { - -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA, TransB); -#else - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - - float *A_ = new float[M * K]; - float *B_ = new float[N * K]; - float *C_ = new float[M * N]; - - scopy(M * K, A, 1, A_, 1); - scopy(N * K, B, 1, B_, 1); - scopy(M * N, C, 1, C_, 1); - cblas_sgemm(order, transA, transB, M, N, K, alpha, A_, lda, B_, ldb, beta, C_, - ldc); - scopy(M * N, C_, 1, C, 1); - - delete[] A_; - delete[] B_; - delete[] C_; -#endif -} - -static unsigned int isamax_FP16(const unsigned int N, const _FP16 *X, - const int incX) { - unsigned int max_idx = 0; - -#if (defined USE__FP16 && USE_NEON) - if (incX == 1 && N >= 8) { - max_idx = nntrainer::neon::isamax(N, X); - } else { - _FP16 max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n]; - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } - } -#else - _FP16 max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n]; - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } -#endif - - return max_idx; -} - -void saxpy(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY) { - saxpy_FP16(N, alpha, X, incX, Y, incY); -} - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const _FP16 *A, const unsigned int lda, - const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, - const unsigned int ldc) { - sgemm_FP16(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, - const int incY) { - scopy_FP16(N, X, incX, Y, incY); -} - -void scopy(const unsigned int N, const float *X, const int incX, _FP16 *Y, - const int incY) { - copy_float32_to_float16(N, X, incX, Y, incY); -} - -void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, - const int incY) { - copy_float16_to_float32(N, X, incX, Y, incY); -} - -void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - copy_int4_to_fp16(N, X, incX, Y, incY); -} - -void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY) { - copy_int8_to_fp16(N, X, incX, Y, incY); -} - -static void ele_mul_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X * static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_add_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X + static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_sub_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X - static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_div_fallback(const unsigned int N, const _FP16 *X, - const _FP16 *Y, _FP16 *Z, float alpha, float beta, - 
unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X / (static_cast<_FP16>(alpha) * *Y) + static_cast<_FP16>(beta) * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); -#else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); -#else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); -#else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#if (defined USE__FP16 && USE_NEON) - nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); -#else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -_FP16 snrm2(const int N, const _FP16 *X, const int incX) { - return snrm2_FP16(N, X, incX); -} - -_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, - const _FP16 *Y, const unsigned int incY) { - return sdot_FP16(N, X, incX, Y, incY); -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY) { - sgemv_FP16(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -} - -unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) { - /// @todo isamax_FP16 for BLAS_NUM_THREADS - return isamax_FP16(N, X, incX); -} - -void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { -#ifdef USE_NEON - nntrainer::neon::inv_sqrt_inplace(N, X); -#else - for (unsigned int i = 0; i < N; ++i) { - X[i] = static_cast<_FP16>(1 / std::sqrt(static_cast(X[i]))); - } -#endif -} - -void transpose_matrix(const unsigned int M, const unsigned int N, - const _FP16 *src, unsigned int ld_src, _FP16 *dst, - unsigned int ld_dst) { -#ifdef USE_NEON - transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst); -#else - transpose_fallback<_FP16>(M, N, src, ld_src, dst, ld_dst); -#endif -} -#endif - -#ifndef USE_BLAS -static void saxpy_raw(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY) { - if (incX < 0 or incY < 0) - throw std::invalid_argument("Error: negative inc not 
supported"); - for (unsigned int i = 0; i < N; ++i) - Y[i * incY] = Y[i * incY] + X[i * incX] * alpha; -} - -static void sgemv_raw(const unsigned int TStorageOrder, bool TransA, - const unsigned int M, const unsigned int N, - const float alpha, const float *A, const unsigned int lda, - const float *X, const int incX, const float beta, - float *Y, const int incY) { - - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - - if (TransA) { - sgemv_loop(i, j, N, M); - } else { - sgemv_loop(j, i, M, N); - } -} - -static float sdot_raw(const unsigned int N, const float *X, - const unsigned int incX, const float *Y, - const unsigned int incY) { - float ret = 0; - for (unsigned int i = 0; i < N; ++i) { - ret += X[i * incX] * Y[i * incY]; - } - return ret; -} - -static void scopy_raw(const unsigned int N, const float *X, const int incX, - float *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; -} - -static void sscal_raw(const unsigned int N, const float alpha, float *X, - const int incX) { - unsigned int incx = abs(incX); - - for (unsigned int i = 0; i < N; ++i) - X[i * incx] = alpha * X[i * incx]; -} - -static float snrm2_raw(const unsigned int N, const float *X, const int incX) { - unsigned int incx = abs(incX); - float sum = 0.0f; - float tmp; - - for (unsigned int i = 0; i < N; i++) { - tmp = X[i * incx]; - sum += tmp * tmp; - } - return sqrt(sum); -} - -static void sgemm_raw(const unsigned int TStorageOrder, bool TransA, - bool TransB, const unsigned int M, const unsigned int N, - const unsigned int K, const float alpha, const float *A, - const unsigned int lda, const float *B, - const unsigned int ldb, const float beta, float *C, - const unsigned int ldc) { - - for (unsigned int m = 0; m < M; ++m) { - for (unsigned int n = 0; n < N; ++n) { - double c = 0.0; - float c_old = C[m * ldc + n]; - for (unsigned int k = 0; k < K; ++k) { - float a, b; - a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); - b = ((TransB) ? 
B[n * ldb + k] : B[k * ldb + n]); - c += a * b; - } - C[m * ldc + n] = alpha * c; - if (beta != 0.0) - C[m * ldc + n] += beta * c_old; - } - } -} - -static unsigned int isamax_raw(const unsigned int N, const float *X, - const int incX) { - - unsigned int max_idx = 0; - float max_val = X[0]; - for (unsigned int n = 1; n < N; n += incX) { - float cur_val = abs(X[n]); - if (cur_val > max_val) { - max_val = cur_val; - max_idx = n; - } - } - - return max_idx; -} - -#endif - -void sscal(const unsigned int N, const float alpha, void *X, const int incX, - ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { - -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif // BLAS_NUM_THREADS - cblas_sscal(N, alpha, (float *)X, incX); -#else // USE_BLAS else - sscal_raw(N, alpha, (float *)X, incX); -#endif // USE_BLAS - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - sscal(N, alpha, (_FP16 *)X, incX); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void sscal(const unsigned int N, const float alpha, float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_sscal(N, alpha, X, incX); -#else - sscal_raw(N, alpha, X, incX); -#endif -} - -void saxpy(const unsigned int N, const float alpha, const void *X, - const int incX, void *Y, const int incY, - ml::train::TensorDim::DataType d_type) { - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_saxpy(N, alpha, static_cast(X), incX, - static_cast(Y), incY); -#else - saxpy_raw(N, alpha, static_cast(X), incX, - static_cast(Y), incY); -#endif - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - saxpy_FP16(N, alpha, static_cast(X), incX, - static_cast<_FP16 *>(Y), incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void saxpy(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_saxpy(N, alpha, X, incX, Y, incY); -#else - saxpy_raw(N, alpha, X, incX, Y, incY); -#endif -} - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const void *A, const unsigned int lda, - const void *B, const unsigned int ldb, const float beta, void *C, - const unsigned int ldc, ml::train::TensorDim::DataType d_type) { - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_CUBLAS - int devID = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, devID); - float *d_A, *d_B, *d_C; - - unsigned int size_A = M * K * sizeof(float); - unsigned int size_B = K * N * sizeof(float); - unsigned int size_C = M * N * sizeof(float); - - cudaMalloc((void **)&d_A, size_A); - cudaMalloc((void **)&d_B, size_B); - cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); - cudaMalloc((void **)&d_C, size_C); - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasOperation_t transA = (TransA) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = (TransB) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta, - d_C, N); - - cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); - cublasDestroy(handle); - -#elif defined USE_BLAS - -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor; - cblas_sgemm( - order, transA, transB, M, N, K, alpha, static_cast(A), lda, - static_cast(B), ldb, beta, static_cast(C), ldc); -#else - sgemm_raw(TStorageOrder, TransA, TransB, M, N, K, alpha, - static_cast(A), lda, static_cast(B), - ldb, beta, static_cast(C), ldc); -#endif - - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - sgemm_FP16(TStorageOrder, TransA, TransB, M, N, K, alpha, - static_cast(A), lda, - static_cast(B), ldb, beta, - static_cast<_FP16 *>(C), ldc); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} // namespace nntrainer - -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const float *A, const unsigned int lda, - const float *B, const unsigned int ldb, const float beta, float *C, - const unsigned int ldc) { -#ifdef USE_CUBLAS - int devID = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, devID); - float *d_A, *d_B, *d_C; - - unsigned int size_A = M * K * sizeof(float); - unsigned int size_B = K * N * sizeof(float); - unsigned int size_C = M * N * sizeof(float); - - cudaMalloc((void **)&d_A, size_A); - cudaMalloc((void **)&d_B, size_B); - cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); - cudaMalloc((void **)&d_C, size_C); - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasOperation_t transA = (TransA) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transB = (TransB) ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta, - d_C, N); - - cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); - cublasDestroy(handle); -#elif defined USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - cblas_sgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, - ldc); -#else - sgemm_raw(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, - C, ldc); -#endif -} - -void scopy(const unsigned int N, const void *X, const int incX, void *Y, - const int incY, ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { - -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_scopy(N, (float *)X, incX, (float *)Y, incY); -#else - scopy_raw(N, (float *)X, incX, (float *)Y, incY); -#endif - - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - scopy_FP16(N, (_FP16 *)X, incX, (_FP16 *)Y, incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void scopy(const unsigned int N, const float *X, const int incX, float *Y, - const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - cblas_scopy(N, X, incX, Y, incY); -#else - scopy_raw(N, X, incX, Y, incY); -#endif -} - -void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, - const int intY) { -#ifdef USE_NEON - nntrainer::neon::copy_int8_or_int4(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int incY) { -#ifdef USE_NEON - nntrainer::neon::copy_int4_to_fp32(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -#endif -} - -void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int incY) { -#ifdef USE_NEON - nntrainer::neon::copy_int8_to_fp32(N, X, Y); -#else - for (unsigned int idx = 0; idx < N; idx++) { - Y[idx] = X[idx]; - } -#endif -} - -float snrm2(const int N, const float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_snrm2(N, X, incX); -#else - return snrm2_raw(N, X, incX); -#endif -} - -float sdot(const unsigned int N, const float *X, const unsigned int incX, - const float *Y, const unsigned int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_sdot(N, X, incX, Y, incY); -#else - return sdot_raw(N, X, incX, Y, incY); -#endif -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const void *A, - const unsigned int lda, const void *X, const int incX, - const float beta, void *Y, const int incY, - ml::train::TensorDim::DataType d_type) { - - if (d_type == ml::train::TensorDim::DataType::FP32) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? 
CblasColMajor : CblasRowMajor; - return cblas_sgemv( - order, transA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast(Y), incY); -#else - return sgemv_raw( - TStorageOrder, TransA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast(Y), incY); -#endif - } else if (d_type == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - return sgemv_FP16( - TStorageOrder, TransA, M, N, alpha, static_cast(A), lda, - static_cast(X), incX, beta, static_cast<_FP16 *>(Y), incY); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const float *A, - const unsigned int lda, const float *X, const int incX, - const float beta, float *Y, const int incY) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; - CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor; - return cblas_sgemv(order, transA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -#else - return sgemv_raw(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, - incY); -#endif -} - -unsigned int isamax(const unsigned int N, const float *X, const int incX) { -#ifdef USE_BLAS -#ifdef BLAS_NUM_THREADS - openblas_set_num_threads(BLAS_NUM_THREADS); -#endif - return cblas_isamax(N, X, incX); -#else - return isamax_raw(N, X, incX); -#endif -} - -void sine(const unsigned int N, float *X, float *Y, float alpha) { -#ifdef USE_NEON - nntrainer::neon::sine(N, X, Y, alpha); -#else - unsigned int i = 0; - while (i < N) { - Y[i] = std::sin(alpha * X[i]); - ++i; - } -#endif -} - -void cosine(const unsigned int N, float *X, float *Y, float alpha) { -#ifdef USE_NEON - nntrainer::neon::cosine(N, X, Y, alpha); -#else - unsigned int i = 0; - while (i < N) { - Y[i] = std::cos(alpha * X[i]); - ++i; - } -#endif -} - -void inv_sqrt_inplace(const unsigned int N, float *X) { -#ifdef USE_NEON - nntrainer::neon::inv_sqrt_inplace(N, X); -#else - for (unsigned int i = 0; i < N; ++i) { - X[i] = 1 / std::sqrt(static_cast(X[i])); - } -#endif -} -static void ele_mul_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X * alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_add_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X + alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_sub_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X - alpha * *Y + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -static void ele_div_fallback(const unsigned int N, const float *X, - const float *Y, float *Z, float alpha, float beta, - unsigned int i_stride, unsigned int o_stride) { - for (unsigned int i = 0; i < N; ++i) { - *Z = *X / (alpha * *Y) + beta * *Z; - X += o_stride; - Y += i_stride; - Z += o_stride; - } -} - -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - 
float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); -#else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_mul_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); -#else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_add_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); -#else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_sub_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta, unsigned int i_stride, - unsigned int o_stride) { - if (i_stride == 1 && o_stride == 1) { -#ifdef USE_NEON - nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); -#else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -#endif - } else - ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); -} - -} // namespace nntrainer diff --git a/nntrainer/tensor/char_tensor.cpp b/nntrainer/tensor/char_tensor.cpp index ec70943b84..33e3b94acf 100644 --- a/nntrainer/tensor/char_tensor.cpp +++ b/nntrainer/tensor/char_tensor.cpp @@ -8,11 +8,10 @@ * @bug No known bugs except for NYI items */ +#include +#include #include #include - -#include -#include #include namespace nntrainer { diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp new file mode 100644 index 0000000000..db82c6549d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.cpp @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file arm_compute_backend.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for arm + * + */ +#include +#include +#include +#include +#include +#include + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_or_int4(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int4_to_fp32(N, X, Y); + } else { + __fallback_scopy_int4_to_float32(N, X, incX, Y, incY); + } +} + +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_to_fp32(N, X, Y); + } else { + __fallback_scopy_int8_to_float32(N, X, incX, Y, incY); + } +} + +void 
sine(const unsigned int N, float *X, float *Y, float alpha) { + nntrainer::neon::sine(N, X, Y, alpha); +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + nntrainer::neon::cosine(N, X, Y, alpha); +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + nntrainer::neon::inv_sqrt_inplace(N, X); +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { + __cblas_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY) { + __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { + return __cblas_sdot(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __cblas_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __cblas_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __cblas_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __cblas_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h new file mode 100644 index 0000000000..a7617dc942 --- /dev/null +++ 
b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend.h
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong
+ *
+ * @file arm_compute_backend.h
+ * @date 23 April 2024
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong
+ * @bug No known bugs except for NYI items
+ * @brief Compute backend for ARM
+ *
+ */
+#ifndef __ARM_COMPUTE_BACKEND_H__
+#define __ARM_COMPUTE_BACKEND_H__
+#ifdef __cplusplus
+
+#include
+#include
+
+namespace nntrainer {
+
+#ifdef ENABLE_FP16
+/**
+ * @brief sscal computation : X = alpha * X
+ *
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] alpha float number
+ */
+void sscal(const unsigned int N, const float alpha, _FP16 *X,
+           const unsigned int incX);
+
+/**
+ * @brief snrm2 computation : Euclidean norm
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ */
+_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X float * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy(const unsigned int N, const float *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           float *Y, const unsigned int incY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X uint8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X uint8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY);
+
+/**
+ * @brief sdot computation : sum of all X * Y
+ * @param[in] N number of elements in Y
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           const _FP16 *Y, const unsigned int incY);
+
+/**
+ * @brief saxpy computation : Y = alpha*X + Y
+ * @param[in] N number of elements in Y
+ * @param[in] alpha float number
+ * @param[in] X __fp16 * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
+           const unsigned int incX, _FP16 *Y, const unsigned int incY);
+
+/**
+ * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C,
+ * where op(X) is one of X or X**T
+ * @param[in] A __fp16 * for Matrix A
+ * @param[in] B __fp16 * for Matrix B
+ * @param[in] C __fp16 * for Matrix C
+ * @param[in] M number of op(A)'s and C's row
+ * @param[in] N number of op(B)'s and C's columns
+ * @param[in] K number of op(A)'s and columns and op(B)'s rows
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+void sgemm(const unsigned int TStorageOrder,
bool TransA, bool TransB,
+           const unsigned int M, const unsigned int N, const unsigned int K,
+           const float alpha, const _FP16 *A, const unsigned int lda,
+           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
+           const unsigned int ldc);
+/**
+ * @brief sgemv computation : Y = alpha*A*X + beta*Y
+ * @param[in] A _FP16 * for Matrix A
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] rows number of A's row
+ * @param[in] cols number of A's columns
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
+           const unsigned int N, const float alpha, const _FP16 *A,
+           const unsigned int lda, const _FP16 *X, const unsigned int incX,
+           const float beta, _FP16 *Y, const unsigned int incY);
+/**
+ * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
+             unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector addition : Z = X + alpha * Y + beta *
+ * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
+             unsigned int o_stride = 1);
+/**
+ * @brief elementwise vector subtraction with neon : Z = X - alpha * Y +
+ * beta * Z
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
+             unsigned int o_stride = 1);
+
+/**
+ * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta
+ * * Z
+ * @note ZeroDivisionError is not guaranteed in this function
+ * @param[in] N length of the vector
+ * @param[in] X _FP16 * for Vector X
+ * @param[in] Y _FP16 * for Vector Y
+ * @param[in] Z _FP16 * for Vector Z
+ * @param[in] alpha scalar multiplier for input
+ * @param[in] beta scalar multiplier for output
+ * @param[in] i_stride input stride
+ * @param[in] o_stride output stride
+ */
+void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
+             unsigned int o_stride = 1);
+
+/**
+ * @brief isamax function : index of first maxima
+ * @param[in] N number of elements in X
+ * @param[in] X __fp16 * for Vector X
+ */
+unsigned int isamax(const unsigned int N, const _FP16 *X,
+                    const unsigned int incX);
+
+/**
+ * @brief inversed squared root transformation inplace : X = 1 /
sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float snrm2(const unsigned int N, const float *X, const unsigned int incX); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : C = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's rows + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, 
const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] M number of A's rows + * @param[in] N number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); +/** + * @brief isamax function : index of the first maximum + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector 
division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +} /* namespace nntrainer */ +#endif /* __cplusplus */ +#endif /* __ARM_COMPUTE_BACKEND_H__ */ diff --git a/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp new file mode 100644 index 0000000000..133aeae47c --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file arm_compute_backend_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for ARM + * + */ +#include +#include +#include +#include +#include + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + assert(incX > 0); + + if (incX == 1) { + nntrainer::neon::hscal(N, X, alpha); + } else { + __fallback_sscal(N, alpha, X, incX); + } +} + +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) { + assert(incX > 0); + _FP16 sum; + if (incX == 1) { + sum = nntrainer::neon::hnrm2(N, X); + } else { + sum = __fallback_snrm2(N, X, incX); + } + return sum; +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::hcopy(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_fp32_to_fp16(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_fp16_to_fp32(N, X, Y); + } else { + __fallback_scopy(N, X, incX, Y, incY); + } +} + +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int4_to_fp16(N, X, Y); + } else { + __fallback_scopy_int4_to_float16(N, X, incX, Y, incY); + } +} + +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::copy_int8_to_fp16(N, X, Y); + } else { + __fallback_scopy_int8_to_float16(N, X, incX, Y, incY); + } +} + +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY) { + _FP16 ret = 0; + assert(incX > 0 && incY > 0); + if (incX == 1 && incY == 1) { + ret = nntrainer::neon::hdot(N, X, Y); + } else { + ret = __fallback_sdot(N, X, incX, Y, incY); + } + return ret; +} + 
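// Usage sketch (illustrative only; not part of the original patch): every
// wrapper above follows the same dispatch rule, so a caller can rely on one
// API for both memory layouts: contiguous data (incX == 1 && incY == 1)
// reaches the NEON kernel, any other stride reaches the scalar __fallback_*
// path. The function name and sizes below are hypothetical, and x and y are
// assumed to each hold at least 64 valid elements.
[[maybe_unused]] static _FP16 example_sdot_dispatch(const _FP16 *x,
                                                    const _FP16 *y) {
  _FP16 contiguous = sdot(64, x, 1, y, 1); // incX == incY == 1 : neon::hdot
  _FP16 strided = sdot(32, x, 2, y, 2);    // stride of 2 : __fallback_sdot
  return contiguous + strided;
}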
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY) { + if (incX == 1 && incY == 1) { + nntrainer::neon::haxpy(N, alpha, X, Y); + } else { + __fallback_saxpy(N, alpha, X, incX, Y, incY); + } +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + if (TStorageOrder) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); + } else { + nntrainer::neon::custom_hgemm(A, B, C, M, N, K, alpha, beta, TransA, + TransB); + } +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY) { + if (TStorageOrder) { + __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, + Y, incY); + } else { + if (TransA) { + nntrainer::neon::hgemv_transpose(A, X, Y, M, N, alpha, beta); + } else { + nntrainer::neon::hgemv(A, X, Y, M, N, alpha, beta); + } + } +} + +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + if (i_stride == 1 && o_stride == 1) { + nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); + } else + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + unsigned int max_idx = 0; + if (incX == 1 && N >= 8) { + max_idx = nntrainer::neon::isamax(N, X); + } else { + max_idx = __fallback_isamax(N, X, incX); + } + return max_idx; +} + +void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + nntrainer::neon::inv_sqrt_inplace(N, X); +} + +void transpose_matrix(const unsigned int M, const unsigned int N, + const __fp16 *src, unsigned int ld_src, __fp16 *dst, + unsigned int ld_dst) { + nntrainer::neon::transpose_matrix(M, N, src, ld_src, dst, ld_dst); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/arm/armv7_neon.h b/nntrainer/tensor/cpu_backend/arm/armv7_neon.h new file mode 100644 index 0000000000..8b47e0525d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/armv7_neon.h @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: 
Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file armv7_neon.h + * @date 16 August 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Conditional header file to support unsupported intrinsics on armv7l + * + */ + +#include +#include + +/** + * @brief macro for vfmaq_n_f32 + * + */ +#define vfmaq_n_f32(a, b, n) vaddq_f32(a, vmulq_f32(b, vmovq_n_f32(n))) + +/** + * @brief vdivq_f32 fallback function + * + * @param a a for a / b + * @param b b for a / b + * @return float32x4_t + */ +static inline float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) { + float32x4_t ret; + for (unsigned int i = 0; i < 4; ++i) { + ret[i] = a[i] / b[i]; + } + return ret; +} + +/** + * @brief vsqrtq_f32 fallback function + * + * @param a input vector + * @return float32x4_t + */ +static inline float32x4_t vsqrtq_f32(float32x4_t a) { + float32x4_t ret; + for (unsigned int i = 0; i < 4; ++i) { + ret[i] = std::sqrt(a[i]); + } + return ret; +} + +/** + * @brief vmaxvq_f32 fallback function + * + * @param a input vector + * @return float + */ +static inline float vmaxvq_f32(float32x4_t a) { + float ret = a[0]; + for (unsigned int i = 1; i < 4; ++i) { + if (a[i] > ret) + ret = a[i]; + } + return ret; +} + +/** + * @brief vaddvq_f32 + * + * @param a input vector + * @return float32_t + */ +static inline float32_t vaddvq_f32(float32x4_t a) { + float32_t ret = a[0]; + for (unsigned int i = 1; i < 4; ++i) { + ret += a[i]; + } + return ret; +} + +#ifdef ENABLE_FP16 +/** + * @brief macro for vfmaq_n_f16 + * + */ +#define vfmaq_n_f16(a, b, c, n) vaddq_f16(a, vmulq_f16(b, vmovq_n_f16(c[n]))) + +/** + * @brief vmaxvq_f16 fallback function + * + * @param a input vector + * @return float16_t + */ +static inline float16_t vmaxvq_f16(float16x8_t a) { + float16_t ret = a[0]; + for (unsigned int i = 1; i < 8; ++i) { + if (a[i] > ret) + ret = a[i]; + } + return ret; +} +#endif diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.cpp diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm.h diff --git a/nntrainer/tensor/hgemm/hgemm_common.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_common.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_common.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_common.h diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel.h diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp similarity index 98% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp index 713117fbb0..2ca0925132 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x4.cpp @@ -15,6 +15,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif template <> void hgemm_kernel_1x4(unsigned int M, unsigned int N, unsigned int K, diff 
--git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp index 3935187065..e1a724b41c 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_1x8.cpp @@ -17,6 +17,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif // 1. Partial sum 64 digits : worst accuracy, best latency #define KERNEL_1x8_ACC8() \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp index f446d32b5a..57e04675f8 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x4.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_4x4() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp index 118e99d9db..1c1d26f270 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_4x8.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_4X8() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp index 3ec2b0306d..cca0d4d3a9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8X16() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp index ebd75fecb4..6b577469be 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x16_experimental.cpp @@ -17,6 +17,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8X16() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp index b072c3255e..5fdf52e83a 100644 --- 
a/nntrainer/tensor/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/hgemm_kernel_8x8.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif #define INIT_KERNEL_8x8() \ do { \ diff --git a/nntrainer/tensor/hgemm/hgemm_kernel/meson.build b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_kernel/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_kernel/meson.build diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp similarity index 99% rename from nntrainer/tensor/hgemm/hgemm_noTrans.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp index b1497329dd..e6a562e045 100644 --- a/nntrainer/tensor/hgemm/hgemm_noTrans.cpp +++ b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.cpp @@ -20,6 +20,9 @@ #include #include #include +#ifdef ARMV7 +#include +#endif void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { diff --git a/nntrainer/tensor/hgemm/hgemm_noTrans.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_noTrans.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_noTrans.h diff --git a/nntrainer/tensor/hgemm/hgemm_pack.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_pack.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_pack.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_pack.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_pack.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_a.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_a.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/hgemm_padding_b.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/hgemm_padding_b.h diff --git a/nntrainer/tensor/hgemm/hgemm_padding/meson.build 
b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_padding/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_padding/meson.build diff --git a/nntrainer/tensor/hgemm/hgemm_transA.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transA.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transA.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transA.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transA.h diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transAB.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transAB.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transAB.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transAB.h diff --git a/nntrainer/tensor/hgemm/hgemm_transB.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transB.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_transB.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_transB.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_transB.h diff --git a/nntrainer/tensor/hgemm/hgemm_util.cpp b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.cpp similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_util.cpp rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.cpp diff --git a/nntrainer/tensor/hgemm/hgemm_util.h b/nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.h similarity index 100% rename from nntrainer/tensor/hgemm/hgemm_util.h rename to nntrainer/tensor/cpu_backend/arm/hgemm/hgemm_util.h diff --git a/nntrainer/tensor/hgemm/meson.build b/nntrainer/tensor/cpu_backend/arm/hgemm/meson.build similarity index 100% rename from nntrainer/tensor/hgemm/meson.build rename to nntrainer/tensor/cpu_backend/arm/hgemm/meson.build diff --git a/nntrainer/tensor/matrix_transpose_neon/mask_neon.h b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/mask_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/mask_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/mask_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_kernels_neon.h b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_kernels_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_kernels_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_kernels_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.cpp b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.cpp similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.cpp rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.cpp diff --git a/nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.h 
b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.h similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.h rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/matrix_transpose_neon.h diff --git a/nntrainer/tensor/matrix_transpose_neon/meson.build b/nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/meson.build similarity index 100% rename from nntrainer/tensor/matrix_transpose_neon/meson.build rename to nntrainer/tensor/cpu_backend/arm/matrix_transpose_neon/meson.build diff --git a/nntrainer/tensor/cpu_backend/arm/meson.build b/nntrainer/tensor/cpu_backend/arm/meson.build new file mode 100644 index 0000000000..0be7034abe --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/meson.build @@ -0,0 +1,37 @@ +arm_compute_backend_headers = [ + 'arm_compute_backend.h', + 'neon_impl.h', + 'neon_setting.h', + 'neon_mathfun.h', + 'neon_mathfun.hxx' +] +arm_compute_backend_sources = [ + 'arm_compute_backend.cpp', + 'neon_impl.cpp' +] + +if get_option('enable-fp16') + arm_compute_backend_sources += 'arm_compute_backend_fp16.cpp' + arm_compute_backend_sources += 'neon_impl_fp16.cpp' + + subdir('hgemm') + nntrainer_inc += include_directories('hgemm') + nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' + + subdir('matrix_transpose_neon') + nntrainer_inc += include_directories('matrix_transpose_neon') + nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon' +endif + +arch = host_machine.cpu_family() +if arch == 'arm' + arm_compute_backend_headers += 'armv7_neon.h' +endif + +foreach s : arm_compute_backend_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : arm_compute_backend_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp b/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp new file mode 100644 index 0000000000..86edaddde1 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl.cpp @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file neon_impl.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Single-precision computation functions based on NEON + * + */ + +#include +#include +#include +#include +#ifdef ARMV7 +#include +#endif + +namespace nntrainer::neon { + +void sgemv(const float *A, const float *X, float *Y, uint32_t rows, + uint32_t cols, float alpha, float beta) { + const float *__restrict x; + + for (unsigned int i = 0; i < rows; ++i) { + Y[i] = Y[i] * beta; + } + + float32x4_t v_alpha = vmovq_n_f32(alpha); + + if (cols % 16 == 0) { + for (unsigned i = 0; i < cols; i += 16) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t x4_7 = vld1q_f32(&X[i + 4]); + float32x4_t x8_11 = vld1q_f32(&X[i + 8]); + float32x4_t x12_15 = vld1q_f32(&X[i + 12]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + x4_7 = vmulq_f32(x4_7, v_alpha); + x8_11 = vmulq_f32(x8_11, v_alpha); + x12_15 = vmulq_f32(x12_15, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + wvec8_11 = vld1q_f32(&w[8]); + wvec12_15 = vld1q_f32(&w[12]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + y0 = 
vmlaq_f32(y0, wvec4_7, x4_7); + y0 = vmlaq_f32(y0, wvec8_11, x8_11); + y0 = vmlaq_f32(y0, wvec12_15, x12_15); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + + } else if (cols % 8 == 0) { + for (unsigned i = 0; i < cols; i += 8) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t x4_7 = vld1q_f32(&X[i + 4]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + x4_7 = vmulq_f32(x4_7, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + y0 = vmlaq_f32(y0, wvec4_7, x4_7); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + } else if (cols % 4 == 0) { + for (unsigned i = 0; i < cols; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + + if (alpha != 1.0) { + x0_3 = vmulq_f32(x0_3, v_alpha); + } + + float32x4_t wvec0_3, wvec4_7; + + const float *__restrict w; + + float32x4_t y0; + + for (unsigned int j = 0; j < rows; ++j) { + w = &A[j * cols + i]; + y0 = vmovq_n_f32(0); + + float r[4]; + wvec0_3 = vld1q_f32(&w[0]); + + y0 = vmlaq_f32(y0, wvec0_3, x0_3); + + vst1q_f32(r, y0); + for (unsigned int k = 0; k < 4; ++k) { + Y[j] = Y[j] + r[k]; + } + } + } + } +} + +void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t rows, + uint32_t cols, float alpha, float beta) { + const float *__restrict x; + + const float32x4_t v_beta = vdupq_n_f32(beta); + const float32x4_t v_alpha = vdupq_n_f32(alpha); + + if (cols % 16 == 0) { + unsigned int n = cols / 16; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + unsigned int step; + for (unsigned int i = 0; i < cols / 16; ++i) { + initialized[i] = false; + } + + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 16) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + step = j / 16; + if (!initialized[step]) { + y0_3 = vmulq_f32(y0_3, v_beta); + y4_7 = vmulq_f32(y4_7, v_beta); + y8_11 = vmulq_f32(y8_11, v_beta); + y12_15 = vmulq_f32(y12_15, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + wvec8_11 = vld1q_f32(&w[8]); + wvec12_15 = vld1q_f32(&w[12]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + y4_7 = vmlaq_f32(y4_7, wvec4_7, x); + y8_11 = vmlaq_f32(y8_11, wvec8_11, x); + y12_15 = vmlaq_f32(y12_15, wvec12_15, x); + + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } + } + free(initialized); + return; + } else if (cols % 8 == 0) { + unsigned int n = cols / 8; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + unsigned int step; + for (unsigned int i = 0; i < cols / 8; ++i) { + initialized[i] = false; + } + + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 8) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + + step = j / 8; + if (!initialized[step]) 
{ + y0_3 = vmulq_f32(y0_3, v_beta); + y4_7 = vmulq_f32(y4_7, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3, wvec4_7; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + wvec4_7 = vld1q_f32(&w[4]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + y4_7 = vmlaq_f32(y4_7, wvec4_7, x); + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + } + } + free(initialized); + return; + } else if (cols % 4 == 0) { + unsigned int n = cols / 4; + bool *initialized = (bool *)malloc(sizeof(bool) * n); + + unsigned int step; + for (unsigned int i = 0; i < cols / 4; ++i) { + initialized[i] = false; + } + for (unsigned int i = 0; i < rows; ++i) { + float32x4_t x = vld1q_dup_f32(&X[i]); + x = vmulq_f32(x, v_alpha); + + for (unsigned int j = 0; j < cols; j += 4) { + float *__restrict y = &Y[j]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + step = j / 4; + if (!initialized[step]) { + y0_3 = vmulq_f32(y0_3, v_beta); + initialized[step] = true; + } + + float32x4_t wvec0_3; + const float *__restrict w; + + w = &A[i * cols + j]; + + wvec0_3 = vld1q_f32(&w[0]); + + y0_3 = vmlaq_f32(y0_3, wvec0_3, x); + vst1q_f32(&y[0], y0_3); + } + } + free(initialized); + } + + return; +} + +void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { + + unsigned int idx = 0; + + // keep in mind that : len(X) = N, and len(Y) = 2*N + + // processing batch of 16 + float32x4_t y0, y1, y2, y3; + float32x4_t y4, y5, y6, y7; + + uint8_t low0, low1, high0, high1; + + for (; (N - idx) >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + + uint8x8_t low = vget_low_u8(batch); + uint8x8_t high = vget_high_u8(batch); + unsigned int i = 0; + for (; i < 8; ++i) { + low0 = low[i] >> 4; + low1 = low[i] & 0x0f; + + high0 = high[i] >> 4; + high1 = high[i] & 0x0f; + + // 0 ~ 8 + if (i < 2) { + y0[2 * i] = low0; + y0[2 * i + 1] = low1; + } else if (i < 4) { + y1[2 * (i - 2)] = low0; + y1[2 * (i - 2) + 1] = low1; + } else if (i < 6) { + y2[2 * (i - 4)] = low0; + y2[2 * (i - 4) + 1] = low1; + } else { + y3[2 * (i - 6)] = low0; + y3[2 * (i - 6) + 1] = low1; + } + + // 8 ~ 16 + if (i < 2) { + y4[2 * i] = high0; + y4[2 * i + 1] = high1; + } else if (i < 4) { + y5[2 * (i - 2)] = high0; + y5[2 * (i - 2) + 1] = high1; + } else if (i < 6) { + y6[2 * (i - 4)] = high0; + y6[2 * (i - 4) + 1] = high1; + } else { + y7[2 * (i - 6)] = high0; + y7[2 * (i - 6) + 1] = high1; + } + } + vst1q_f32(&Y[2 * idx], y0); + vst1q_f32(&Y[2 * idx + 4], y1); + vst1q_f32(&Y[2 * idx + 8], y2); + vst1q_f32(&Y[2 * idx + 12], y3); + vst1q_f32(&Y[2 * idx + 16], y4); + vst1q_f32(&Y[2 * idx + 20], y5); + vst1q_f32(&Y[2 * idx + 24], y6); + vst1q_f32(&Y[2 * idx + 28], y7); + } + + // processing remaining batch of 8 + for (; (N - idx) >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + + unsigned int i = 0; + for (; i < 8; ++i) { + low0 = batch[i] >> 4; + low1 = batch[i] & 0x0f; + + if (i < 2) { + y0[2 * i] = low0; + y0[2 * i + 1] = low1; + } else if (i < 4) { + y1[2 * (i - 2)] = low0; + y1[2 * (i - 2) + 1] = low1; + } else if (i < 6) { + y2[2 * (i - 4)] = low0; + y2[2 * (i - 4) + 1] = low1; + } else { + y3[2 * (i - 6)] = low0; + y3[2 * (i - 6) + 1] = low1; + } + } + + vst1q_f32(&Y[2 * idx], y0); + vst1q_f32(&Y[2 * idx + 4], y1); + vst1q_f32(&Y[2 * idx + 8], y2); + vst1q_f32(&Y[2 * idx + 12], y3); + } + + // processing remaining values + for (; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) { + ///@note 
int8 Tensor and int4 Tensor share the same memory offset + unsigned int idx = 0; + for (; N - idx >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + vst1q_u8(&Y[idx], batch); + } + for (; N - idx >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + vst1_u8(&Y[idx], batch); + } + for (; N - idx >= 1; ++idx) { + Y[idx] = X[idx]; + } +} + +void sine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + if (alpha != 1.0) + x0_3 = vmulq_n_f32(x0_3, alpha); + float32x4_t sinx0_3 = sin_ps(x0_3); + vst1q_f32(&Y[i], sinx0_3); + } + while (i < N) { + Y[i] = std::sin(alpha * X[i]); + ++i; + } +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + if (alpha != 1.0) + x0_3 = vmulq_n_f32(x0_3, alpha); + float32x4_t cosx0_3 = cos_ps(x0_3); + vst1q_f32(&Y[i], cosx0_3); + } + while (i < N) { + Y[i] = std::cos(alpha * X[i]); + ++i; + } +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + unsigned int i = 0; + for (; N - i >= 4; i += 4) { + float32x4_t x0_7 = vld1q_f32(&X[i]); + float32x4_t x0_7_sqrt = vsqrtq_f32(x0_7); + float32x4_t ones = vmovq_n_f32(1); + float32x4_t x0_7_sqrt_div = vdivq_f32(ones, x0_7_sqrt); + vst1q_f32(&X[i], x0_7_sqrt_div); + } + while (i < N) { + X[i] = (1 / std::sqrt(static_cast<float>(X[i]))); + ++i; + } +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; + else + Z[i] = alpha * X[i] * Y[i]; + ++i; + } +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] + alpha * Y[i]; + ++i; + } +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], 
vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; + else + Z[i] = X[i] - alpha * Y[i]; + ++i; + } +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta) { + unsigned int i = 0; + float32x4_t alpha_vec = vdupq_n_f32(alpha); + float32x4_t beta_vec = vdupq_n_f32(beta); + for (; N - i >= 4; i += 4) { + float32x4_t x0_3 = vld1q_f32(&X[i]); + float32x4_t y0_3 = vld1q_f32(&Y[i]); + if (alpha != 1.f) { + y0_3 = vmulq_f32(y0_3, alpha_vec); + } + float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); + if (std::abs(beta) > __FLT_MIN__) { + float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); + vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); + } else + vst1q_f32(&Z[i], xy0_3); + } + while (i < N) { + if (std::abs(beta) > __FLT_MIN__) + Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; + else + Z[i] = X[i] / (alpha * Y[i]); + ++i; + } +} + +void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { + unsigned int idx = 0; + for (; (N - idx) >= 16; idx += 16) { + uint8x16_t batch = vld1q_u8(&X[idx]); + uint8x8_t low = vget_low_u8(batch); + uint8x8_t high = vget_high_u8(batch); + + // convert to u16 + uint16x8_t batch_low_u16 = vmovl_u8(low); + uint16x8_t batch_high_u16 = vmovl_u8(high); + + // convert to u32 + uint32x4_t batch_low_u32_low = vmovl_u16(vget_low_u16(batch_low_u16)); + uint32x4_t batch_low_u32_high = vmovl_u16(vget_high_u16(batch_low_u16)); + uint32x4_t batch_high_u32_low = vmovl_u16(vget_low_u16(batch_high_u16)); + uint32x4_t batch_high_u32_high = vmovl_u16(vget_high_u16(batch_high_u16)); + + // todo : experiment with vcvt_f32_u32_ bitwise operation w.r.t. + // time/accuracy + vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_low_u32_low)); + vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_low_u32_high)); + vst1q_f32(&Y[idx + 8], vcvtq_f32_u32(batch_high_u32_low)); + vst1q_f32(&Y[idx + 12], vcvtq_f32_u32(batch_high_u32_high)); + } + for (; (N - idx) >= 8; idx += 8) { + uint8x8_t batch = vld1_u8(&X[idx]); + + // convert to u16 + uint16x8_t batch_u16 = vmovl_u8(batch); + + // convert to u32 + uint32x4_t batch_u32_low = vmovl_u16(vget_low_u16(batch_u16)); + uint32x4_t batch_u32_high = vmovl_u16(vget_high_u16(batch_u16)); + + vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_u32_low)); + vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_u32_high)); + } + for (; (N - idx) >= 1; ++idx) { + Y[idx] = X[idx]; + } +} + +} // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/cpu_backend/arm/neon_impl.h similarity index 93% rename from nntrainer/tensor/blas_neon.h rename to nntrainer/tensor/cpu_backend/arm/neon_impl.h index 81f8c060ed..8261c3d2e1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl.h @@ -1,154 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2022 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_neon.h - * @date 4 Aug 2022 + * @file neon_impl.h + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for NYI items - * @brief This is header for blas neon implementation + * @brief Single-precision computation functions based on NEON * */ -#ifndef __BLAS_NEON_H_ -#define __BLAS_NEON_H_ +#ifndef __NEON_IMPL_H_ +#define __NEON_IMPL_H_ #ifdef __cplusplus #include #include #include +#include namespace nntrainer::neon { - -/** - * @brief sgemv computation with neon : 
Y = alpha*A*X + beta*Y - * @param[in] A float * for Matrix A - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] M number of A's row - * @param[in] N number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv(const float *A, const float *X, float *Y, uint32_t M, uint32_t N, - const float alpha, const float beta); - -/** - * @brief transposed sgemv computation with neon - * Y = alpha*transpose(A)*X - * + beta*Y - * @param[in] A float * for Matrix A - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] M number of A's row - * @param[in] N number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t M, - uint32_t N, float alpha, float beta); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y); - -/** - * @brief copy function with neon: Y = X - * @param[in] N number of elements in X - * @param[in] X uint8_t * for Vector X - * @param[in] Y uint8_t * for Vector Y - */ -void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); -/** - * @brief sine with neon: Y = sin(alpha * X) - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] alpha float * for scaling angle (radian) - */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); - -/** - * @brief cosine with neon: Y = cos(alpha * X) - * @param[in] N number of elements in X - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] alpha float * for scaling angle (radian) - */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); - -/** - * @brief inversed squared root transformation with neon : X = 1 / sqrt(X) - * - * @param N number of elements in X - * @param X float * for Vector X - */ -void inv_sqrt_inplace(const unsigned int N, float *X); - -/** - * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - -/** - * @brief elementwise vector addition : Z = X + alpha * Y + beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); -/** - * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + - * beta * Z - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * 
for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - -/** - * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta - * * Z - * @note ZeroDivisionError is not guaranteed in this function - * @param[in] N length of the vector - * @param[in] X float * for Vector X - * @param[in] Y float * for Vector Y - * @param[in] Z float * for Vector Z - * @param[in] alpha scalar multiplier for input - * @param[in] beta scalar multiplier for output - */ -void ele_div(const unsigned N, const float *X, const float *Y, float *Z, - float alpha = 1.f, float beta = 0.f); - #ifdef ENABLE_FP16 + /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y * @param[in] A __fp16 * for Matrix A @@ -330,7 +204,6 @@ unsigned int isamax(const unsigned int N, const __fp16 *X); void custom_hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N, uint32_t K, float alpha, float beta, bool TransA, bool TransB); - /** * @brief squared root transformation with neon : X = sqrt(X) * * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const __fp16 *src, unsigned int ld_src, __fp16 *dst, + unsigned int ld_dst); #endif +/** + * @brief sgemv computation with neon : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] M number of A's rows + * @param[in] N number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const float *A, const float *X, float *Y, uint32_t M, uint32_t N, + const float alpha, const float beta); + +/** + * @brief transposed sgemv computation with neon + * Y = alpha*transpose(A)*X + * + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] M number of A's rows + * @param[in] N number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t M, + uint32_t N, float alpha, float beta); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y); + +/** + * @brief copy function with neon: Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); 
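// Usage sketch (illustrative only; not part of the original patch): per the
// implementation in neon_impl.cpp, copy_int4_to_fp32 unpacks each packed
// byte into two floats, high nibble first, so an N-byte int4 buffer expands
// into 2 * N floats. The function name below is hypothetical, and `unpacked`
// is assumed to have room for 2 * n_bytes floats.
inline void example_unpack_int4(const uint8_t *packed, float *unpacked,
                                unsigned int n_bytes) {
  // e.g. a byte 0x2A yields 2.0f (high nibble) then 10.0f (low nibble)
  copy_int4_to_fp32(n_bytes, packed, unpacked);
}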
+/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief inversed squared root transformation with neon : X = 1 / sqrt(X) + * + * @param N number of elements in X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); + +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f); + } // namespace nntrainer::neon #endif /* __cplusplus */ -#endif /* __BLAS_NEON_H__ */ +#endif /* __NEON_IMPL_H_ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp similarity index 69% rename from nntrainer/tensor/blas_neon.cpp rename to nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp index 4b6c05c72e..60792b3316 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/cpu_backend/arm/neon_impl_fp16.cpp @@ -1,553 +1,27 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2022 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_neon.cpp - * @date 4 Aug 2022 + * @file neon_impl_fp16.cpp + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for
NYI items - * @brief This is Source for blas neon implementation + * @brief Half-precision computation functions based on NEON * */ -#include -#include #include +#include #include -#include +#include +#include +#ifdef ARMV7 +#include +#endif namespace nntrainer::neon { -void sgemv(const float *A, const float *X, float *Y, uint32_t rows, - uint32_t cols, float alpha, float beta) { - const float *__restrict x; - - for (unsigned int i = 0; i < rows; ++i) { - Y[i] = Y[i] * beta; - } - - float32x4_t v_alpha = vmovq_n_f32(alpha); - - if (cols % 16 == 0) { - for (unsigned i = 0; i < cols; i += 16) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t x4_7 = vld1q_f32(&X[i + 4]); - float32x4_t x8_11 = vld1q_f32(&X[i + 8]); - float32x4_t x12_15 = vld1q_f32(&X[i + 12]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - x4_7 = vmulq_f32(x4_7, v_alpha); - x8_11 = vmulq_f32(x8_11, v_alpha); - x12_15 = vmulq_f32(x12_15, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - wvec8_11 = vld1q_f32(&w[8]); - wvec12_15 = vld1q_f32(&w[12]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - y0 = vmlaq_f32(y0, wvec4_7, x4_7); - y0 = vmlaq_f32(y0, wvec8_11, x8_11); - y0 = vmlaq_f32(y0, wvec12_15, x12_15); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - - } else if (cols % 8 == 0) { - for (unsigned i = 0; i < cols; i += 8) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t x4_7 = vld1q_f32(&X[i + 4]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - x4_7 = vmulq_f32(x4_7, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - y0 = vmlaq_f32(y0, wvec4_7, x4_7); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - } else if (cols % 4 == 0) { - for (unsigned i = 0; i < cols; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - - if (alpha != 1.0) { - x0_3 = vmulq_f32(x0_3, v_alpha); - } - - float32x4_t wvec0_3, wvec4_7; - - const float *__restrict w; - - float32x4_t y0; - - for (unsigned int j = 0; j < rows; ++j) { - w = &A[j * cols + i]; - y0 = vmovq_n_f32(0); - - float r[4]; - wvec0_3 = vld1q_f32(&w[0]); - - y0 = vmlaq_f32(y0, wvec0_3, x0_3); - - vst1q_f32(r, y0); - for (unsigned int k = 0; k < 4; ++k) { - Y[j] = Y[j] + r[k]; - } - } - } - } -} - -void sgemv_transpose(const float *A, const float *X, float *Y, uint32_t rows, - uint32_t cols, float alpha, float beta) { - const float *__restrict x; - - const float32x4_t v_beta = vdupq_n_f32(beta); - const float32x4_t v_alpha = vdupq_n_f32(alpha); - - if (cols % 16 == 0) { - unsigned int n = cols / 16; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - unsigned int step; - for (unsigned int i = 0; i < cols / 16; ++i) { - initialized[i] = false; - } - - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 16) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - float32x4_t y4_7 = vld1q_f32(&y[4]); - float32x4_t y8_11 = vld1q_f32(&y[8]); - 
float32x4_t y12_15 = vld1q_f32(&y[12]); - step = j / 16; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - y4_7 = vmulq_f32(y4_7, v_beta); - y8_11 = vmulq_f32(y8_11, v_beta); - y12_15 = vmulq_f32(y12_15, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - wvec8_11 = vld1q_f32(&w[8]); - wvec12_15 = vld1q_f32(&w[12]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - y4_7 = vmlaq_f32(y4_7, wvec4_7, x); - y8_11 = vmlaq_f32(y8_11, wvec8_11, x); - y12_15 = vmlaq_f32(y12_15, wvec12_15, x); - - vst1q_f32(&y[0], y0_3); - vst1q_f32(&y[4], y4_7); - vst1q_f32(&y[8], y8_11); - vst1q_f32(&y[12], y12_15); - } - } - free(initialized); - return; - } else if (cols % 8 == 0) { - unsigned int n = cols / 8; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - unsigned int step; - for (unsigned int i = 0; i < cols / 8; ++i) { - initialized[i] = false; - } - - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 8) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - float32x4_t y4_7 = vld1q_f32(&y[4]); - - step = j / 8; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - y4_7 = vmulq_f32(y4_7, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3, wvec4_7; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - wvec4_7 = vld1q_f32(&w[4]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - y4_7 = vmlaq_f32(y4_7, wvec4_7, x); - vst1q_f32(&y[0], y0_3); - vst1q_f32(&y[4], y4_7); - } - } - free(initialized); - return; - } else if (cols % 4 == 0) { - unsigned int n = cols / 4; - bool *initialized = (bool *)malloc(sizeof(bool) * n); - - unsigned int step; - for (unsigned int i = 0; i < cols / 4; ++i) { - initialized[i] = false; - } - for (unsigned int i = 0; i < rows; ++i) { - float32x4_t x = vld1q_dup_f32(&X[i]); - x = vmulq_f32(x, v_alpha); - - for (unsigned int j = 0; j < cols; j += 4) { - float *__restrict y = &Y[j]; - - float32x4_t y0_3 = vld1q_f32(&y[0]); - step = j / 4; - if (!initialized[step]) { - y0_3 = vmulq_f32(y0_3, v_beta); - initialized[step] = true; - } - - float32x4_t wvec0_3; - const float *__restrict w; - - w = &A[i * cols + j]; - - wvec0_3 = vld1q_f32(&w[0]); - - y0_3 = vmlaq_f32(y0_3, wvec0_3, x); - vst1q_f32(&y[0], y0_3); - } - } - free(initialized); - } - - return; -} - -void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { - - unsigned int idx = 0; - - // keep in mind that : len(X) = N, and len(Y) = 2*N - - // processing batch of 16 - float32x4_t y0, y1, y2, y3; - float32x4_t y4, y5, y6, y7; - - uint8_t low0, low1, high0, high1; - - for (; (N - idx) >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - - uint8x8_t low = vget_low_u8(batch); - uint8x8_t high = vget_high_u8(batch); - unsigned int i = 0; - for (; i < 8; ++i) { - low0 = low[i] >> 4; - low1 = low[i] & 0x0f; - - high0 = high[i] >> 4; - high1 = high[i] & 0x0f; - - // 0 ~ 8 - if (i < 2) { - y0[2 * i] = low0; - y0[2 * i + 1] = low1; - } else if (i < 4) { - y1[2 * (i - 2)] = low0; - y1[2 * (i - 2) + 1] = low1; - } else if (i < 6) { - y2[2 * (i - 4)] = low0; - y2[2 * (i - 4) + 1] = low1; - } else { - y3[2 * (i - 6)] = low0; - y3[2 * (i - 6) + 1] = low1; - } - - // 8 ~ 16 - if (i < 2) { - y4[2 * i] = high0; - y4[2 * i + 1] = high1; - } else if (i < 4) { - y5[2 * (i - 2)] 
= high0; - y5[2 * (i - 2) + 1] = high1; - } else if (i < 6) { - y6[2 * (i - 4)] = high0; - y6[2 * (i - 4) + 1] = high1; - } else { - y7[2 * (i - 6)] = high0; - y7[2 * (i - 6) + 1] = high1; - } - } - vst1q_f32(&Y[2 * idx], y0); - vst1q_f32(&Y[2 * idx + 4], y1); - vst1q_f32(&Y[2 * idx + 8], y2); - vst1q_f32(&Y[2 * idx + 12], y3); - vst1q_f32(&Y[2 * idx + 16], y4); - vst1q_f32(&Y[2 * idx + 20], y5); - vst1q_f32(&Y[2 * idx + 24], y6); - vst1q_f32(&Y[2 * idx + 28], y7); - } - - // processing remaining batch of 8 - for (; (N - idx) >= 8; idx += 8) { - uint8x8_t batch = vld1_u8(&X[idx]); - - unsigned int i = 0; - for (; i < 8; ++i) { - low0 = batch[i] >> 4; - low1 = batch[i] & 0x0f; - - if (i < 2) { - y0[2 * i] = low0; - y0[2 * i + 1] = low1; - } else if (i < 4) { - y1[2 * (i - 2)] = low0; - y1[2 * (i - 2) + 1] = low1; - } else if (i < 6) { - y2[2 * (i - 4)] = low0; - y2[2 * (i - 4) + 1] = low1; - } else { - y3[2 * (i - 6)] = low0; - y3[2 * (i - 6) + 1] = low1; - } - } - - vst1q_f32(&Y[2 * idx], y0); - vst1q_f32(&Y[2 * idx + 4], y1); - vst1q_f32(&Y[2 * idx + 8], y2); - vst1q_f32(&Y[2 * idx + 12], y3); - } - - // pocessing remaining values - for (; idx < N; idx++) { - Y[2 * idx] = X[idx] >> 4; - Y[2 * idx + 1] = X[idx] & 0x0f; - } -} - -void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) { - ///@note int8 Tensor and int4 Tensor share the same memory offset - unsigned int idx = 0; - for (; N - idx >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - vst1q_u8(&Y[idx], batch); - } - for (; N - idx >= 8; idx += 8) { - uint8x8_t batch = vld1_u8(&X[idx]); - vst1_u8(&Y[idx], batch); - } - for (; N - idx >= 1; ++idx) { - Y[idx] = X[idx]; - } -} - -void sine(const unsigned int N, float *X, float *Y, float alpha) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - if (alpha != 1.0) - x0_3 = vmulq_n_f32(x0_3, alpha); - float32x4_t sinx0_3 = sin_ps(x0_3); - vst1q_f32(&Y[i], sinx0_3); - } - while (i < N) { - Y[i] = std::sin(alpha * X[i]); - ++i; - } -} - -void cosine(const unsigned int N, float *X, float *Y, float alpha) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - if (alpha != 1.0) - x0_3 = vmulq_n_f32(x0_3, alpha); - float32x4_t cosx0_3 = cos_ps(x0_3); - vst1q_f32(&Y[i], cosx0_3); - } - while (i < N) { - Y[i] = std::cos(alpha * X[i]); - ++i; - } -} - -void inv_sqrt_inplace(const unsigned int N, float *X) { - unsigned int i = 0; - for (; N - i >= 4; i += 4) { - float32x4_t x0_7 = vld1q_f32(&X[i]); - float32x4_t x0_7_sqrt = vsqrtq_f32(x0_7); - float32x4_t ones = vmovq_n_f32(1); - float32x4_t x0_7_sqrt_div = vdivq_f32(ones, x0_7_sqrt); - vst1q_f32(&X[i], x0_7_sqrt_div); - } - while (i < N) { - X[i] = (1 / std::sqrt(static_cast(X[i]))); - ++i; - } -} - -void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; - else - Z[i] = alpha * X[i] * 
Y[i]; - ++i; - } -} - -void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; - else - Z[i] = X[i] + alpha * Y[i]; - ++i; - } -} - -void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; - else - Z[i] = X[i] - alpha * Y[i]; - ++i; - } -} - -void ele_div(const unsigned N, const float *X, const float *Y, float *Z, - float alpha, float beta) { - unsigned int i = 0; - float32x4_t alpha_vec = vdupq_n_f32(alpha); - float32x4_t beta_vec = vdupq_n_f32(beta); - for (; N - i >= 4; i += 4) { - float32x4_t x0_3 = vld1q_f32(&X[i]); - float32x4_t y0_3 = vld1q_f32(&Y[i]); - if (alpha != 1.f) { - y0_3 = vmulq_f32(y0_3, alpha_vec); - } - float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); - if (std::abs(beta) > __FLT_MIN__) { - float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); - vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); - } else - vst1q_f32(&Z[i], xy0_3); - } - while (i < N) { - if (std::abs(beta) > __FLT_MIN__) - Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; - else - Z[i] = X[i] / (alpha * Y[i]); - ++i; - } -} - -#ifdef ENABLE_FP16 - void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, float alpha, float beta) { const unsigned int batch = 0; @@ -1405,48 +879,6 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) { } } -void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) { - unsigned int idx = 0; - for (; (N - idx) >= 16; idx += 16) { - uint8x16_t batch = vld1q_u8(&X[idx]); - uint8x8_t low = vget_low_u8(batch); - uint8x8_t high = vget_high_u8(batch); - - // convert to u16 - uint16x8_t batch_low_u16 = vmovl_u8(low); - uint16x8_t batch_high_u16 = vmovl_u8(high); - - // convert to u32 - uint32x4_t batch_low_u32_low = vmovl_u16(vget_low_u16(batch_low_u16)); - uint32x4_t batch_low_u32_high = vmovl_u16(vget_high_u16(batch_low_u16)); - uint32x4_t batch_high_u32_low = vmovl_u16(vget_low_u16(batch_high_u16)); - uint32x4_t batch_high_u32_high = vmovl_u16(vget_high_u16(batch_high_u16)); - - // todo : experiment with vcvt_f32_u32_ bitwise operation w.r.t. 
- // time/accuracy - vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_low_u32_low)); - vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_low_u32_high)); - vst1q_f32(&Y[idx + 8], vcvtq_f32_u32(batch_high_u32_low)); - vst1q_f32(&Y[idx + 12], vcvtq_f32_u32(batch_high_u32_high)); - } - for (; (N - idx) >= 8; idx += 8) { - uint8x8_t batch = vld1_u8(&X[idx]); - - // convert to u16 - uint16x8_t batch_u16 = vmovl_u8(batch); - - // convert to u32 - uint32x4_t batch_u32_low = vmovl_u16(vget_low_u16(batch_u16)); - uint32x4_t batch_u32_high = vmovl_u16(vget_high_u16(batch_u16)); - - vst1q_f32(&Y[idx], vcvtq_f32_u32(batch_u32_low)); - vst1q_f32(&Y[idx + 4], vcvtq_f32_u32(batch_u32_high)); - } - for (; (N - idx) >= 1; ++idx) { - Y[idx] = X[idx]; - } -} - void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) { unsigned int idx = 0; @@ -1719,5 +1151,9 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } -#endif +void transpose_matrix(const unsigned int M, const unsigned int N, + const __fp16 *src, unsigned int ld_src, __fp16 *dst, + unsigned int ld_dst) { + transpose_neon<__fp16>(M, N, src, ld_src, dst, ld_dst); +} } // namespace nntrainer::neon diff --git a/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h new file mode 100644 index 0000000000..3023560b1e --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.h @@ -0,0 +1,85 @@ +/** + * @file neon_mathfun.h + * @date 15 Jan 2024 + * @brief This is a collection of sin, cos, exp, and log functions with NEON SIMD + * @see https://github.com/nnstreamer/nntrainer + * @author Julien Pommier + * @bug No known bugs except for NYI items + * + */ + +/** NEON implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/** Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#ifndef NEON_MATHFUN_H_ +#define NEON_MATHFUN_H_ + +#include +/** + * @brief typedef for vector register.
+ * + */ +typedef float32x4_t v4sf; // vector of 4 float + +// prototypes +/** + * @brief log function with neon x = log(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf log_ps(v4sf x); + +/** + * @brief exp function with neon x = exp(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf exp_ps(v4sf x); + +/** + * @brief sin_ps function with neon x = sin(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf sin_ps(v4sf x); + +/** + * @brief cos_ps function with neon x = cos(x) + * @param[in] x register variable (float32x4_t) + */ +inline v4sf cos_ps(v4sf x); + +/** + * @brief sincos_ps function with neon: computes s = sin(x) and c = cos(x) at once + * @param[in] x register variable (float32x4_t) + * @param[out] s sin register variable (float32x4_t) + * @param[out] c cos register variable (float32x4_t) + */ +inline void sincos_ps(v4sf x, v4sf *s, v4sf *c); + +#include "neon_mathfun.hxx" + +#endif +#endif
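For orientation, here is a minimal usage sketch of the prototypes above (an illustration rather than part of the patch; it assumes a NEON-enabled build, a length N that is a multiple of 4, and that the include path exposes neon_mathfun.h):

#include <arm_neon.h>
#include <neon_mathfun.h> // assumed include path

// Computes y[i] = exp(x[i]) four lanes at a time; N must be a multiple of 4.
void vexp(const float *x, float *y, unsigned int N) {
  for (unsigned int i = 0; i < N; i += 4) {
    v4sf in = vld1q_f32(&x[i]);   // load four floats into one NEON register
    vst1q_f32(&y[i], exp_ps(in)); // store exp() of each lane
  }
}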
diff --git a/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx new file mode 100644 index 0000000000..93e4af8b66 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/arm/neon_mathfun.hxx @@ -0,0 +1,341 @@ +/** + * @file neon_mathfun.hxx + * @date 15 Jan 2024 + * @brief This is a collection of sin, cos, exp, and log functions with NEON SIMD + * @see https://github.com/nnstreamer/nntrainer + * @author Julien Pommier + * @bug No known bugs except for NYI items + * + */ + +/* NEON implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/* Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +typedef uint32x4_t v4su; // vector of 4 uint32 +typedef int32x4_t v4si; // vector of 4 int32 + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 -1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 -1.2420140846E-1 +#define c_cephes_log_p4 +1.4249322787E-1 +#define c_cephes_log_p5 -1.6668057665E-1 +#define c_cephes_log_p6 +2.0000714765E-1 +#define c_cephes_log_p7 -2.4999993993E-1 +#define c_cephes_log_p8 +3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +/** + * @brief log function with simd + * + * @param x input register variable + * @return v4sf + */ +v4sf log_ps(v4sf x) { + v4sf one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + v4si ux = vreinterpretq_s32_f32(x); + + v4si emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + v4sf e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32( + e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + v4sf z = vmulq_f32(x, x); + + v4sf y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32( + vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 +#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +/** + * @brief exponential function with simd + * + * @param x input register variable + * @return v4sf + */ +v4sf exp_ps(v4sf x) { + v4sf
tmp, fx; + + v4sf one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, subtract 1 */ + v4su mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, + c_cephes_exp_p2, c_cephes_exp_p3, + c_cephes_exp_p4, c_cephes_exp_p5}; + v4sf y = vld1q_dup_f32(cephes_exp_p + 0); + v4sf c1 = vld1q_dup_f32(cephes_exp_p + 1); + v4sf c2 = vld1q_dup_f32(cephes_exp_p + 2); + v4sf c3 = vld1q_dup_f32(cephes_exp_p + 3); + v4sf c4 = vld1q_dup_f32(cephes_exp_p + 4); + v4sf c5 = vld1q_dup_f32(cephes_exp_p + 5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + v4sf pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 -0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 -3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Note also that when you compute sin(x), cos(x) is available at + almost no extra price so both sin_ps and cos_ps make use of + sincos_ps..
+ */ +/** + * @brief sincos function with simd: computes *ysin = sin(x) and *ycos = cos(x) + * + * @param x input register variable + * @param[out] ysin sine result register + * @param[out] ycos cosine result register + */ +void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x + v4sf xmm1, xmm2, xmm3, y; + + v4su emm2; + + v4su sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 < x <= Pi/2 * - * @file blas_neon_setting.h + * @file neon_setting.h * @date 18 Jan 2024 * @see https://github.com/nnstreamer/nntrainer * https://arxiv.org/abs/1706.03762 diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp new file mode 100644 index 0000000000..00c0462a50 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.cpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file cblas_interface.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Interface for single-precision computation functions based on CBLAS + * + */ + +#include +#include + +namespace nntrainer { +void __cblas_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + cblas_saxpy(N, alpha, X, incX, Y, incY); +} + +void __cblas_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY) { + CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; + CBLAS_ORDER order = TStorageOrder ?
CblasColMajor : CblasRowMajor; +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + cblas_sgemv(order, transA, M, N, alpha, A, lda, X, incX, beta, Y, incY); +} + +float __cblas_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + return cblas_sdot(N, X, incX, Y, incY); +} + +void __cblas_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + cblas_scopy(N, X, incX, Y, incY); +} + +void __cblas_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + cblas_sscal(N, alpha, X, incX); +} + +float __cblas_snrm2(const unsigned int N, const float *X, + const unsigned int incX) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + return cblas_snrm2(N, X, incX); +} + +void __cblas_sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + CBLAS_TRANSPOSE transA = TransA ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE transB = TransB ? CblasTrans : CblasNoTrans; + CBLAS_ORDER order = TStorageOrder ? CblasColMajor : CblasRowMajor; +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + cblas_sgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +unsigned int __cblas_isamax(const unsigned int N, const float *X, + const unsigned int incX) { +#ifdef BLAS_NUM_THREADS + openblas_set_num_threads(BLAS_NUM_THREADS); +#endif + return cblas_isamax(N, X, incX); +} +} // namespace nntrainer diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h new file mode 100644 index 0000000000..fdae5b9215 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cblas_interface/cblas_interface.h @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file cblas_interface.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Interface for single-precision computation functions based on CBLAS + * + */ + +#ifndef __CBLAS_INTERFACE_H__ +#define __CBLAS_INTERFACE_H__ +#ifdef __cplusplus + +namespace nntrainer { +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __cblas_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] TStorageOrder Row major / Col major + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void __cblas_sgemv(const unsigned int TStorageOrder, bool TransA, + const
unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY); +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float __cblas_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __cblas_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void __cblas_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float __cblas_snrm2(const unsigned int N, const float *X, + const unsigned int incX); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] TStorageOrder Row major / Col major + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __cblas_sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int __cblas_isamax(const unsigned int N, const float *X, + const unsigned int incX); +} // namespace nntrainer + +#endif +#endif diff --git a/nntrainer/tensor/cpu_backend/cblas_interface/meson.build b/nntrainer/tensor/cpu_backend/cblas_interface/meson.build new file mode 100644 index 0000000000..104bfca3e2 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cblas_interface/meson.build @@ -0,0 +1,14 @@ +cblas_interface_headers = [ + 'cblas_interface.h', +] +cblas_interface_sources = [ + 'cblas_interface.cpp', +] + +foreach s : cblas_interface_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : cblas_interface_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/cpu_backend.h b/nntrainer/tensor/cpu_backend/cpu_backend.h new file mode 100644 index 0000000000..eb50f12335 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/cpu_backend.h @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file cpu_backend.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Computational backend for CPU considering architecture dependency + * + */ + +#ifndef __CPU_BACKEND_H__ 
+#define __CPU_BACKEND_H__ +#ifdef __cplusplus + +#ifdef ARM +#include +#elif X86 +#include +#else +#include +#endif + +#include +#include + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +extern void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +extern _FP16 snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern _FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +extern void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv 
computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, const float alpha, + const _FP16 *A, const unsigned int lda, const _FP16 *X, + const unsigned int incX, const float beta, _FP16 *Y, + const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +extern unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief inverse square root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +extern void inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input
matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +extern void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +extern void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +extern float snrm2(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +extern void scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +extern void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for 
Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +extern void sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, const float alpha, + const float *A, const unsigned int lda, const float *X, + const unsigned int incX, const float beta, float *Y, + const unsigned int incY); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +extern unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine : Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +extern void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine : Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float for scaling angle (radian) + */ +extern void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief inverse square root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +extern void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_mul(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_add(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief elementwise vector division : Z = X / (alpha * Y) + beta + * * Z + * @note division by zero is not checked in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +extern void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, + unsigned int i_stride = 1, unsigned int o_stride = 1); + +#endif +#endif
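As a caller's-eye sketch of the new dispatch header (an illustration rather than part of the patch; it assumes the declarations resolve in namespace nntrainer, as they do in the fallback header below), the same call site builds against the arm, x86, or fallback implementation depending on which backend header cpu_backend.h pulled in:

#include <cpu_backend.h>
#include <vector>

// C(M x N) = 1.0 * A(M x K) * B(K x N) + 0.0 * C, row-major, no transpose.
void matmul_sketch(unsigned int M, unsigned int N, unsigned int K) {
  std::vector<float> A(M * K, 1.f), B(K * N, 1.f), C(M * N, 0.f);
  nntrainer::sgemm(0 /* row-major */, false, false, M, N, K,
                   /*alpha=*/1.f, A.data(), /*lda=*/K, B.data(), /*ldb=*/N,
                   /*beta=*/0.f, C.data(), /*ldc=*/N);
}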
diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback.cpp new file mode 100644 index 0000000000..1a06dbcd87 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Fallback interface (Raw implementations) + * + */ + +#include +#include +#include + +namespace nntrainer { + +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int4_to_float32(N, X, incX, Y, incY); +} + +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int8_to_float32(N, X, incX, Y, incY); +} + +void sine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_sine(N, X, Y, alpha); +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_cosine(N, X, Y, alpha); +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + __fallback_inv_sqrt_inplace(N, X); +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { + __fallback_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY) { + __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { + return __fallback_sdot(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned
int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __fallback_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __fallback_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __fallback_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/cpu_backend/fallback/fallback.h similarity index 74% rename from nntrainer/tensor/blas_interface.h rename to nntrainer/tensor/cpu_backend/fallback/fallback.h index b57ea3e057..268cda78d2 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/cpu_backend/fallback/fallback.h @@ -1,28 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2020 Jijoong Moon + * Copyright (C) 2024 Sungsik Kong * - * @file blas_interface.h - * @date 28 Aug 2020 + * @file fallback.h + * @date 23 April 2024 * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon * @author Sungsik Kong * @bug No known bugs except for NYI items - * @brief This is dummy header for blas support + * @brief Fallback interface (Raw implementations) * */ -#ifndef __BLAS_INTERFACE_H_ -#define __BLAS_INTERFACE_H_ +#ifndef __FALLBACK_H__ +#define __FALLBACK_H__ #ifdef __cplusplus -#ifdef USE_CUBLAS -#include -#include -#endif - #include #include + namespace nntrainer { #ifdef ENABLE_FP16 @@ -32,14 +27,15 @@ namespace nntrainer { * @param[in] X __fp16 * for Vector X * @param[in] alpha float number */ -void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX); +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); /** * @brief snrm2 computation : Euclidean norm * @param[in] N number of elements in X * @param[in] X __fp16 * for Vector X */ -_FP16 snrm2(const int N, const _FP16 *X, const int incX); +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX); /** * @brief copy function : Y = X @@ -47,8 +43,8 @@ _FP16 snrm2(const int N, const _FP16 *X, const int incX); * @param[in] X __fp16 * for Vector X * @param[in] Y __fp16 * for Vector Y */ -void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, - const int incY); +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -56,8 +52,8 @@ void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y, * @param[in] X float * for Vector X * @param[in] Y __fp16 * for Vector Y */ -void scopy(const unsigned int N, const float *X, const int incX, _FP16 *Y, - const int incY); +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -65,8 +61,8 @@ void scopy(const unsigned int N, const 
float *X, const int incX, _FP16 *Y, * @param[in] X __fp16 * for Vector X * @param[in] Y float * for Vector Y */ -void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, - const int incY); +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); /** * @brief copy function : Y = X @@ -75,7 +71,8 @@ void scopy(const unsigned int N, const _FP16 *X, const int incX, float *Y, * @param[in] Y __fp16 * for Vector Y */ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, + const unsigned int incY); /** * @brief copy function : Y = X @@ -84,7 +81,8 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, * @param[in] Y __fp16 * for Vector Y */ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, + const unsigned int incY); /** * @brief sdot computation : sum of all X * Y @@ -103,7 +101,7 @@ _FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, * @param[in] Y __fp16 * for Vector Y */ void saxpy(const unsigned int N, const float alpha, const _FP16 *X, - const int incX, _FP16 *Y, const int incY); + const unsigned int incX, _FP16 *Y, const unsigned int incY); /** * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, @@ -134,8 +132,8 @@ void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, */ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, const unsigned int N, const float alpha, const _FP16 *A, - const unsigned int lda, const _FP16 *X, const int incX, - const float beta, _FP16 *Y, const int incY); + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY); /** * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + * beta * Z @@ -205,7 +203,8 @@ void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, * @param[in] N number of elements in X * @param[in] X __fp16 * for Vector X */ -unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX); +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); /** * @brief squared root transformation inplace : X = sqrt(X) @@ -229,51 +228,36 @@ void transpose_matrix(const unsigned int M, const unsigned int N, const _FP16 *src, unsigned int ld_src, _FP16 *dst, unsigned int ld_dst); #endif -/** - * @brief sscal computation : X = alpha * X - * @param[in] N number of elements in X - * @param[in] X void * for Vector X - * @param[in] alpha float number - */ -void sscal(const unsigned int N, const float alpha, void *X, const int incX, - ml::train::TensorDim::DataType d_type); /** * @brief sscal computation : X = alpha * X * @param[in] N number of elements in X * @param[in] X float * for Vector X * @param[in] alpha float number */ -void sscal(const unsigned int N, const float alpha, float *X, const int incX); +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); /** * @brief snrm2 computation : Euclidean norm * @param[in] N number of elements in X * @param[in] X float * for Vector X */ -float snrm2(const int N, const float *X, const int incX); -/** - * @brief copy function : Y = X - * @param[in] N number of elements in X - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - */ -void scopy(const unsigned int N, const void *X, const int incX, 
void *Y, - const int incY, ml::train::TensorDim::DataType d_type); +float snrm2(const unsigned int N, const float *X, const unsigned int incX); /** * @brief copy function : Y = X * @param[in] N number of elements in X * @param[in] X float * for Vector X * @param[in] Y float * for Vector Y */ -void scopy(const unsigned int N, const float *X, const int incX, float *Y, - const int intY); +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); /** * @brief copy function : Y = X * @param[in] N number of elements in X * @param[in] X uint8_t * for Vector X * @param[in] Y uint8_t * for Vector Y */ -void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, - const int intY); +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); /** * @brief copy function : Y = X * @param[in] N number of elements in X @@ -281,7 +265,8 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y, * @param[in] Y float * for Vector Y */ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int intY); + const unsigned int incX, float *Y, + const unsigned int incY); /** * @brief copy function : Y = X @@ -290,7 +275,8 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, * @param[in] Y float * for Vector Y */ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, - const int incX, float *Y, const int intY); + const unsigned int incX, float *Y, + const unsigned int incY); /** * @brief sdot computation : sum of all X * Y @@ -300,16 +286,6 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, */ float sdot(const unsigned int N, const float *X, const unsigned int incX, const float *Y, const unsigned int incY); -/** - * @brief saxpy computation : Y = alpha*X + Y - * @param[in] N number of elements in Y - * @param[in] alpha float number - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - */ -void saxpy(const unsigned int N, const float alpha, const void *X, - const int incX, void *Y, const int incY, - ml::train::TensorDim::DataType d_type); /** * @brief saxpy computation : Y = alpha*X + Y * @param[in] N number of elements in Y @@ -318,24 +294,7 @@ void saxpy(const unsigned int N, const float alpha, const void *X, * @param[in] Y float * for Vector Y */ void saxpy(const unsigned int N, const float alpha, const float *X, - const int incX, float *Y, const int incY); -/** - * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, - * where op(X) is one of X or X**T - * @param[in] A void * for Matrix A - * @param[in] B void * for Matrix B - * @param[in] C void * for Matrix C - * @param[in] M number of op(A)'s and C's row - * @param[in] N number of op(B)'s and C's columns - * @param[in] K number of op(A)'s and columns and op(B)'s rows - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, - const unsigned int M, const unsigned int N, const unsigned int K, - const float alpha, const void *A, const unsigned int lda, - const void *B, const unsigned int ldb, const float beta, void *C, - const unsigned int ldc, ml::train::TensorDim::DataType d_type); + const unsigned int incX, float *Y, const unsigned int incY); /** * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, * where op(X) is one of X or X**T @@ -353,21 +312,6 @@ void sgemm(const unsigned int TStorageOrder, 
bool TransA, bool TransB, const float alpha, const float *A, const unsigned int lda, const float *B, const unsigned int ldb, const float beta, float *C, const unsigned int ldc); -/** - * @brief sgemv computation : Y = alpha*A*X + beta*Y - * @param[in] A void * for Matrix A - * @param[in] X void * for Vector X - * @param[in] Y void * for Vector Y - * @param[in] rows number of A's row - * @param[in] cols number of A's columns - * @param[in] alpha float number - * @param[in] beta float number - */ -void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, - const unsigned int N, const float alpha, const void *A, - const unsigned int lda, const void *X, const int incX, - const float beta, void *Y, const int incY, - ml::train::TensorDim::DataType d_type); /** * @brief sgemv computation : Y = alpha*A*X + beta*Y * @param[in] A float * for Matrix A @@ -380,14 +324,15 @@ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, */ void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, const unsigned int N, const float alpha, const float *A, - const unsigned int lda, const float *X, const int incX, - const float beta, float *Y, const int incY); + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); /** * @brief isamax function : index of first maxima * @param[in] N number of elements in X * @param[in] X float * for Vector X */ -unsigned int isamax(const unsigned int N, const float *X, const int incX); +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); /** * @brief sine with neon: Y = sin(alpha * X) @@ -479,4 +424,4 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, unsigned int o_stride = 1); } /* namespace nntrainer */ #endif /* __cplusplus */ -#endif /* __BLAS_INTERFACE_H__ */ +#endif /* __FALLBACK_H__ */ diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp new file mode 100644 index 0000000000..214eb2936d --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_fp16.cpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Half-precision fallback interface (raw implementations) + * + */ + +#include <fallback.h> +#include <fallback_internal.h> +#include <cassert> + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + __fallback_sscal(N, alpha, X, incX); +} + +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) { + assert(incX > 0); + _FP16 sum = __fallback_snrm2(N, X, incX); + return sum; +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +}
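A note for reviewers on the int4 copies that follow: the kernels treat N as the number of packed input bytes and emit two outputs per byte, high nibble first, while the incX/incY arguments are accepted but ignored by the unpack loop. A standalone sketch of that layout (illustrative values only):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t packed[2] = {0xAB, 0x4F}; // packs (10, 11) and (4, 15)
  float unpacked[4];
  for (unsigned int idx = 0; idx < 2; ++idx) {
    unpacked[2 * idx] = packed[idx] >> 4;       // high nibble first
    unpacked[2 * idx + 1] = packed[idx] & 0x0f; // then low nibble
  }
  for (float v : unpacked)
    std::printf("%.0f ", v); // prints: 10 11 4 15
  return 0;
}
```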
+void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + __fallback_scopy_int4_to_float16(N, X, incX, Y, incY); +} + +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + __fallback_scopy_int8_to_float16(N, X, incX, Y, incY); +} + +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY) { + assert(incX > 0 && incY > 0); + return __fallback_sdot(N, X, incX, Y, incY); +} + +void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY) { + __fallback_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY) { + __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + return __fallback_isamax(N, X, incX); +} + +void inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + __fallback_inv_sqrt_inplace(N, X); +} + +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst) { + __fallback_transpose_matrix(M, N, src, ld_src, dst, ld_dst); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp new file mode 100644 index 0000000000..2ea2b6c407 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.cpp @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Raw fallback implementations of the single-precision compute + * functions (no SIMD) + * + */ + +#include <fallback_internal.h> +#include <cassert> +#include <cmath> +#include <cstdint>
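The sgemv_loop macro defined next drives both the plain and the transposed product by swapping which loop index is outer. Hand-expanded equivalents of the two instantiations used by __fallback_sgemv, written out for review (note that, as written, the macro applies beta to Y but never multiplies the accumulated dot product by alpha):

```cpp
// sgemv_loop(j, i, M, N): Y = A * X + beta * Y (row-major A, M x N)
void gemv_notrans(unsigned int M, unsigned int N, const float *A,
                  unsigned int lda, const float *X, unsigned int incX,
                  float beta, float *Y, unsigned int incY) {
  for (unsigned int j = 0; j != M; ++j) { // one output per row
    float y0 = Y[j * incY] * beta;
    for (unsigned int i = 0; i != N; ++i)
      y0 += A[i + j * lda] * X[i * incX]; // row j dot X
    Y[j * incY] = y0;
  }
}

// sgemv_loop(i, j, N, M): Y = A**T * X + beta * Y
void gemv_trans(unsigned int M, unsigned int N, const float *A,
                unsigned int lda, const float *X, unsigned int incX,
                float beta, float *Y, unsigned int incY) {
  for (unsigned int i = 0; i != N; ++i) { // one output per column
    float y0 = Y[i * incY] * beta;
    for (unsigned int j = 0; j != M; ++j)
      y0 += A[i + j * lda] * X[j * incX]; // column i dot X
    Y[i * incY] = y0;
  }
}
```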
+#define sgemv_loop(ci, cj, cM, cN) \ + do { \ + float y0; \ + unsigned int i, j; \ + for (ci = 0; ci != cM; ci++) { \ + y0 = Y[ci * incY] * beta; \ + for (cj = 0; cj != cN; cj++) \ + y0 += A[i + j * lda] * X[cj * incX]; \ + Y[ci * incY] = y0; \ + } \ + } while (0); +namespace nntrainer { + +void __fallback_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + assert(incX > 0); + for (unsigned int i = 0; i < N; ++i) + X[i * incX] = alpha * X[i * incX]; +} + +float __fallback_snrm2(const unsigned int N, const float *X, + const unsigned int incX) { + assert(incX > 0); + float sum = 0.0f; + float tmp; + + for (unsigned int i = 0; i < N; i++) { + tmp = X[i * incX]; + sum += tmp * tmp; + } + return std::sqrt(sum); +} + +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; +} + +void __fallback_scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx * incY] = X[idx * incX]; + } +} + +void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void __fallback_scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx * incY] = X[idx * incX]; + } +} + +float __fallback_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY) { + float ret = 0; + for (unsigned int i = 0; i < N; ++i) { + ret += X[i * incX] * Y[i * incY]; + } + return ret; +} + +void __fallback_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = Y[i * incY] + X[i * incX] * alpha; +}
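A quick standalone harness for the naive __fallback_sgemm below (hypothetical, not part of the patch); it mirrors the kernel's structure: accumulate each output in double, scale by alpha once, then add beta times the old C value:

```cpp
#include <cstdio>

int main() {
  const float A[4] = {1, 2, 3, 4}; // row-major 2x2
  const float B[4] = {5, 6, 7, 8};
  float C[4] = {1, 1, 1, 1};
  const float alpha = 2.0f, beta = 1.0f;
  for (unsigned int m = 0; m < 2; ++m)
    for (unsigned int n = 0; n < 2; ++n) {
      double c = 0.0;
      const float c_old = C[m * 2 + n];
      for (unsigned int k = 0; k < 2; ++k)
        c += A[m * 2 + k] * B[k * 2 + n]; // no transposes in this example
      C[m * 2 + n] = alpha * c + beta * c_old;
    }
  std::printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]); // 39 45 / 87 101
  return 0;
}
```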
+void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + for (unsigned int m = 0; m < M; ++m) { + for (unsigned int n = 0; n < N; ++n) { + double c = 0.0; + float c_old = C[m * ldc + n]; + for (unsigned int k = 0; k < K; ++k) { + float a, b; + a = ((TransA == true) ? A[k * lda + m] : A[m * lda + k]); + b = ((TransB == true) ? B[n * ldb + k] : B[k * ldb + n]); + c += a * b; + } + C[m * ldc + n] = alpha * c; + if (beta != 0.0) + C[m * ldc + n] += beta * c_old; + } + } +} + +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY) { + + if (TransA == true) { + sgemv_loop(i, j, N, M); + } else { + sgemv_loop(j, i, M, N); + } +} + +unsigned int __fallback_isamax(const unsigned int N, const float *X, + const unsigned int incX) { + unsigned int max_idx = 0; + float max_val = std::abs(X[0]); + for (unsigned int n = 1; n < N; n += incX) { + float cur_val = std::abs(X[n]); + if (cur_val > max_val) { + max_val = cur_val; + max_idx = n; + } + } + + return max_idx; +} + +void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + while (i < N) { + Y[i] = std::sin(alpha * X[i]); + ++i; + } +} + +void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha) { + unsigned int i = 0; + while (i < N) { + Y[i] = std::cos(alpha * X[i]); + ++i; + } +} + +void __fallback_inv_sqrt_inplace(const unsigned int N, float *X) { + for (unsigned int i = 0; i < N; ++i) { + X[i] = 1 / std::sqrt(static_cast<float>(X[i])); + } +} + +void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X * alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_add(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X + alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_sub(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X - alpha * *Y + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_div(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X / (alpha * *Y) + beta * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} +} // namespace nntrainer diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h new file mode 100644 index 0000000000..22703bbd1b --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal.h @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Raw fallback implementations of the CPU compute functions (no SIMD) + * + */ + +#ifndef __FALLBACK_INTERNAL_H__ +#define __FALLBACK_INTERNAL_H__ +#ifdef __cplusplus + +#include <cstdint> +#include <tensor_dim.h> + +namespace nntrainer { + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +void __fallback_sscal(const unsigned int N, 
const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +_FP16 __fallback_snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +_FP16 __fallback_sdot(const unsigned int N, const _FP16 *X, + const unsigned int incX, const _FP16 *Y, + const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void __fallback_saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void 
__fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *X, const unsigned int incX, const float beta, + _FP16 *Y, const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +unsigned int __fallback_isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief squared root transformation inplace : X = sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void __fallback_inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void __fallback_transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, + _FP16 *dst, 
unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void __fallback_sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float __fallback_snrm2(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t * for Vector Y + */ +void __fallback_scopy(const unsigned int N, const uint8_t *X, + const unsigned int incX, uint8_t *Y, + const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float __fallback_sdot(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void __fallback_saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, + const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const float *A, + const unsigned int lda, const float *B, + const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const 
unsigned int M, const unsigned int N, + const float alpha, const float *A, const unsigned int lda, + const float *X, const unsigned int incX, const float beta, + float *Y, const unsigned int incY); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int __fallback_isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float * for scaling angle (radian) + */ +void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float * for scaling angle (radian) + */ +void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha); + +/** + * @brief inversed squared root transformation inplace : X / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void __fallback_inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_add(const unsigned int N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_sub(const unsigned N, const float *X, const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void __fallback_ele_div(const unsigned N, const float *X, 
const float *Y, + float *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride); +} // namespace nntrainer +#endif +#endif diff --git a/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp b/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp new file mode 100644 index 0000000000..ad794aa792 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/fallback_internal_fp16.cpp @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file fallback_internal_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Raw fallback implementations of the half-precision compute functions + * + */ + +#include <fallback_internal.h> +#include <cassert> +#include <cmath> +#include <cstdint> + +#define hgemv_loop(ci, cj, cM, cN) \ + do { \ + float y0; \ + unsigned int i, j; \ + for (ci = 0; ci != cM; ci++) { \ + y0 = static_cast<float>(Y[ci * incY] * static_cast<_FP16>(beta)); \ + for (cj = 0; cj != cN; cj++) \ + y0 += static_cast<float>(A[i + j * lda] * X[cj * incX]); \ + Y[ci * incY] = static_cast<_FP16>(y0); \ + } \ + } while (0); + +#define hgemm_loop() \ + do { \ + for (unsigned int m = 0; m < M; ++m) { \ + for (unsigned int n = 0; n < N; ++n) { \ + float c = 0; \ + _FP16 c_old = C[m * ldc + n]; \ + for (unsigned int k = 0; k < K; ++k) { \ + _FP16 a, b; \ + a = ((TransA) ? A[k * lda + m] : A[m * lda + k]); \ + b = ((TransB) ? B[n * ldb + k] : B[k * ldb + n]); \ + c += static_cast<float>(a * b); \ + } \ + C[m * ldc + n] = static_cast<_FP16>(alpha * c); \ + if (beta != 0.0) \ + C[m * ldc + n] += static_cast<_FP16>(beta) * c_old; \ + } \ + } \ + } while (0); + +#define haxpy_loop() \ + do { \ + unsigned int i; \ + for (i = 0; i < N; ++i) \ + Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX]; \ + } while (0); + +namespace nntrainer { + +void __fallback_sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX) { + for (unsigned int i = 0; i < N; ++i) + X[i * incX] = static_cast<_FP16>(alpha) * X[i * incX]; +} + +_FP16 __fallback_snrm2(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + float sum = 0; + float tmp; + for (unsigned int i = 0; i < N; i++) { + tmp = static_cast<float>(X[i * incX]); + sum += tmp * tmp; + } + return static_cast<_FP16>(std::sqrt(sum)); +} + +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; +} + +void __fallback_scopy(const unsigned int N, const float *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = static_cast<_FP16>(X[i * incX]); +} + +void __fallback_scopy(const unsigned int N, const _FP16 *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = static_cast<float>(X[i * incX]); +} + +void __fallback_scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[2 * idx] = X[idx] >> 4; + Y[2 * idx + 1] = X[idx] & 0x0f; + } +} + +void __fallback_scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + for (unsigned int idx = 0; idx < N; idx++) { + Y[idx] = X[idx]; + } +}
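The half-precision kernels below (sdot, and the hgemv/hgemm macros above) deliberately accumulate in float and only narrow the result at the end. A minimal demonstration of why, assuming a toolchain with native __fp16 support:

```cpp
#include <cstdio>

int main() {
  __fp16 half_sum = 0;
  float float_sum = 0;
  for (int i = 0; i < 4096; ++i) {
    half_sum = half_sum + (__fp16)1.0f; // stalls at 2048, where 1 ulp == 2
    float_sum += 1.0f;
  }
  std::printf("fp16 accumulator: %g\n", (float)half_sum); // ~2048
  std::printf("fp32 accumulator: %g\n", float_sum);       // 4096
  return 0;
}
```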
+_FP16 __fallback_sdot(const unsigned int N, const _FP16 *X, + const unsigned int incX, const _FP16 *Y, + const unsigned int incY) { + assert(incX > 0 && incY > 0); + float ret = 0; + for (unsigned int i = 0; i < N; ++i) { + ret += static_cast<float>(X[i * incX]) * static_cast<float>(Y[i * incY]); + } + return static_cast<_FP16>(ret); +} + +void __fallback_saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY) { + haxpy_loop(); +} + +void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA, + bool TransB, const unsigned int M, const unsigned int N, + const unsigned int K, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *B, + const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc) { + hgemm_loop(); +} + +void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA, + const unsigned int M, const unsigned int N, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *X, const unsigned int incX, const float beta, + _FP16 *Y, const unsigned int incY) { + + if (TransA == true) { + hgemv_loop(i, j, N, M); + } else { + hgemv_loop(j, i, M, N); + } +} + +void __fallback_ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X * static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X + static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X - static_cast<_FP16>(alpha) * *Y + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +} + +void __fallback_ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, + _FP16 *Z, float alpha, float beta, + unsigned int i_stride, unsigned int o_stride) { + for (unsigned int i = 0; i < N; ++i) { + *Z = *X / (static_cast<_FP16>(alpha) * *Y) + static_cast<_FP16>(beta) * *Z; + X += o_stride; + Y += i_stride; + Z += o_stride; + } +}
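On the strided ele_* kernels just above: X and Z advance by o_stride while Y advances by i_stride, so passing i_stride = 0 re-reads one Y element and broadcasts it across X. An illustrative walk of the same loop shape:

```cpp
#include <cstdio>

int main() {
  const float X[4] = {1, 2, 3, 4};
  const float Y[1] = {10};
  float Z[4] = {0};
  const unsigned int i_stride = 0, o_stride = 1; // broadcast Y over X
  const float *x = X, *y = Y;
  float *z = Z;
  for (unsigned int i = 0; i < 4; ++i) { // same pointer walk as ele_mul
    *z = *x * *y;                        // alpha = 1, beta = 0 case
    x += o_stride;
    y += i_stride;
    z += o_stride;
  }
  std::printf("%g %g %g %g\n", Z[0], Z[1], Z[2], Z[3]); // 10 20 30 40
  return 0;
}
```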
+unsigned int __fallback_isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX) { + unsigned int max_idx = 0; + _FP16 max_val = (X[0] >= 0) ? X[0] : -1 * X[0]; + for (unsigned int n = 1; n < N; n += incX) { + _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n]; + if (cur_val > max_val) { + max_val = cur_val; + max_idx = n; + } + } + return max_idx; +} + +void __fallback_inv_sqrt_inplace(const unsigned int N, _FP16 *X) { + for (unsigned int i = 0; i < N; ++i) { + X[i] = static_cast<_FP16>(1 / std::sqrt(static_cast<float>(X[i]))); + } +} + +void __fallback_transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, + _FP16 *dst, unsigned int ld_dst) { + for (unsigned int i = 0; i < M; i++) { + for (unsigned int j = 0; j < N; j++) { + dst[i + j * ld_dst] = src[i * ld_src + j]; + } + } +} + +} // namespace nntrainer diff --git a/nntrainer/tensor/cpu_backend/fallback/meson.build b/nntrainer/tensor/cpu_backend/fallback/meson.build new file mode 100644 index 0000000000..86d31d5ffe --- /dev/null +++ b/nntrainer/tensor/cpu_backend/fallback/meson.build @@ -0,0 +1,24 @@ +fallback_headers = ['fallback_internal.h'] +fallback_sources = ['fallback_internal.cpp'] + +arch = host_machine.cpu_family() + +if arch != 'arm' and arch != 'aarch64' and arch != 'x86_64' and arch != 'x86' + fallback_headers += 'fallback.h' + fallback_sources += 'fallback.cpp' + if get_option('enable-fp16') + fallback_sources += 'fallback_fp16.cpp' + endif +endif + +if get_option('enable-fp16') + fallback_sources += 'fallback_internal_fp16.cpp' +endif + +foreach s : fallback_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : fallback_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/meson.build b/nntrainer/tensor/cpu_backend/meson.build new file mode 100644 index 0000000000..e75f1f9ad6 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/meson.build @@ -0,0 +1,28 @@ +cpu_backend_headers = [ + 'cpu_backend.h', +] + +subdir('fallback') +nntrainer_inc += include_directories('fallback') +nntrainer_inc_abs += meson.current_source_dir() / 'fallback' + +if get_option('enable-blas') + subdir('cblas_interface') + nntrainer_inc += include_directories('cblas_interface') + nntrainer_inc_abs += meson.current_source_dir() / 'cblas_interface' +endif + +arch = host_machine.cpu_family() +if arch == 'arm' or arch == 'aarch64' or get_option('platform') == 'android' + subdir('arm') + nntrainer_inc += include_directories('arm') + nntrainer_inc_abs += meson.current_source_dir() / 'arm' +elif arch == 'x86_64' or arch == 'x86' + subdir('x86') + nntrainer_inc += include_directories('x86') + nntrainer_inc_abs += meson.current_source_dir() / 'x86' +endif + +foreach h : cpu_backend_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/cpu_backend/x86/blas_avx.cpp similarity index 98% rename from nntrainer/tensor/blas_avx.cpp rename to nntrainer/tensor/cpu_backend/x86/blas_avx.cpp index ce59583d6f..c923cb340f 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/cpu_backend/x86/blas_avx.cpp @@ -18,7 +18,7 @@ #include -namespace nntrainer::avx { +namespace nntrainer { void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); @@ -114,4 +114,4 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } -} // namespace nntrainer::avx +} // namespace nntrainer diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/cpu_backend/x86/blas_avx.h similarity index 95% rename from nntrainer/tensor/blas_avx.h rename to nntrainer/tensor/cpu_backend/x86/blas_avx.h index ab1270a208..c1a714ddca 100644 --- a/nntrainer/tensor/blas_avx.h +++ 
b/nntrainer/tensor/cpu_backend/x86/blas_avx.h @@ -18,7 +18,7 @@ #include #include -namespace nntrainer::avx { +namespace nntrainer { /** * @brief Converts half-precision floating point values to single-precision @@ -40,7 +40,7 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); -} // namespace nntrainer::avx +} // namespace nntrainer #endif /* __cplusplus */ #endif /* __BLAS_AVX_H_ */ diff --git a/nntrainer/tensor/cpu_backend/x86/meson.build b/nntrainer/tensor/cpu_backend/x86/meson.build new file mode 100644 index 0000000000..301bb8d54c --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/meson.build @@ -0,0 +1,20 @@ +simd_interface_x86_headers = [ + 'x86_compute_backend.h', +] +simd_interface_x86_sources = [ + 'x86_compute_backend.cpp', +] + +if get_option('enable-fp16') + simd_interface_x86_headers += 'blas_avx.h' + simd_interface_x86_sources += 'blas_avx.cpp' + simd_interface_x86_sources += 'x86_compute_backend_fp16.cpp' +endif + +foreach s : simd_interface_x86_sources + nntrainer_sources += meson.current_source_dir() / s +endforeach + +foreach h : simd_interface_x86_headers + nntrainer_headers += meson.current_source_dir() / h +endforeach diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp new file mode 100644 index 0000000000..bf35984b32 --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#include + +#include +#include +#include +#include + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int4_to_float32(N, X, incX, Y, incY); +} + +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY) { + __fallback_scopy_int8_to_float32(N, X, incX, Y, incY); +} + +void sine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_sine(N, X, Y, alpha); +} + +void cosine(const unsigned int N, float *X, float *Y, float alpha) { + __fallback_cosine(N, X, Y, alpha); +} + +void inv_sqrt_inplace(const unsigned int N, float *X) { + __fallback_inv_sqrt_inplace(N, X); +} + +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} + +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha, float beta, unsigned int i_stride, + unsigned int o_stride) { + __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride); +} 
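The x86 backend's split is visible above and below: element-wise, trigonometric, and dequantizing copies reuse the raw fallback loops, while the BLAS-shaped routines (saxpy, sgemv, sdot, scopy, sscal, snrm2, sgemm, isamax) forward to the __cblas_* wrappers. Call sites are unaffected; a hypothetical sketch of typical usage, assuming cpu_backend.h re-exports these declarations:

```cpp
#include <cpu_backend.h> // umbrella header; backend chosen at build time

// Scale each row of a row-major matrix in place; on x86 this lands in
// __cblas_sscal, on other targets in the raw fallback loop.
void scale_rows(float *mat, unsigned int rows, unsigned int cols,
                const float *factors) {
  for (unsigned int r = 0; r < rows; ++r)
    nntrainer::sscal(cols, factors[r], mat + r * cols, 1);
}
```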
+ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY) { + __cblas_saxpy(N, alpha, X, incX, Y, incY); +} + +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY) { + __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y, + incY); +} + +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { + return __cblas_sdot(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY) { + __fallback_scopy(N, X, incX, Y, incY); +} + +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY) { + __cblas_scopy(N, X, incX, Y, incY); +} + +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX) { + __cblas_sscal(N, alpha, X, incX); +} + +float snrm2(const unsigned int N, const float *X, const unsigned int incX) { + return __cblas_snrm2(N, X, incX); +} + +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc) { + __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX) { + return __cblas_isamax(N, X, incX); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h new file mode 100644 index 0000000000..840abc94df --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend.h @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend.h + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#ifndef __x86_COMPUTE_BACKEND_H__ +#define __x86_COMPUTE_BACKEND_H__ +#ifdef __cplusplus + +#include +#include + +namespace nntrainer { + +#ifdef ENABLE_FP16 +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned int incX); + +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned 
int incX, + _FP16 *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX, + float *Y, const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int4_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void scopy_int8_to_float16(const unsigned int N, const uint8_t *X, + const unsigned int incX, _FP16 *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX, + const _FP16 *Y, const unsigned int incY); + +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X __fp16 * for Vector X + * @param[in] Y __fp16 * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const _FP16 *X, + const unsigned int incX, _FP16 *Y, const unsigned int incY); + +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A __fp16 * for Matrix A + * @param[in] B __fp16 * for Matrix B + * @param[in] C __fp16 * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const _FP16 *A, const unsigned int lda, + const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const _FP16 *A, + const unsigned int lda, const _FP16 *X, const unsigned int incX, + const float beta, _FP16 *Y, const unsigned int incY); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha 
* Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X _FP16 * for Vector X + * @param[in] Y _FP16 * for Vector Y + * @param[in] Z _FP16 * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X __fp16 * for Vector X + */ +unsigned int isamax(const unsigned int N, const _FP16 *X, + const unsigned int incX); + +/** + * @brief squared root transformation inplace : X = sqrt(X) + * + * @param N size of X + * @param X __fp16 * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, _FP16 *X); + +/** + * @brief Matrix transpose / 2D Tensor transpose + * + * @param M row length of input matrix + * @param N col length of input matrix + * @param src src data of input matrix + * @param ld_src data offset of input matrix + * @param dst destination of output matrix + * @param ld_dst data offset of output matrix + */ +void transpose_matrix(const unsigned int M, const unsigned int N, + const _FP16 *src, unsigned int ld_src, _FP16 *dst, + unsigned int ld_dst); +#endif +/** + * @brief sscal computation : X = alpha * X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] alpha float number + */ +void sscal(const unsigned int N, const float alpha, float *X, + const unsigned int incX); +/** + * @brief snrm2 computation : Euclidean norm + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +float snrm2(const unsigned int N, const float *X, const unsigned int incX); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy(const unsigned int N, const float *X, const unsigned int incX, + float *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y uint8_t 
* for Vector Y + */ +void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX, + uint8_t *Y, const unsigned int incY); +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int4_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief copy function : Y = X + * @param[in] N number of elements in X + * @param[in] X uint8_t * for Vector X + * @param[in] Y float * for Vector Y + */ +void scopy_int8_to_float32(const unsigned int N, const uint8_t *X, + const unsigned int incX, float *Y, + const unsigned int incY); + +/** + * @brief sdot computation : sum of all X * Y + * @param[in] N number of elements in Y + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); +/** + * @brief saxpy computation : Y = alpha*X + Y + * @param[in] N number of elements in Y + * @param[in] alpha float number + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + */ +void saxpy(const unsigned int N, const float alpha, const float *X, + const unsigned int incX, float *Y, const unsigned int incY); +/** + * @brief sgemm computation : Y = alpha*op(A)*op(B) + beta*C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB, + const unsigned int M, const unsigned int N, const unsigned int K, + const float alpha, const float *A, const unsigned int lda, + const float *B, const unsigned int ldb, const float beta, float *C, + const unsigned int ldc); +/** + * @brief sgemv computation : Y = alpha*A*X + beta*Y + * @param[in] A float * for Matrix A + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] rows number of A's row + * @param[in] cols number of A's columns + * @param[in] alpha float number + * @param[in] beta float number + */ +void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M, + const unsigned int N, const float alpha, const float *A, + const unsigned int lda, const float *X, const unsigned int incX, + const float beta, float *Y, const unsigned int incY); +/** + * @brief isamax function : index of first maxima + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + */ +unsigned int isamax(const unsigned int N, const float *X, + const unsigned int incX); + +/** + * @brief sine with neon: Y = sin(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float * for scaling angle (radian) + */ +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * @brief cosine with neon: Y = cos(alpha * X) + * @param[in] N number of elements in X + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] alpha float * for scaling angle (radian) + */ +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); + +/** + * 
@brief inversed squared root transformation inplace : X = 1 / sqrt(X) + * + * @param N size of X + * @param X float * for Vector X + */ +void inv_sqrt_inplace(const unsigned int N, float *X); +/** + * @brief elementwise vector multiplication : Z = X ⊙ alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector addition : Z = X + alpha * Y + beta * + * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +/** + * @brief elementwise vector subtraction with neon : Z = X - alpha * Y + + * beta * Z + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); + +/** + * @brief elementwise vector division with neon : Z = X / (alpha * Y) + beta + * * Z + * @note ZeroDivisionError is not guaranteed in this function + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[in] Y float * for Vector Y + * @param[in] Z float * for Vector Z + * @param[in] alpha scalar multiplier for input + * @param[in] beta scalar multiplier for output + * @param[in] i_stride input stride + * @param[in] o_stride output stride + */ +void ele_div(const unsigned N, const float *X, const float *Y, float *Z, + float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, + unsigned int o_stride = 1); +} /* namespace nntrainer */ +#endif /* __cplusplus */ +#endif /* __x86_COMPUTE_BACKEND_H__ */ diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp new file mode 100644 index 0000000000..b0d3d97cbb --- /dev/null +++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Sungsik Kong + * + * @file x86_compute_backend_fp16.cpp + * @date 23 April 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Sungsik Kong + * @bug No known bugs except for NYI items + * @brief Compute backend for x86 + * + */ + +#include + +#include +#include +#include +#include + +#ifdef ENABLE_FP16 +#include +#endif + +#define ROW_MAJOR 0 +#define COL_MAJOR 1 + +namespace nntrainer { + +void sscal(const unsigned int N, const float alpha, _FP16 *X, + const unsigned 
diff --git a/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp
new file mode 100644
index 0000000000..b0d3d97cbb
--- /dev/null
+++ b/nntrainer/tensor/cpu_backend/x86/x86_compute_backend_fp16.cpp
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Sungsik Kong
+ *
+ * @file   x86_compute_backend_fp16.cpp
+ * @date   23 April 2024
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Sungsik Kong
+ * @bug    No known bugs except for NYI items
+ * @brief  Compute backend for x86 (FP16)
+ *
+ */
+
+#include <assert.h>
+
+#include <cblas_interface.h>
+#include <cstdint>
+#include <fallback_internal.h>
+#include <x86_compute_backend.h>
+
+#ifdef ENABLE_FP16
+#include <blas_avx.h>
+#endif
+
+#define ROW_MAJOR 0
+#define COL_MAJOR 1
+
+namespace nntrainer {
+
+void sscal(const unsigned int N, const float alpha, _FP16 *X,
+           const unsigned int incX) {
+  __fallback_sscal(N, alpha, X, incX);
+}
+
+_FP16 snrm2(const unsigned int N, const _FP16 *X, const unsigned int incX) {
+  assert(incX > 0);
+  return __fallback_snrm2(N, X, incX);
+}
+
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY) {
+  /* the fallback copy is stride-aware, so delegate unconditionally */
+  __fallback_scopy(N, X, incX, Y, incY);
+}
+
+void scopy(const unsigned int N, const float *X, const unsigned int incX,
+           _FP16 *Y, const unsigned int incY) {
+  if (incX == 1 && incY == 1) {
+    nntrainer::vcvt_f32_f16(N, X, Y);
+  } else {
+    __fallback_scopy(N, X, incX, Y, incY);
+  }
+}
+
+void scopy(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           float *Y, const unsigned int incY) {
+  if (incX == 1 && incY == 1) {
+    nntrainer::vcvt_f16_f32(N, X, Y);
+  } else {
+    __fallback_scopy(N, X, incX, Y, incY);
+  }
+}
+
+void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int4_to_float16(N, X, incX, Y, incY);
+}
+
+void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
+                           const unsigned int incX, _FP16 *Y,
+                           const unsigned int incY) {
+  __fallback_scopy_int8_to_float16(N, X, incX, Y, incY);
+}
+
+_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX,
+           const _FP16 *Y, const unsigned int incY) {
+  assert(incX > 0 && incY > 0);
+  return __fallback_sdot(N, X, incX, Y, incY);
+}
+
+void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
+           const unsigned int incX, _FP16 *Y, const unsigned int incY) {
+  __fallback_saxpy(N, alpha, X, incX, Y, incY);
+}
+
+void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
+           const unsigned int M, const unsigned int N, const unsigned int K,
+           const float alpha, const _FP16 *A, const unsigned int lda,
+           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
+           const unsigned int ldc) {
+  /* widen to float, run the single-precision kernel, narrow the result */
+  float *A_ = new float[M * K];
+  float *B_ = new float[N * K];
+  float *C_ = new float[M * N];
+
+  scopy(M * K, A, 1, A_, 1);
+  scopy(N * K, B, 1, B_, 1);
+  scopy(M * N, C, 1, C_, 1);
+
+  __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A_, lda, B_,
+                ldb, beta, C_, ldc);
+  scopy(M * N, C_, 1, C, 1);
+
+  delete[] A_;
+  delete[] B_;
+  delete[] C_;
+}
+
+void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
+           const unsigned int N, const float alpha, const _FP16 *A,
+           const unsigned int lda, const _FP16 *X, const unsigned int incX,
+           const float beta, _FP16 *Y, const unsigned int incY) {
+  unsigned int lenX = (TransA) ? 1 + (M - 1) * (incX) : 1 + (N - 1) * (incX);
+  unsigned int lenY = (TransA) ? 1 + (N - 1) * (incY) : 1 + (M - 1) * (incY);
+
+  float *A_ = new float[M * N];
+  float *X_ = new float[lenX];
+  float *Y_ = new float[lenY];
+
+  scopy(M * N, A, 1, A_, 1);
+  scopy(lenX, X, 1, X_, 1);
+  scopy(lenY, Y, 1, Y_, 1);
+
+  __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A_, lda, X_, incX, beta,
+                Y_, incY);
+
+  scopy(lenY, Y_, 1, Y, 1);
+
+  delete[] A_;
+  delete[] X_;
+  delete[] Y_;
+}
+
+void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
+             float alpha, float beta, unsigned int i_stride,
+             unsigned int o_stride) {
+  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
+}
+
+unsigned int isamax(const unsigned int N, const _FP16 *X,
+                    const unsigned int incX) {
+  return __fallback_isamax(N, X, incX);
+}
+
+void inv_sqrt_inplace(const unsigned int N, _FP16 *X) {
+  __fallback_inv_sqrt_inplace(N, X);
+}
+
+void transpose_matrix(const unsigned int M, const unsigned int N,
+                      const _FP16 *src, unsigned int ld_src, _FP16 *dst,
+                      unsigned int ld_dst) {
+  __fallback_transpose_matrix(M, N, src, ld_src, dst, ld_dst);
+}
+
+} /* namespace nntrainer */
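[Reviewer note, not part of the patch] The mixed-precision scopy overloads above are the workhorse of this file: contiguous copies take the vectorized vcvt_* conversion path, strided copies go through the fallback. A round-trip sketch, assuming _FP16 is a native half-precision type (e.g. __fp16) in ENABLE_FP16 builds and cpu_backend.h is on the include path:

// Hypothetical round-trip check; all values are exactly representable in
// half precision, so the copy back should reproduce the inputs.
#include <cstdio>

#include <cpu_backend.h>

int main() {
#ifdef ENABLE_FP16
  const unsigned int N = 4;
  float src[N] = {0.5f, 1.5f, 2.5f, 3.5f};
  _FP16 f16[N];
  float back[N];

  nntrainer::scopy(N, src, 1, f16, 1);  // float -> _FP16
  nntrainer::scopy(N, f16, 1, back, 1); // _FP16 -> float

  for (unsigned int i = 0; i < N; ++i)
    std::printf("%f -> %f\n", src[i], back[i]);
#endif
  return 0;
}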
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 7ca18a7b40..31e7e6b202 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -12,7 +12,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp
index 6753d51d34..7a36bcf44a 100644
--- a/nntrainer/tensor/half_tensor.cpp
+++ b/nntrainer/tensor/half_tensor.cpp
@@ -12,7 +12,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 19c32096b9..ca8e9a2171 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -1,5 +1,4 @@
 tensor_sources = [
-  'blas_interface.cpp',
   'cache_elem.cpp',
   'cache_loader.cpp',
   'cache_pool.cpp',
@@ -33,30 +32,11 @@ tensor_headers = [
   'weight.h',
   'var_grad.h',
   'tensor_wrap_specs.h',
-  'blas_interface.h'
 ]
 
-arch = host_machine.cpu_family()
-if get_option('enable-fp16')
-  if arch == 'arm'
-    error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
-  elif arch == 'aarch64' or get_option('platform') == 'android'
-    if get_option('enable-neon')
-      tensor_sources += 'blas_neon.cpp'
-      tensor_headers += 'blas_neon.h'
-      subdir('hgemm')
-      nntrainer_inc += include_directories('hgemm')
-      nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
-
-      subdir('matrix_transpose_neon')
-      nntrainer_inc += include_directories('matrix_transpose_neon')
-      nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
-    endif
-  elif get_option('enable-avx')
-    tensor_sources += 'blas_avx.cpp'
-    tensor_headers += 'blas_avx.h'
-  endif
-endif
+subdir('cpu_backend')
+nntrainer_inc += include_directories('cpu_backend')
+nntrainer_inc_abs += meson.current_source_dir() / 'cpu_backend'
 
 if get_option('enable-fp16')
   tensor_headers += 'half_tensor.h'
diff --git a/nntrainer/tensor/short_tensor.cpp b/nntrainer/tensor/short_tensor.cpp
index 8705b10ea0..325ea139b0 100644
--- a/nntrainer/tensor/short_tensor.cpp
+++ b/nntrainer/tensor/short_tensor.cpp
@@ -11,7 +11,7 @@
 #include
 #include
-#include <blas_interface.h>
+#include <cpu_backend.h>
 #include
 #include
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 74e0a3437e..d61e6b84ff 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -23,8 +23,8 @@
 #include
 
-#include <blas_interface.h>
 #include
+#include <cpu_backend.h>
 #include
 #include
 #include
diff --git a/nntrainer/utils/util_simd_neon.cpp b/nntrainer/utils/util_simd_neon.cpp
index d823897047..6de2419555 100644
--- a/nntrainer/utils/util_simd_neon.cpp
+++ b/nntrainer/utils/util_simd_neon.cpp
@@ -8,8 +8,11 @@
  * @bug No known bugs except for NYI items
  */
 
-#include <blas_neon.h>
+#include <neon_impl.h>
 #include
+#ifdef ARMV7
+#include <armv7_neon.h>
+#endif
 
 namespace nntrainer::neon {
diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec
index deaafebd1b..bc754b8da6 100644
--- a/packaging/nntrainer.spec
+++ b/packaging/nntrainer.spec
@@ -57,7 +57,7 @@
 %define fp16_support -Denable-fp16=true
 %else
 %define fp16_support -Denable-fp16=false
-%endif # enalbe_fp16
+%endif # enable_fp16
 
 %ifarch aarch64
 %define neon_support -Denable-neon=true
@@ -540,7 +540,37 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
 %{_includedir}/nntrainer/half_tensor.h
 %endif
 %{_includedir}/nntrainer/tensor_wrap_specs.h
-%{_includedir}/nntrainer/blas_interface.h
+%{_includedir}/nntrainer/cpu_backend.h
+%{_includedir}/nntrainer/fallback_internal.h
+%{_includedir}/nntrainer/cblas_interface.h
+%ifarch %{ix86} x86_64
+%{_includedir}/nntrainer/x86_compute_backend.h
+%if 0%{?enable_fp16}
+%{_includedir}/nntrainer/blas_avx.h
+%endif
+%endif
+%ifarch aarch64
+%{_includedir}/nntrainer/arm_compute_backend.h
+%{_includedir}/nntrainer/neon_impl.h
+%{_includedir}/nntrainer/neon_setting.h
+%{_includedir}/nntrainer/neon_mathfun.h
+%{_includedir}/nntrainer/neon_mathfun.hxx
+%if 0%{?enable_fp16}
+%{_includedir}/nntrainer/hgemm.h
+%{_includedir}/nntrainer/matrix_transpose_neon.h
+%endif
+%endif
+%ifarch %arm
+%{_includedir}/nntrainer/arm_compute_backend.h
+%{_includedir}/nntrainer/armv7_neon.h
+%{_includedir}/nntrainer/neon_impl.h
+%{_includedir}/nntrainer/neon_setting.h
+%{_includedir}/nntrainer/neon_mathfun.h
+%{_includedir}/nntrainer/neon_mathfun.hxx
+%endif
+%ifnarch %{ix86} %arm x86_64 aarch64
+%{_includedir}/nntrainer/fallback.h
+%endif
 %{_includedir}/nntrainer/var_grad.h
 %{_includedir}/nntrainer/weight.h
 # @todo: update dataset headers
diff --git a/test/jni/Android.mk b/test/jni/Android.mk
index 153b4eb840..a249188160 100644
--- a/test/jni/Android.mk
+++ b/test/jni/Android.mk
@@ -22,6 +22,9 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \
 $(NNTRAINER_ROOT)/nntrainer/opencl \
 $(NNTRAINER_ROOT)/nntrainer/optimizers \
 $(NNTRAINER_ROOT)/nntrainer/tensor \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/fallback \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend/aarch64 \
 $(NNTRAINER_ROOT)/nntrainer/tensor/cl_operations \
 $(NNTRAINER_ROOT)/nntrainer/utils \
 $(NNTRAINER_ROOT)/api \
diff --git a/test/unittest/jni/Android.mk b/test/unittest/jni/Android.mk
index 40fe50d28e..8a1d9af5bc 100644
--- a/test/unittest/jni/Android.mk
+++ b/test/unittest/jni/Android.mk
@@ -20,6 +20,7 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \
 $(NNTRAINER_ROOT)/nntrainer/graph \
 $(NNTRAINER_ROOT)/nntrainer/optimizers \
 $(NNTRAINER_ROOT)/nntrainer/tensor \
+$(NNTRAINER_ROOT)/nntrainer/tensor/cpu_backend \
 $(NNTRAINER_ROOT)/nntrainer/utils \
 $(NNTRAINER_ROOT)/api \
 $(NNTRAINER_ROOT)/api/ccapi/include \
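[Reviewer note, not part of the patch] Once an application picks up the include paths above, the element-wise helpers behave per their documented defaults (alpha = 1, beta = 0, unit strides). A final sketch under the same assumptions as the earlier ones:

// Sketch of the documented semantics Z = X + alpha * Y + beta * Z.
#include <cstdio>

#include <cpu_backend.h>

int main() {
  const unsigned int N = 4;
  float X[N] = {1.f, 2.f, 3.f, 4.f};
  float Y[N] = {10.f, 20.f, 30.f, 40.f};
  float Z[N] = {0.f, 0.f, 0.f, 0.f};

  nntrainer::ele_add(N, X, Y, Z, 2.f); // Z = X + 2 * Y -> {21, 42, 63, 84}
  for (unsigned int i = 0; i < N; ++i)
    std::printf("%f\n", Z[i]);
  return 0;
}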