[ TEST COMMIT/Trivial ]
This commit enables build and test for the refactored cpu_backend by removing unnecessary test dependencies from TensorV2.
To avoid conflicts, fixes like the following should be applied to all code in TensorV2 (see the sketch after the list):
	- Remove blas_interface.h / blas_interface.cpp dependencies
	- Use dim.getStorageOrder() at sgemv / sgemm
	- (MOST IMPORTANT) Do NOT use CBLAS params in tensor-related files
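
A rough, hypothetical sketch of the intended call pattern: `Order` and `gemm_backend` below are illustrative stand-ins for the storage-order value returned by `dim.getStorageOrder()` and for the eventual cpu_backend entry point, not the actual API. The point is that tensor-side code passes only the storage order and transpose flags; only the backend would ever translate them into CBLAS arguments.

```cpp
// Hypothetical sketch only: names are illustrative, not the cpu_backend API.
#include <cstddef>
#include <utility>

enum class Order { RowMajor, ColMajor }; // stand-in for dim.getStorageOrder()

// Backend-side GEMM: the only layer that would ever touch CBLAS enums.
// C is assumed initialized, since it is read for the beta term.
void gemm_backend(Order order, bool transA, bool transB, std::size_t M,
                  std::size_t N, std::size_t K, float alpha, const float *A,
                  std::size_t lda, const float *B, std::size_t ldb, float beta,
                  float *C, std::size_t ldc) {
  // Reference fallback loop; a real backend could instead map `order` and the
  // transpose flags onto cblas_sgemm arguments behind this same interface.
  auto at = [&](const float *P, std::size_t ld, std::size_t r, std::size_t c,
                bool trans) {
    if (trans)
      std::swap(r, c); // op(P)(r, c) = P(c, r) when transposed
    return order == Order::RowMajor ? P[r * ld + c] : P[c * ld + r];
  };
  for (std::size_t i = 0; i < M; ++i)
    for (std::size_t j = 0; j < N; ++j) {
      float acc = 0.f;
      for (std::size_t k = 0; k < K; ++k)
        acc += at(A, lda, i, k, transA) * at(B, ldb, k, j, transB);
      float &out = (order == Order::RowMajor) ? C[i * ldc + j] : C[j * ldc + i];
      out = alpha * acc + beta * out;
    }
}

// Tensor-side call site (illustrative): no CBLAS_ORDER / CBLAS_TRANSPOSE here,
// only the tensor's own storage order, e.g.
//   gemm_backend(dim.getStorageOrder(), trans, trans_in, M, N, K,
//                1.f, data, lda, in_data, ldb, 0.f, out_data, ldc);
```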

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed Apr 18, 2024
1 parent ca57110 commit c94059c
Showing 9 changed files with 2,647 additions and 2,823 deletions.
2 changes: 1 addition & 1 deletion debian/nntrainer-dev.install
@@ -13,7 +13,7 @@
/usr/include/nntrainer/tensor_base.h
/usr/include/nntrainer/float_tensor.h
/usr/include/nntrainer/tensor_wrap_specs.h
/usr/include/nntrainer/blas_interface.h
# /usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
# todo: update dataset headers
179 changes: 3 additions & 176 deletions nntrainer/tensor/blas_interface.cpp
@@ -14,14 +14,7 @@

#include <blas_interface.h>
#include <nntrainer_error.h>

#if (defined USE_NEON)
#include <neon_simd.h>
#endif

#if USE_AVX
#include <avx_simd.h>
#endif
// #include <cpu_backend.h>

#include <cmath>

@@ -169,8 +162,6 @@ static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX,
Y[i * incy] = X[i * incx];
}
#else
for (unsigned int i = 0; i < N; ++i)
Y[i * incy] = X[i * incx];
#endif
}

@@ -194,8 +185,6 @@ static void copy_float32_to_float16(const unsigned int N, const float *X,
Y[i * incy] = static_cast<_FP16>(X[i * incx]);
}
#else
for (unsigned int i = 0; i < N; ++i)
Y[i * incy] = static_cast<_FP16>(X[i * incx]);
#endif
}

@@ -219,8 +208,6 @@ static void copy_float16_to_float32(const unsigned int N, const _FP16 *X,
Y[i * incy] = static_cast<float>(X[i * incx]);
}
#else
for (unsigned int i = 0; i < N; ++i)
Y[i * incy] = static_cast<float>(X[i * incx]);
#endif
}

@@ -495,7 +482,7 @@ void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

_FP16 snrm2(const int N, const _FP16 *X, const int incX) {
_FP16 snrm2(const unsigned int N, const _FP16 *X, const int incX) {
return snrm2_FP16(N, X, incX);
}

@@ -634,28 +621,6 @@ static unsigned int isamax_raw(const unsigned int N, const float *X,

#endif

void sscal(const unsigned int N, const float alpha, void *X, const int incX,
ml::train::TensorDim::DataType d_type) {

if (d_type == ml::train::TensorDim::DataType::FP32) {

#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif // BLAS_NUM_THREADS
cblas_sscal(N, alpha, (float *)X, incX);
#else // USE_BLAS else
sscal_raw(N, alpha, (float *)X, incX);
#endif // USE_BLAS
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
sscal(N, alpha, (_FP16 *)X, incX);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
@@ -667,29 +632,6 @@ void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
#endif
}

void saxpy(const unsigned int N, const float alpha, const void *X,
const int incX, void *Y, const int incY,
ml::train::TensorDim::DataType d_type) {
if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
cblas_saxpy(N, alpha, static_cast<const float *>(X), incX,
static_cast<float *>(Y), incY);
#else
saxpy_raw(N, alpha, static_cast<const float *>(X), incX,
static_cast<float *>(Y), incY);
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
saxpy_FP16(N, alpha, static_cast<const _FP16 *>(X), incX,
static_cast<_FP16 *>(Y), incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

void saxpy(const unsigned int N, const float alpha, const float *X,
const int incX, float *Y, const int incY) {
@@ -703,68 +645,6 @@ void saxpy(const unsigned int N, const float alpha, const float *X,
#endif
}

void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
const float alpha, const void *A, const unsigned int lda,
const void *B, const unsigned int ldb, const float beta, void *C,
const unsigned int ldc, ml::train::TensorDim::DataType d_type) {

if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_CUBLAS
int devID = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
float *d_A, *d_B, *d_C;

unsigned int size_A = M * K * sizeof(float);
unsigned int size_B = K * N * sizeof(float);
unsigned int size_C = M * N * sizeof(float);

cudaMalloc((void **)&d_A, size_A);
cudaMalloc((void **)&d_B, size_B);
cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_C, size_C);

cublasHandle_t handle;
cublasCreate(&handle);

cublasOperation_t transA =
(TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t transB =
(TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta,
d_C, N);

cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);

#elif defined USE_BLAS

#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif

cblas_sgemm(
order, TransA, TransB, M, N, K, alpha, static_cast<const float *>(A), lda,
static_cast<const float *>(B), ldb, beta, static_cast<float *>(C), ldc);
#else
sgemm_raw(order, TransA, TransB, M, N, K, alpha,
static_cast<const float *>(A), lda, static_cast<const float *>(B),
ldb, beta, static_cast<float *>(C), ldc);
#endif

} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
sgemm_FP16(
order, TransA, TransB, M, N, K, alpha, static_cast<const _FP16 *>(A), lda,
static_cast<const _FP16 *>(B), ldb, beta, static_cast<_FP16 *>(C), ldc);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
} // namespace nntrainer

void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
const unsigned int M, const unsigned int N, const unsigned int K,
const float alpha, const float *A, const unsigned int lda,
@@ -809,29 +689,6 @@ void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
#endif
}

void scopy(const unsigned int N, const void *X, const int incX, void *Y,
const int incY, ml::train::TensorDim::DataType d_type) {

if (d_type == ml::train::TensorDim::DataType::FP32) {

#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
cblas_scopy(N, (float *)X, incX, (float *)Y, incY);
#else
scopy_raw(N, (float *)X, incX, (float *)Y, incY);
#endif

} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
scopy_FP16(N, (_FP16 *)X, incX, (_FP16 *)Y, incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

void scopy(const unsigned int N, const float *X, const int incX, float *Y,
const int incY) {
#ifdef USE_BLAS
@@ -878,7 +735,7 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
#endif
}

float snrm2(const int N, const float *X, const int incX) {
float snrm2(const unsigned int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
@@ -901,36 +758,6 @@ float sdot(const unsigned int N, const float *X, const unsigned int incX,
#endif
}

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
const unsigned int N, const float alpha, const void *A,
const unsigned int lda, const void *X, const int incX,
const float beta, void *Y, const int incY,
ml::train::TensorDim::DataType d_type) {
if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
return cblas_sgemv(
order, TransA, M, N, alpha, static_cast<const float *>(A), lda,
static_cast<const float *>(X), incX, beta, static_cast<float *>(Y), incY);
#else

return sgemv_raw(order, TransA, M, N, alpha, static_cast<const float *>(A),
lda, static_cast<const float *>(X), incX, beta,
static_cast<float *>(Y), incY);
#endif
} else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
return sgemv_FP16(order, TransA, M, N, alpha, static_cast<const _FP16 *>(A),
lda, static_cast<const _FP16 *>(X), incX, beta,
static_cast<_FP16 *>(Y), incY);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
const unsigned int N, const float alpha, const float *A,
const unsigned int lda, const float *X, const int incX,