[gpu/enhance] Shared memory implementation for Blas kernels
Implement shared memory usage with the proper buffer flags.
Update the BLAS kernels accordingly.

Signed-off-by: Debadri Samaddar <[email protected]>
s-debadri committed Nov 4, 2024
1 parent b1a3c75 commit fcc6b7a
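
The change below follows one pattern throughout: instead of creating a device buffer and copying data in and out with WriteData()/ReadData(), each buffer is created directly over host memory and then mapped/unmapped around kernel execution. A minimal sketch of that pattern in plain OpenCL C is shown here for context; the names ctx, queue, kernel, host, and run_with_shared_buffer are placeholders, and nntrainer's opencl::Buffer wrapper is assumed to do something equivalent under the hood.

/* Sketch only: shared/zero-copy style buffer usage, assuming a valid
 * cl_context ctx, cl_command_queue queue, cl_kernel kernel and a host
 * array host[n]. The commit itself uses nntrainer's opencl::Buffer
 * wrapper; this is only an illustration of the underlying pattern. */
#include <CL/cl.h>
#include <stddef.h>

static int run_with_shared_buffer(cl_context ctx, cl_command_queue queue,
                                  cl_kernel kernel, float *host, size_t n) {
  cl_int err;

  /* Create the buffer over host memory instead of copying into a device
   * allocation with a separate WriteData()-style call. */
  cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                              n * sizeof(float), host, &err);
  if (err != CL_SUCCESS)
    return -1;

  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
  if (err == CL_SUCCESS) {
    size_t gws = n;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, NULL,
                                 0, NULL, NULL);
  }

  if (err == CL_SUCCESS) {
    /* Map/unmap instead of ReadData(): the blocking map guarantees the
     * kernel's writes are visible to the host (the "cache inconsistency"
     * the diff comments refer to), after which host[] holds the result. */
    float *mapped = (float *)clEnqueueMapBuffer(
      queue, buf, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float),
      0, NULL, NULL, &err);
    if (err == CL_SUCCESS)
      err = clEnqueueUnmapMemObject(queue, buf, mapped, 0, NULL, NULL);
  }

  clFinish(queue);
  clReleaseMemObject(buf);
  return err == CL_SUCCESS ? 0 : -1;
}

On devices where the GPU shares physical memory with the host (common on mobile SoCs), this avoids redundant copies; the blocking map before reading results is what the in-diff comments mean by avoiding cache inconsistency.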
Showing 2 changed files with 72 additions and 151 deletions.
110 changes: 35 additions & 75 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -31,28 +31,13 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
size_t dim1_size = sizeof(float) * dim1;
size_t dim2_size = sizeof(float) * dim2;
opencl::Buffer inputA(cl_context_ref.context_inst_,
dim1 * dim2 * sizeof(float), true, nullptr);
dim1 * dim2 * sizeof(float), true, (void *)matAdata);

opencl::Buffer inputX(cl_context_ref.context_inst_, dim2_size, true,
nullptr);
(void *)vecXdata);

opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}

result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}

result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, false,
vecYdata);

result = kernel_sgemv_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
@@ -88,7 +73,10 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
break;
}

result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
// to avoid cache inconsistency
vecYdata = (float *)(inOutY.MapBuffer(cl_context_ref.command_queue_inst_, 0,
dim1_size, true));
result = inOutY.UnMapBuffer(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
@@ -112,24 +100,14 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1) {
size_t dim1_size = sizeof(float) * dim1;

opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);
(void *)vecAdata);

opencl::Buffer inputX(cl_context_ref.context_inst_, dim1_size, true,
nullptr);
(void *)vecXdata);

opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(float), true,
opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(float), false,
&cl_ret);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, vecAdata);
if (!result) {
break;
}

result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}

result = kernel_dot_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
@@ -159,7 +137,11 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1) {
break;
}

result = dotResult.ReadData(cl_context_ref.command_queue_inst_, &cl_ret);
// to avoid cache inconsistency
float *tmp = (float *)(dotResult.MapBuffer(
cl_context_ref.command_queue_inst_, 0, sizeof(float), true));
cl_ret = *tmp;
result = dotResult.UnMapBuffer(cl_context_ref.command_queue_inst_, tmp);
if (!result) {
break;
}
@@ -205,28 +187,12 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
size_t m_n_size = M * N * sizeof(float);

opencl::Buffer inputA(cl_context_ref.context_inst_, m_k_size, true,
nullptr);
(void *)A);

opencl::Buffer inputB(cl_context_ref.context_inst_, k_n_size, true,
nullptr);

opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, A);
if (!result) {
break;
}
(void *)B);

result = inputB.WriteData(cl_context_ref.command_queue_inst_, B);
if (!result) {
break;
}

result = inOutC.WriteData(cl_context_ref.command_queue_inst_, C);
if (!result) {
break;
}
opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, false, C);

result = kernel_sgemm_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
@@ -272,7 +238,10 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
break;
}

result = inOutC.ReadData(cl_context_ref.command_queue_inst_, C);
// to avoid cache inconsistency
C = (float *)(inOutC.MapBuffer(cl_context_ref.command_queue_inst_, 0,
m_n_size, true));
result = inOutC.UnMapBuffer(cl_context_ref.command_queue_inst_, C);
if (!result) {
break;
}
@@ -293,20 +262,10 @@ void addition_cl(const float *input, float *res, unsigned int size) {

size_t dim1_size = sizeof(float) * size;
opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
nullptr);
(void *)input);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
if (!result) {
break;
}

result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, false,
res);

result =
kernel_addition_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
@@ -333,7 +292,10 @@ void addition_cl(const float *input, float *res, unsigned int size) {
break;
}

result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
// to avoid cache inconsistency
res = (float *)(inOutRes.MapBuffer(cl_context_ref.command_queue_inst_, 0,
dim1_size, true));
result = inOutRes.UnMapBuffer(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
@@ -354,12 +316,7 @@ void sscal_cl(float *X, const unsigned int N, const float alpha) {

size_t x_size = N * sizeof(float);

opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, nullptr);

result = inputX.WriteData(cl_context_ref.command_queue_inst_, X);
if (!result) {
break;
}
opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, X);

result = kernel_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem));
if (!result) {
@@ -380,7 +337,10 @@ void sscal_cl(float *X, const unsigned int N, const float alpha) {
break;
}

result = inputX.ReadData(cl_context_ref.command_queue_inst_, X);
// to avoid cache inconsistency
X = (float *)(inputX.MapBuffer(cl_context_ref.command_queue_inst_, 0,
x_size, 0));
result = inputX.UnMapBuffer(cl_context_ref.command_queue_inst_, X);
if (!result) {
break;
}
113 changes: 37 additions & 76 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
@@ -31,28 +31,14 @@ void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
size_t dim1_size = sizeof(cl_half) * dim1;
size_t dim2_size = sizeof(cl_half) * dim2;
opencl::Buffer inputA(cl_context_ref.context_inst_,
dim1 * dim2 * sizeof(cl_half), true, nullptr);
dim1 * dim2 * sizeof(cl_half), true,
(void *)matAdata);

opencl::Buffer inputX(cl_context_ref.context_inst_, dim2_size, true,
nullptr);
(void *)vecXdata);

opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}

result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}

result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, false,
vecYdata);

result =
kernel_sgemv_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
@@ -91,7 +77,10 @@ void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
break;
}

result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
// to avoid cache inconsistency
vecYdata = (__fp16 *)(inOutY.MapBuffer(cl_context_ref.command_queue_inst_,
0, dim1_size, true));
result = inOutY.UnMapBuffer(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
@@ -117,23 +106,13 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata,
size_t dim1_size = sizeof(cl_half) * dim1;

opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);
(void *)vecAdata);

opencl::Buffer inputX(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(__fp16), true,
&cl_ret);
(void *)vecXdata);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, vecAdata);
if (!result) {
break;
}

result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(__fp16),
false, &cl_ret);

result =
kernel_dot_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
@@ -167,7 +146,11 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata,
break;
}

result = dotResult.ReadData(cl_context_ref.command_queue_inst_, &cl_ret);
// to avoid cache inconsistency
__fp16 *tmp = (__fp16 *)(dotResult.MapBuffer(
cl_context_ref.command_queue_inst_, 0, sizeof(__fp16), true));
cl_ret = *tmp;
result = dotResult.UnMapBuffer(cl_context_ref.command_queue_inst_, tmp);
if (!result) {
break;
}
@@ -213,28 +196,12 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
size_t m_n_size = M * N * sizeof(cl_half);

opencl::Buffer inputA(cl_context_ref.context_inst_, m_k_size, true,
nullptr);
(void *)A);

opencl::Buffer inputB(cl_context_ref.context_inst_, k_n_size, true,
nullptr);

opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, A);
if (!result) {
break;
}
(void *)B);

result = inputB.WriteData(cl_context_ref.command_queue_inst_, B);
if (!result) {
break;
}

result = inOutC.WriteData(cl_context_ref.command_queue_inst_, C);
if (!result) {
break;
}
opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, false, C);

result =
kernel_sgemm_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
@@ -283,7 +250,10 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
break;
}

result = inOutC.ReadData(cl_context_ref.command_queue_inst_, C);
// to avoid cache inconsistency
C = (__fp16 *)(inOutC.MapBuffer(cl_context_ref.command_queue_inst_, 0,
m_n_size, true));
result = inOutC.UnMapBuffer(cl_context_ref.command_queue_inst_, C);
if (!result) {
break;
}
@@ -305,20 +275,10 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {

size_t dim1_size = sizeof(cl_half) * size;
opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
nullptr);
(void *)input);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
if (!result) {
break;
}

result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, false,
res);

result =
kernel_addition_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
Expand Down Expand Up @@ -346,7 +306,10 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
break;
}

result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
// to avoid cache inconsistency
res = (__fp16 *)(inOutRes.MapBuffer(cl_context_ref.command_queue_inst_, 0,
dim1_size, true));
result = inOutRes.UnMapBuffer(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
@@ -367,12 +330,7 @@ void sscal_cl(__fp16 *X, const unsigned int N, const float alpha) {

size_t x_size = N * sizeof(cl_half);

opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, nullptr);

result = inputX.WriteData(cl_context_ref.command_queue_inst_, X);
if (!result) {
break;
}
opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, X);

result =
kernel_sscal_fp16_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem));
@@ -395,7 +353,10 @@ void sscal_cl(__fp16 *X, const unsigned int N, const float alpha) {
break;
}

result = inputX.ReadData(cl_context_ref.command_queue_inst_, X);
// to avoid cache inconsistency
X = (__fp16 *)(inputX.MapBuffer(cl_context_ref.command_queue_inst_, 0,
x_size, 0));
result = inputX.UnMapBuffer(cl_context_ref.command_queue_inst_, X);
if (!result) {
break;
}