Skip to content

Commit

Permalink
Add atomic double test and fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 authored and vgvassilev committed Sep 24, 2024
1 parent 02f87f3 commit 6d6f853
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 36 deletions.
19 changes: 7 additions & 12 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1493,14 +1493,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
m_Context.getTranslationUnitDecl());

FunctionDecl* atomicAddFunc = nullptr;
for (LookupResult::iterator it = lookupResult.begin();
it != lookupResult.end(); it++) {
NamedDecl* decl = *it;
for (auto decl : lookupResult) {
// FIXME: check for underlying types of the pointers
if (dyn_cast<FunctionDecl>(decl)->getReturnType() ==
result->getType()) {
printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(),
result->getType().getAsString().c_str());
atomicAddFunc = dyn_cast<FunctionDecl>(decl);
break;
}
Expand Down Expand Up @@ -2312,21 +2308,20 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd");
LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(),
Sema::LookupOrdinaryName);
m_Sema.LookupQualifiedName(lookupResult, m_Context.getTranslationUnitDecl());
m_Sema.LookupQualifiedName(lookupResult,
m_Context.getTranslationUnitDecl());

FunctionDecl* atomicAddFunc = nullptr;
for (LookupResult::iterator it = lookupResult.begin();
it != lookupResult.end(); it++) {
NamedDecl* decl = *it;
for (auto decl : lookupResult) {
// FIXME: check for underlying types of the pointers
if (dyn_cast<FunctionDecl>(decl)->getReturnType() == derivedE->getType()) {
printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), derivedE->getType().getAsString().c_str());
if (dyn_cast<FunctionDecl>(decl)->getReturnType() ==
derivedE->getType()) {
atomicAddFunc = dyn_cast<FunctionDecl>(decl);
break;
}
}
assert(atomicAddFunc && "atomicAdd function not found");
llvm::SmallVector<Expr*, 2> atomicArgs = {diff_dx, dfdx()};
llvm::SmallVector<Expr*, 2> atomicArgs = {diff_dx, dfdx()};
Expr* atomicCall =
BuildCallExprToFunction(atomicAddFunc, atomicArgs);

Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if(CUDAToolkit_FOUND)
get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY)

set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_50")
set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_60")

if(TARGET nvptx-arch)
get_property(LIBOMPTARGET_NVPTX_ARCH TARGET nvptx-arch PROPERTY LOCATION)
Expand Down
154 changes: 131 additions & 23 deletions test/CUDA/GradientKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ __global__ void kernel(int *a) {
//CHECK-NEXT: int _r_d0 = *_d_a;
//CHECK-NEXT: *_d_a = 0;
//CHECK-NEXT: *_d_a += _r_d0 * *a;
//CHECK-NEXT: *_d_a += *a * _r_d0;
//CHECK-NEXT: atomicAdd(_d_a, *a * _r_d0);
//CHECK-NEXT: }
//CHECK-NEXT: }

Expand All @@ -46,7 +46,7 @@ __global__ void add_kernel(int *out, int *in) {
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0);
//CHECK-NEXT: }
//CHECK-NEXT: }

Expand All @@ -60,7 +60,7 @@ __global__ void add_kernel_2(int *out, int *in) {
//CHECK-NEXT: {
//CHECK-NEXT: out[threadIdx.x] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x];
//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0;
//CHECK-NEXT: atomicAdd(&_d_in[threadIdx.x], _r_d0);
//CHECK-NEXT: }
//CHECK-NEXT: }

Expand All @@ -79,23 +79,24 @@ __global__ void add_kernel_3(int *out, int *in) {
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t2;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0);
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_4(int *out, int *in) {
__global__ void add_kernel_4(int *out, int *in, int N) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < 5) {
if (index < N) {
int sum = 0;
// Each thread sums elements in steps of warpSize
for (int i = index; i < 5; i += warpSize) {
for (int i = index; i < N; i += warpSize) {
sum += in[i];
}
out[index] = sum;
}
}

// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) {
// CHECK: void add_kernel_4_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _d_N = 0;
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
Expand All @@ -110,13 +111,13 @@ __global__ void add_kernel_4(int *out, int *in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: _cond0 = index0 < 5;
//CHECK-NEXT: _cond0 = index0 < N;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t2 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!(i < 5))
//CHECK-NEXT: if (!(i < N))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t2++;
Expand Down Expand Up @@ -147,31 +148,32 @@ __global__ void add_kernel_4(int *out, int *in) {
//CHECK-NEXT: {
//CHECK-NEXT: sum = clad::pop(_t4);
//CHECK-NEXT: int _r_d1 = _d_sum;
//CHECK-NEXT: _d_in[i] += _r_d1;
//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1);
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: _d_index += _d_i;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_5(int *out, int *in) {
__global__ void add_kernel_5(int *out, int *in, int N) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < 5) {
if (index < N) {
int sum = 0;
// Calculate the total number of threads in the grid
int totalThreads = blockDim.x * gridDim.x;
// Each thread sums elements in steps of the total number of threads in the grid
for (int i = index; i < 5; i += totalThreads) {
for (int i = index; i < N; i += totalThreads) {
sum += in[i];
}
out[index] = sum;
}
}

// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) {
// CHECK: void add_kernel_5_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _d_N = 0;
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
//CHECK-NEXT: unsigned int _t2;
//CHECK-NEXT: unsigned int _t3;
Expand All @@ -188,7 +190,7 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: _cond0 = index0 < 5;
//CHECK-NEXT: _cond0 = index0 < N;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t3 = blockDim.x;
Expand All @@ -197,7 +199,7 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: _t4 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!(i < 5))
//CHECK-NEXT: if (!(i < N))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t4++;
Expand Down Expand Up @@ -229,20 +231,43 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: {
//CHECK-NEXT: sum = clad::pop(_t6);
//CHECK-NEXT: int _r_d1 = _d_sum;
//CHECK-NEXT: _d_in[i] += _r_d1;
//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1);
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: _d_index += _d_i;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_6(int *a, int *b) {
// Each thread copies b[0] into two consecutive slots of a: a[2i] and a[2i+1].
// Every thread reads the same b[0], so the generated reverse pass must
// accumulate into _d_b[0] with atomicAdd (see the CHECK lines above); the
// double overload of atomicAdd requires SM60+ -- hence the sm_60 arch bump
// in test/CMakeLists.txt in this same commit.
// NOTE(review): no bounds guard on index -- the launch must supply exactly
// half as many threads as a has elements; confirm against the call site.
__global__ void add_kernel_6(double *a, double *b) {
// flat global thread id
int index = threadIdx.x + blockIdx.x * blockDim.x;
a[2 * index] = b[0];
a[2 * index + 1] = b[0];
}

// CHECK: void add_kernel_6_grad(double *a, double *b, double *_d_a, double *_d_b) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: double _t2 = a[2 * index0];
//CHECK-NEXT: a[2 * index0] = b[0];
//CHECK-NEXT: double _t3 = a[2 * index0 + 1];
//CHECK-NEXT: a[2 * index0 + 1] = b[0];
//CHECK-NEXT: {
//CHECK-NEXT: a[2 * index0 + 1] = _t3;
//CHECK-NEXT: double _r_d1 = _d_a[2 * index0 + 1];
//CHECK-NEXT: _d_a[2 * index0 + 1] = 0.;
//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d1);
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: a[2 * index0] = _t2;
//CHECK-NEXT: double _r_d0 = _d_a[2 * index0];
//CHECK-NEXT: _d_a[2 * index0] = 0.;
//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d0);
//CHECK-NEXT: }
//CHECK-NEXT:}

#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
Expand Down Expand Up @@ -312,6 +337,76 @@ __global__ void add_kernel_6(int *a, int *b) {
free(res); \
}

/* TEST_2_N: host-side driver for an N-parameterized kernel F(out, in, N).
 * Seeds the primal input x and the output adjoint dy with 5s, zeroes the
 * primal output y and the input adjoint dx, differentiates F with
 * clad::gradient(F, args), launches the gradient kernel (on a freshly
 * created stream when use_stream is true), then prints the N ints
 * accumulated in dx as "v, v, ..., v\n" for CHECK-EXEC to match.
 * NOTE(review): malloc/cudaMemcpy/cudaStreamCreate results are unchecked
 * and the created stream is never destroyed -- presumably acceptable for
 * a short-lived lit test; confirm. */
#define TEST_2_N(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
{ \
/* host staging buffers: fives = seed values, zeros = cleared adjoints */ \
int *fives = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
int *zeros = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
zeros[i] = 0; \
} \
/* primal in = 5s, primal out = 0s, d_out seeded = 5s, d_in cleared = 0s */ \
cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F, args); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
/* stream overload: shared_mem + stream precede the kernel arguments */ \
test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, N, dy, dx); \
} \
else { \
test.execute_kernel(grid, block, y, x, N, dy, dx); \
} \
/* kernel launch is async: sync before reading the adjoints back */ \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
printf("%d\n", res[N-1]); \
free(fives); \
free(zeros); \
free(res); \
}

/* TEST_2_D: double-precision variant of the TEST_2 driver, for kernels
 * F(a, b) that take no trailing N parameter (e.g. add_kernel_6). Seeds the
 * primal input x and output adjoint dy with 5.0, zeroes the primal output y
 * and input adjoint dx, runs the clad-generated gradient kernel (on a fresh
 * stream when use_stream is true), then prints the N doubles accumulated in
 * dx with "%0.2f" formatting for CHECK-EXEC.
 * NOTE(review): unlike TEST_2_N, N is only a host-side buffer size here and
 * is not forwarded to execute_kernel -- matches the N-less kernel signature.
 * NOTE(review): malloc/cudaMemcpy/cudaStreamCreate results are unchecked and
 * the created stream is never destroyed -- presumably fine for a lit test;
 * confirm. */
#define TEST_2_D(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
{ \
/* host staging buffers: fives = seed values, zeros = cleared adjoints */ \
double *fives = (double*)malloc(N * sizeof(double)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
double *zeros = (double*)malloc(N * sizeof(double)); \
for(int i = 0; i < N; i++) { \
zeros[i] = 0; \
} \
/* primal in = 5s, primal out = 0s, d_out seeded = 5s, d_in cleared = 0s */ \
cudaMemcpy(x, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
cudaMemcpy(y, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
cudaMemcpy(dy, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F, args); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
/* stream overload: shared_mem + stream precede the kernel arguments */ \
test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
} \
else { \
test.execute_kernel(grid, block, y, x, dy, dx); \
} \
/* kernel launch is async: sync before reading the adjoints back */ \
cudaDeviceSynchronize(); \
double *res = (double*)malloc(N * sizeof(double)); \
cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%0.2f, ", res[i]); \
} \
printf("%0.2f\n", res[N-1]); \
free(fives); \
free(zeros); \
free(res); \
}


int main(void) {
int *a, *d_a;
Expand Down Expand Up @@ -340,14 +435,27 @@ int main(void) {
TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 10); // CHECK-EXEC: 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
TEST_2_N(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2_N(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5

cudaFree(dummy_in);
cudaFree(dummy_out);
cudaFree(d_out);
cudaFree(d_in);

double *dummy_in_double, *dummy_out_double, *d_out_double, *d_in_double;
cudaMalloc(&dummy_in_double, 10 * sizeof(double));
cudaMalloc(&dummy_out_double, 10 * sizeof(double));
cudaMalloc(&d_out_double, 10 * sizeof(double));
cudaMalloc(&d_in_double, 10 * sizeof(double));

TEST_2_D(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00

cudaFree(dummy_in_double);
cudaFree(dummy_out_double);
cudaFree(d_out_double);
cudaFree(d_in_double);


return 0;
}

0 comments on commit 6d6f853

Please sign in to comment.