Skip to content

Commit

Permalink
Support LBM
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Dec 15, 2024
1 parent 14f48f3 commit 92c6573
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 62 deletions.
59 changes: 10 additions & 49 deletions benchmark/LBM/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
# User Options
#===============================================================================

CUDA_PATH ?= /usr/local/cuda-11.3
CUDA_PATH ?= /usr/local/cuda-11.8

ENZYME_PATH ?= /home/wsmoses/Enzyme/enzyme/build13Fast/Enzyme/ClangEnzyme-13.so
CLANG_PATH ?= /home/wsmoses/llvm-project/buildfast/bin/clang++
CLAD_PATH ?= ../../build/./lib/clad.so
CLANG_PATH ?= /usr/lib/llvm-17/bin/clang++

OPTIMIZE ?= yes

Expand All @@ -27,7 +27,7 @@ SM_VERSION = 60
# Program name & source code list
#===============================================================================

program = rsbench
program = lbm

source = lbm.cu main.cc parboil_cuda.c args.c

Expand All @@ -38,17 +38,17 @@ obj = $(source:.cu=.o)
#===============================================================================

# Standard Flags
CFLAGS := -mllvm -max-heap-to-stack-size=1000000 -I $(CUDA_PATH)/include -I .
CFLAGS := -mllvm -max-heap-to-stack-size=1000000 -I .
# -mllvm -max-heap-to-stack-size=-1
# -Rpass=attributor -mllvm -debug -mllvm -debug-only=attributor


CC := $(CLANG_PATH)
CFLAGS += -ffast-math -fno-experimental-new-pass-manager --cuda-path=$(CUDA_PATH) -L$(CUDA_PATH)/lib64 --cuda-gpu-arch=sm_$(SM_VERSION) -std=c++11 -Xclang -load -Xclang $(ENZYME_PATH)

CFLAGS := -I $(CUDA_PATH)/include --cuda-path=$(CUDA_PATH) -L$(CUDA_PATH)/lib64 --cuda-gpu-arch=sm_$(SM_VERSION) \
-std=c++17 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang $(CLAD_PATH) -I../../include

# Linker Flags
LDFLAGS = "-lcudart_static" "-ldl" "-lrt" -lpthread -lm
LDFLAGS = "-lcudart_static" "-ldl" "-lrt" -lpthread -lm -lstdc++

# Debug Flags
ifeq ($(DEBUG),yes)
Expand Down Expand Up @@ -89,45 +89,6 @@ else
CFLAGS += -DALLOW_AD=1
endif


# Optimization Flags
ifeq ($(NEWCACHE),yes)
CFLAGS += -mllvm -enzyme-new-cache=1 -mllvm -enzyme-mincut-cache=1
ifeq ($(ABI),yes)
CFLAGS += -DSIZE=20
endif
else
CFLAGS += -mllvm -enzyme-new-cache=0 -mllvm -enzyme-mincut-cache=0
ifeq ($(ABI),yes)
CFLAGS += -DSIZE=80
endif
endif

ifeq ($(AA),yes)
CFLAGS += -mllvm -enzyme-aggressive-aa=1
else
CFLAGS += -mllvm -enzyme-aggressive-aa=0
endif


ifeq ($(PHISTRUCT),yes)
CFLAGS += -mllvm -enzyme-phi-restructure=1
else
CFLAGS += -mllvm -enzyme-phi-restructure=0
endif

ifeq ($(COALESE),yes)
CFLAGS += -mllvm -enzyme-coalese
endif

ifeq ($(CACHELICM),yes)
CFLAGS += -mllvm -enzyme-loop-invariant-cache=1
else
CFLAGS += -mllvm -enzyme-loop-invariant-cache=0
endif



#===============================================================================
# Targets to Build
#===============================================================================
Expand All @@ -143,7 +104,7 @@ $(program): $(obj) lbm_kernel.cu Makefile


clean:
rm -rf rsbench $(obj)
rm -rf lbm lbm.o

run:
./rsbench
./lbm
2 changes: 1 addition & 1 deletion benchmark/LBM/args.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

#include <parboil.h>
#include "parboil.h"
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
Expand Down
6 changes: 5 additions & 1 deletion benchmark/LBM/lbm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#define REAL_MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
#define TOTAL_MARGIN (2*PADDED_X*PADDED_Y*N_CELL_ENTRIES)

#include "clad/Differentiator/Differentiator.h"

/******************************************************************************/

__attribute__((noinline))
Expand Down Expand Up @@ -139,7 +141,9 @@ __host__ void CUDA_LBM_kernel_loop( int nTimeSteps, LBM_Grid srcGrid, LBM_Grid s

cudaMemcpy(&here[0], srcGrid + start, N * sizeof(float), cudaMemcpyDeviceToHost);
#endif
__enzyme_autodiff((void*)CUDA_LBM_kernel_loop_inner, nTimeSteps, srcGrid, srcGridb, dstGrid, dstGridb);
auto grad = clad::gradient(CUDA_LBM_kernel_loop_inner, "srcGrid, dstGrid");
grad.execute(nTimeSteps, srcGrid, dstGrid, srcGridb, dstGridb);
// __enzyme_autodiff((void*)CUDA_LBM_kernel_loop_inner, nTimeSteps, srcGrid, srcGridb, dstGrid, dstGridb);
#ifdef ALLOCATOR
delete A;
#endif
Expand Down
15 changes: 5 additions & 10 deletions benchmark/LBM/lbm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
// includes, project
#include "main.h"
#include "lbm.h"
#include "clad/Differentiator/Differentiator.h"
#ifndef __MCUDA__
#include <cuda.h>
#else
Expand Down Expand Up @@ -200,13 +199,9 @@ __global__ void performStreamCollide_kernel_wrapper( float* srcGrid, float* dstG
performStreamCollide_kernel(srcGrid, dstGrid );
}

int main() {
auto grad = clad::gradient(performStreamCollide_kernel_wrapper);
}

#ifdef ALLOW_AD
struct Byte20 {
char x[SIZE];
char x[SIZE_X];
};

extern __device__ int enzyme_dup;
Expand All @@ -217,28 +212,28 @@ __device__ Byte20 __enzyme_augmentfwd(void*, int, size_t, int, float*, float*, i
__global__ void performStreamCollide_augmented( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
{
size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
// tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
}

__device__ void __enzyme_reverse(void*, int, size_t, int, float*, float*, int, float*, float*, Byte20);
__global__ void performStreamCollide_gradient( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
{
size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
__enzyme_reverse((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
// __enzyme_reverse((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
}
#else
__device__ Byte20 __enzyme_augmentfwd(void*, int, float*, float*, int, float*, float*);
__global__ void performStreamCollide_augmented( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
{
size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
// tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
}

__device__ void __enzyme_reverse(void*, int, float*, float*, int, float*, float*, Byte20);
__global__ void performStreamCollide_gradient( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
{
size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
__enzyme_reverse((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
// __enzyme_reverse((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
}
#endif
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmark/LBM/parboil_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* (c) 2007 The Board of Trustees of the University of Illinois.
*/

#include <parboil.h>
#include "parboil.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Expand Down

0 comments on commit 92c6573

Please sign in to comment.