Support LBM

kchristin22 · Dec 11, 2024 · 79b282d · 79b282d
1 parent 8fde8c1
commit 79b282d
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 62 deletions.
diff --git a/benchmark/LBM/Makefile b/benchmark/LBM/Makefile
@@ -2,10 +2,10 @@
 # User Options
 #===============================================================================
 
-CUDA_PATH ?= /usr/local/cuda-11.3
+CUDA_PATH ?= /usr/local/cuda-11.8
 
-ENZYME_PATH ?= /home/wsmoses/Enzyme/enzyme/build13Fast/Enzyme/ClangEnzyme-13.so
-CLANG_PATH ?= /home/wsmoses/llvm-project/buildfast/bin/clang++
+CLAD_PATH ?= ../../build/./lib/clad.so
+CLANG_PATH ?= /usr/lib/llvm-17/bin/clang++
 
 OPTIMIZE    ?= yes
 
@@ -27,7 +27,7 @@ SM_VERSION  = 60
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = lbm
 
 source = lbm.cu main.cc parboil_cuda.c args.c
 
@@ -38,17 +38,17 @@ obj = $(source:.cu=.o)
 #===============================================================================
 
 # Standard Flags
-CFLAGS := -mllvm -max-heap-to-stack-size=1000000 -I $(CUDA_PATH)/include -I .
+CFLAGS := -mllvm -max-heap-to-stack-size=1000000 -I .
 # -mllvm -max-heap-to-stack-size=-1
 # -Rpass=attributor -mllvm -debug -mllvm -debug-only=attributor
 
 
 CC := $(CLANG_PATH)
-CFLAGS += -ffast-math -fno-experimental-new-pass-manager --cuda-path=$(CUDA_PATH) -L$(CUDA_PATH)/lib64 --cuda-gpu-arch=sm_$(SM_VERSION)  -std=c++11 -Xclang -load -Xclang $(ENZYME_PATH)
-
+CFLAGS := -I $(CUDA_PATH)/include --cuda-path=$(CUDA_PATH) -L$(CUDA_PATH)/lib64 --cuda-gpu-arch=sm_$(SM_VERSION) \
+          -std=c++17 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang $(CLAD_PATH) -I../../include  
 
 # Linker Flags
-LDFLAGS = "-lcudart_static" "-ldl" "-lrt" -lpthread -lm
+LDFLAGS = "-lcudart_static" "-ldl" "-lrt" -lpthread -lm -lstdc++
 
 # Debug Flags
 ifeq ($(DEBUG),yes)
@@ -89,45 +89,6 @@ else
   CFLAGS += -DALLOW_AD=1 
 endif
 
-
-# Optimization Flags
-ifeq ($(NEWCACHE),yes)
-  CFLAGS += -mllvm -enzyme-new-cache=1 -mllvm -enzyme-mincut-cache=1
-  ifeq ($(ABI),yes)
-    CFLAGS += -DSIZE=20
-  endif
-else
-  CFLAGS += -mllvm -enzyme-new-cache=0 -mllvm -enzyme-mincut-cache=0
-  ifeq ($(ABI),yes)
-    CFLAGS += -DSIZE=80
-  endif
-endif
-
-ifeq ($(AA),yes)
-  CFLAGS += -mllvm -enzyme-aggressive-aa=1
-else
-  CFLAGS += -mllvm -enzyme-aggressive-aa=0
-endif
-
-
-ifeq ($(PHISTRUCT),yes)
-  CFLAGS += -mllvm -enzyme-phi-restructure=1
-else
-  CFLAGS += -mllvm -enzyme-phi-restructure=0
-endif
-
-ifeq ($(COALESE),yes)
-  CFLAGS += -mllvm -enzyme-coalese
-endif
-
-ifeq ($(CACHELICM),yes)
-  CFLAGS +=  -mllvm -enzyme-loop-invariant-cache=1
-else
-  CFLAGS +=  -mllvm -enzyme-loop-invariant-cache=0
-endif
-
-
-
 #===============================================================================
 # Targets to Build
 #===============================================================================
@@ -143,7 +104,7 @@ $(program): $(obj) lbm_kernel.cu Makefile
 
 
 clean:
-	rm -rf rsbench $(obj)
+	rm -rf lbm lbm.o
 
 run:
-	./rsbench
+	./lbm
diff --git a/benchmark/LBM/args.c b/benchmark/LBM/args.c
@@ -1,5 +1,5 @@
 
-#include <parboil.h>
+#include "parboil.h"
 #include <errno.h>
 #include <limits.h>
 #include <stdlib.h>

diff --git a/benchmark/LBM/lbm.cu b/benchmark/LBM/lbm.cu
@@ -34,6 +34,8 @@
 #define REAL_MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
 #define TOTAL_MARGIN (2*PADDED_X*PADDED_Y*N_CELL_ENTRIES)
 
+#include "clad/Differentiator/Differentiator.h"
+
 /******************************************************************************/
 
 __attribute__((noinline))
@@ -139,7 +141,9 @@ __host__ void CUDA_LBM_kernel_loop( int nTimeSteps, LBM_Grid srcGrid, LBM_Grid s
 
 	cudaMemcpy(&here[0], srcGrid + start, N * sizeof(float), cudaMemcpyDeviceToHost);
 #endif
-	__enzyme_autodiff((void*)CUDA_LBM_kernel_loop_inner, nTimeSteps, srcGrid, srcGridb, dstGrid, dstGridb);
+    auto grad = clad::gradient(CUDA_LBM_kernel_loop_inner, "srcGrid, dstGrid");
+    grad.execute(nTimeSteps, srcGrid, dstGrid, srcGridb, dstGridb);
+    // __enzyme_autodiff((void*)CUDA_LBM_kernel_loop_inner, nTimeSteps, srcGrid, srcGridb, dstGrid, dstGridb);
 #ifdef ALLOCATOR
 	delete A;
 #endif

diff --git a/benchmark/LBM/lbm_kernel.cu b/benchmark/LBM/lbm_kernel.cu
@@ -18,7 +18,6 @@
 // includes, project
 #include "main.h"
 #include "lbm.h"
-#include "clad/Differentiator/Differentiator.h"
 #ifndef __MCUDA__
 #include <cuda.h>
 #else
@@ -200,13 +199,9 @@ __global__ void performStreamCollide_kernel_wrapper( float* srcGrid, float* dstG
 	performStreamCollide_kernel(srcGrid, dstGrid );
 }
 
-int main() {
-	auto grad = clad::gradient(performStreamCollide_kernel_wrapper);
-}
-
 #ifdef ALLOW_AD
 struct Byte20 {
-	char x[SIZE];
+	char x[SIZE_X];
 };
 
 extern __device__ int enzyme_dup;
@@ -217,28 +212,28 @@ __device__ Byte20 __enzyme_augmentfwd(void*, int, size_t, int, float*, float*, i
 __global__ void performStreamCollide_augmented( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
 {
 	size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
-	tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
+	// tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
 }
 
 __device__ void __enzyme_reverse(void*, int, size_t, int, float*, float*, int, float*, float*, Byte20);
 __global__ void performStreamCollide_gradient( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
 {
 	size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
-	__enzyme_reverse((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
+	// __enzyme_reverse((void*)performStreamCollide_kernel, enzyme_allocated, sizeof(Byte20), enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
 }
 #else
 __device__ Byte20 __enzyme_augmentfwd(void*, int, float*, float*, int, float*, float*);
 __global__ void performStreamCollide_augmented( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
 {
 	size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
-	tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
+	// tape[idx] = __enzyme_augmentfwd((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst);
 }
 
 __device__ void __enzyme_reverse(void*, int, float*, float*, int, float*, float*, Byte20);
 __global__ void performStreamCollide_gradient( float* src, float* dsrc, float* dst, float* ddst, Byte20* tape)
 {
 	size_t idx = threadIdx.x + SIZE_X * (blockIdx.x + SIZE_Y * blockIdx.y);
-	__enzyme_reverse((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
+	// __enzyme_reverse((void*)performStreamCollide_kernel, enzyme_dup, src, dsrc, enzyme_dup, dst, ddst, tape[idx]);
 }
 #endif
 #endif

diff --git a/benchmark/LBM/parboil_cuda.c b/benchmark/LBM/parboil_cuda.c
@@ -2,7 +2,7 @@
  * (c) 2007 The Board of Trustees of the University of Illinois.
  */
 
-#include <parboil.h>
+#include "parboil.h"
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>