Enable warp size 64

anthonix · Jun 14, 2024 · 85b3f13
1 parent c80bc8a
commit 85b3f13
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 9 deletions.
diff --git a/llmc/amd_common.cuh b/llmc/amd_common.cuh
@@ -303,18 +303,20 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 }
 #else
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
+#ifdef WAVEFRONTSIZE64
+    for (int mask = 32; mask > 0; mask >>= 1) { x += __shfl_xor(x, mask, 64); }
+#else
+    for (int mask = 16; mask > 0; mask >>= 1) { x += __shfl_xor(x, mask, 32); }
+#endif
     return x;
 }
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
+#ifdef WAVEFRONTSIZE64
+    for (int mask = 32; mask > 0; mask >>= 1) { x = fmaxf(x, __shfl_xor(x, mask, 64)); }
+#else
+    for (int mask = 16; mask > 0; mask >>= 1) { x = fmaxf(x, __shfl_xor(x, mask, 32)); }
+#endif
     return x;
 }
 #endif

diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h
@@ -29,7 +29,11 @@ extern cudaDeviceProp deviceProp;
 
 // WarpSize is not a compile time constant
 // Defining here like this possibly allows the compiler to optimize better
+#ifdef WAVEFRONTSIZE64
+#define WARP_SIZE 64U
+#else
 #define WARP_SIZE 32U
+#endif
 
 // try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance
 // this needs to be defines rather than queried to be used for __launch_bounds__

diff --git a/llmc/matmul.cuh b/llmc/matmul.cuh
@@ -190,7 +190,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias,
 
         const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024;
 
-        dim3 block_dim = {4, 8, (unsigned)block_size/WARP_SIZE};
+        dim3 block_dim = {4, WARP_SIZE/4, (unsigned)block_size/WARP_SIZE}; // y = 8 with WARP_SIZE 32, 16 with WARP_SIZE 64 (the "64 at BF16" note below assumes y = 8)
         const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16
         const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16
         const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU!