From aac1754e4a6a528932d580776be06a045ae2c6c2 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 28 Feb 2024 17:48:15 +0000 Subject: [PATCH 1/6] Layernorm optimizations: Bulk conversions (packed halfs into half2, using vectors of half2); block and warp reduce with AMD wavesize 64 (vs 32); using smaller block sizes for improved block occupancy on CUs Use larger block sizes for decode; optimize warp and block reduce fully Refactor vector to use half to maintain same alignment as c10::Half; move packed logic into member functions Add a few missing unroll directives Fix blockReduce stall caused by warp divergence on CUDA (vLLM uses universal masks) Refactor vector type to enable optimizations for bf16 Re-apply the blockReduceSum fix for warp divergence Hotfix: Disable BF16 opts due to ROCm 5.7 incompatibility Remove redundant inline specifiers; preparing for upstream --- csrc/layernorm_kernels.cu | 288 +++++++++++++++++++++++++++++++++++--- csrc/reduction_utils.cuh | 56 ++++---- 2 files changed, 296 insertions(+), 48 deletions(-) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 6d34d014c858e..5ee8b2609027e 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -4,6 +4,16 @@ #include "dispatch_utils.h" #include "reduction_utils.cuh" +#ifndef USE_ROCM + #include + #include +#else + #include + #include + + using __nv_bfloat16 = __hip_bfloat16; + using __nv_bfloat162 = __hip_bfloat162; +#endif namespace vllm { @@ -35,9 +45,200 @@ __global__ void rms_norm_kernel( } } -// TODO: Further optimize this kernel. -template -__global__ void fused_add_rms_norm_kernel( + +/* Converter structs for the conversion from torch types to HIP/CUDA types, + and the associated type conversions within HIP/CUDA. These helpers need + to be implemented for now because the relevant type conversion + operators/constructors are not consistently implemented by HIP/CUDA, so + a generic conversion via type casts cannot be implemented. + + Each struct should have the member static constexpr bool `exists`: + If false, the optimized kernel is not used for the corresponding torch type. + If true, the struct should be fully defined as shown in the examples below. + */ +template +struct _typeConvert { static constexpr bool exists = false; }; + +template<> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __half; + using packed_hip_type = __half2; + + __device__ static inline float convert(hip_type x) { return __half2float(x); } + __device__ static inline float2 convert(packed_hip_type x) { return __half22float2(x); } + __device__ static inline hip_type convert(float x) { return __float2half_rn(x); } + __device__ static inline packed_hip_type convert(float2 x) { return __float22half2_rn(x); } +}; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +// CUDA_ARCH < 800 does not have BF16 support +template<> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __nv_bfloat16; + using packed_hip_type = __nv_bfloat162; + + __device__ static inline float convert(hip_type x) { return __bfloat162float(x); } + __device__ static inline float2 convert(packed_hip_type x) { return __bfloat1622float2(x); } + __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); } + __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); } +}; +#endif + + +/* Converter POD struct to generate vectorized and packed FP16/BF16 ops + for appropriate specializations of fused_add_rms_norm_kernel. + Only special member functions and functions that are necessary + in that kernel are implemented. + */ +template +struct _f16Vec { + /* Not theoretically necessary that width is a power of 2 but should + almost always be the case for optimization purposes */ + static_assert(width > 0 && (width & (width - 1)) == 0, + "Width is not a positive power of 2!"); + using Converter = _typeConvert; + using T1 = typename Converter::hip_type; + using T2 = typename Converter::packed_hip_type; + T1 data[width]; + + __device__ _f16Vec& operator+=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { + #pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i+1]}; + temp += T2{other.data[i], other.data[i+1]}; + data[i] = temp.x; + data[i+1] = temp.y; + } + } else { + #pragma unroll + for (int i = 0; i < width; ++i) + data[i] += other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { + #pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i+1]}; + temp *= T2{other.data[i], other.data[i+1]}; + data[i] = temp.x; + data[i+1] = temp.y; + } + } else { + #pragma unroll + for (int i = 0; i < width; ++i) + data[i] *= other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const float scale) { + if constexpr (width % 2 == 0) { + #pragma unroll + for (int i = 0; i < width; i += 2) { + float2 temp_f = Converter::convert(T2{data[i], data[i+1]}); + temp_f.x *= scale; + temp_f.y *= scale; + T2 temp = Converter::convert(temp_f); + data[i] = temp.x; + data[i+1] = temp.y; + } + } else { + #pragma unroll + for (int i = 0; i < width; ++i) { + float temp = Converter::convert(data[i]) * scale; + data[i] = Converter::convert(temp); + } + } + return *this; + } + + __device__ float sum_squares() const { + float result = 0.0f; + if constexpr (width % 2 == 0) { + #pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i+1]}); + result += z.x * z.x + z.y * z.y; + } + } else { + #pragma unroll + for (int i = 0; i < width; ++i) { + float x = Converter::convert(data[i]); + result += x * x; + } + } + return result; + } +}; + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +template +__global__ std::enable_if_t< + (width > 0) && _typeConvert::exists> fused_add_rms_norm_kernel( + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, + const int num_tokens, + const int hidden_size) { + // Ensures reinterpret_cast does not mutate address for alignment reasons + static_assert(alignof(scalar_t) == alignof(_f16Vec)); + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = input_v[id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + /* Keep the following if-else block in sync with the + calculation of max_block_size in fused_add_rms_norm */ + if (num_tokens < 256) { + variance = blockReduceSum(variance); + } else variance = blockReduceSum(variance); + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; + input_v[id] = temp; + } +} + + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. + */ +template +__global__ std::enable_if_t< + (width == 0) || !_typeConvert::exists> fused_add_rms_norm_kernel( scalar_t* __restrict__ input, // [..., hidden_size] scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] @@ -48,12 +249,17 @@ __global__ void fused_add_rms_norm_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) input[blockIdx.x * hidden_size + idx]; - x += (float) residual[blockIdx.x * hidden_size + idx]; + scalar_t z = input[blockIdx.x * hidden_size + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float) z; variance += x * x; - residual[blockIdx.x * hidden_size + idx] = (scalar_t) x; + residual[blockIdx.x * hidden_size + idx] = z; } - variance = blockReduceSum(variance); + /* Keep the following if-else block in sync with the + calculation of max_block_size in fused_add_rms_norm */ + if (num_tokens < 256) { + variance = blockReduceSum(variance); + } else variance = blockReduceSum(variance); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); } @@ -93,6 +299,21 @@ void rms_norm( }); } +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), \ + "fused_add_rms_norm_kernel", \ + [&] { \ + vllm::fused_add_rms_norm_kernel \ + <<>>( \ + input.data_ptr(), \ + residual.data_ptr(), \ + weight.data_ptr(), \ + epsilon, \ + num_tokens, \ + hidden_size); \ + }); + void fused_add_rms_norm( torch::Tensor& input, // [..., hidden_size] torch::Tensor& residual, // [..., hidden_size] @@ -102,19 +323,44 @@ void fused_add_rms_norm( int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "fused_add_rms_norm_kernel", - [&] { - vllm::fused_add_rms_norm_kernel<<>>( - input.data_ptr(), - residual.data_ptr(), - weight.data_ptr(), - epsilon, - num_tokens, - hidden_size); - }); -} + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, we have to narrow the vectors if the hidden_size does + not divide 8. + + Specifically, assuming hidden-size does not divide 8: + If the hidden_size divides 4, we can use a width-4 vector. + If the hidden_size divides 2 or 6, we can use a width-2 + vector. + If the hidden_size is odd, we can only use a width-1 vector + which provides no benefit over the base implementation + => we do not use the optimized kernel, which is signified + by setting width = 0. + */ + switch (hidden_size % 8) { + case 0: + LAUNCH_FUSED_ADD_RMS_NORM(8); + break; + case 2: + [[fallthrough]]; + case 6: + LAUNCH_FUSED_ADD_RMS_NORM(2); + break; + case 4: + LAUNCH_FUSED_ADD_RMS_NORM(4); + break; + default: + LAUNCH_FUSED_ADD_RMS_NORM(0); + break; + } +} \ No newline at end of file diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index c25464e866e55..0b3edb6453d98 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -20,44 +20,46 @@ #include "cuda_compat.h" namespace vllm { - -template +template __inline__ __device__ T warpReduceSum(T val) { -#pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) + static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0, + "numLanes is not a positive power of 2!"); + static_assert(numLanes <= WARP_SIZE); + #pragma unroll + for (int mask = numLanes >> 1; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } -__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) { - return warp_size - 1; -} - -__inline__ __device__ constexpr int _calculateWidShift(int warp_size) { - return 5 + (warp_size >> 6); +// Helper function to return the next largest power of 2 +static constexpr int _nextPow2(unsigned int num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } /* Calculate the sum of all elements in a block */ -template +template __inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[WARP_SIZE]; - constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE); - constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE); - int lane = threadIdx.x & LANE_MASK; - int wid = threadIdx.x >> WID_SHIFT; - - val = warpReduceSum(val); - - if (lane == 0) - shared[wid] = val; + static_assert(maxBlockSize <= 1024); + if constexpr (maxBlockSize > WARP_SIZE) { + val = warpReduceSum(val); + // Calculates max number of lanes that need to participate in the last warpReduce + constexpr int maxActiveLanes = (maxBlockSize + WARP_SIZE - 1) / WARP_SIZE; + static __shared__ T shared[maxActiveLanes]; + int lane = threadIdx.x % WARP_SIZE; + int wid = threadIdx.x / WARP_SIZE; + if (lane == 0) + shared[wid] = val; - __syncthreads(); + __syncthreads(); - // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent - // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f); - val = warpReduceSum(val); + val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] : 0.0f; + val = warpReduceSum(val); + } else { + // A single warpReduce is equal to blockReduce + val = warpReduceSum(val); + } return val; } -} // namespace vllm +} // namespace vllm \ No newline at end of file From d2f681ad526c1156969e40928170e6bbb07b6847 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 27 Mar 2024 18:28:44 +0000 Subject: [PATCH 2/6] Disable no half conv flags for CUDA --- cmake/utils.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6bf5d5130290b..c7d3d85389838 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -100,6 +100,11 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2") + list(REMOVE_ITEM GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") endif() elseif(${GPU_LANG} STREQUAL "HIP") From 51288368179a7a33c9efacc9398a1576ba327b6b Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 27 Mar 2024 21:01:55 +0000 Subject: [PATCH 3/6] Add more hidden sizes (including non-multiples of 8) to test --- tests/kernels/test_layernorm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index b1e3c1a7f07f5..210d59e4f32fa 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -5,7 +5,8 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing +HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, + 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] CUDA_DEVICES = [ From c0e37f6582918f04059e0100deb71cd0433a3d93 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 28 Mar 2024 13:56:00 +0000 Subject: [PATCH 4/6] Enforce 16 byte alignment for CUDA vectorized mem ops --- csrc/layernorm_kernels.cu | 49 +++++++++++++-------------------------- csrc/reduction_utils.cuh | 2 +- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 5ee8b2609027e..2d5c81a0c304e 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -87,13 +87,14 @@ struct _typeConvert { #endif -/* Converter POD struct to generate vectorized and packed FP16/BF16 ops +/* Vector POD struct to generate vectorized and packed FP16/BF16 ops for appropriate specializations of fused_add_rms_norm_kernel. Only special member functions and functions that are necessary - in that kernel are implemented. + in that kernel are implemented. Alignment to 16 bytes is required + to use 128-bit global memory ops. */ template -struct _f16Vec { +struct alignas(16) _f16Vec { /* Not theoretically necessary that width is a power of 2 but should almost always be the case for optimization purposes */ static_assert(width > 0 && (width & (width - 1)) == 0, @@ -190,8 +191,6 @@ __global__ std::enable_if_t< const float epsilon, const int num_tokens, const int hidden_size) { - // Ensures reinterpret_cast does not mutate address for alignment reasons - static_assert(alignof(scalar_t) == alignof(_f16Vec)); // Sanity checks on our vector struct and type-punned pointer arithmetic static_assert(std::is_pod_v<_f16Vec>); static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); @@ -232,7 +231,6 @@ __global__ std::enable_if_t< } } - /* Generic fused_add_rms_norm_kernel The width field is not used here but necessary for other specializations. */ @@ -335,32 +333,17 @@ void fused_add_rms_norm( with packed + vectorized ops. Max optimization is achieved with a width-8 vector of FP16/BF16s since we can load at most 128 bits at once in a global memory op. - However, we have to narrow the vectors if the hidden_size does - not divide 8. - - Specifically, assuming hidden-size does not divide 8: - If the hidden_size divides 4, we can use a width-4 vector. - If the hidden_size divides 2 or 6, we can use a width-2 - vector. - If the hidden_size is odd, we can only use a width-1 vector - which provides no benefit over the base implementation - => we do not use the optimized kernel, which is signified - by setting width = 0. + However, this requires each tensor's data to be aligned to 16 + bytes. */ - switch (hidden_size % 8) { - case 0: - LAUNCH_FUSED_ADD_RMS_NORM(8); - break; - case 2: - [[fallthrough]]; - case 6: - LAUNCH_FUSED_ADD_RMS_NORM(2); - break; - case 4: - LAUNCH_FUSED_ADD_RMS_NORM(4); - break; - default: - LAUNCH_FUSED_ADD_RMS_NORM(0); - break; + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 \ + && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); } -} \ No newline at end of file +} diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 0b3edb6453d98..80c1b95d00671 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -62,4 +62,4 @@ __inline__ __device__ T blockReduceSum(T val) { return val; } -} // namespace vllm \ No newline at end of file +} // namespace vllm From 677e045d7204be4860629f988621613c3cb45144 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 29 Mar 2024 03:39:12 +0000 Subject: [PATCH 5/6] Add back explicit cast to T in reduction_utils --- csrc/layernorm_kernels.cu | 6 +++--- csrc/reduction_utils.cuh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 2d5c81a0c304e..f1223e780917d 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -73,6 +73,7 @@ struct _typeConvert { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // CUDA_ARCH < 800 does not have BF16 support +// TODO: Add in ROCm support once public headers handle bf16 maturely template<> struct _typeConvert { static constexpr bool exists = true; @@ -89,9 +90,8 @@ struct _typeConvert { /* Vector POD struct to generate vectorized and packed FP16/BF16 ops for appropriate specializations of fused_add_rms_norm_kernel. - Only special member functions and functions that are necessary - in that kernel are implemented. Alignment to 16 bytes is required - to use 128-bit global memory ops. + Only functions that are necessary in that kernel are implemented. + Alignment to 16 bytes is required to use 128-bit global memory ops. */ template struct alignas(16) _f16Vec { diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 80c1b95d00671..bb5171f854d55 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -53,7 +53,7 @@ __inline__ __device__ T blockReduceSum(T val) { __syncthreads(); - val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] : 0.0f; + val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane] : (T)(0.0f); val = warpReduceSum(val); } else { // A single warpReduce is equal to blockReduce From a1bbdc48fbd96e90e205ffbb61ce11055626ca84 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Sat, 30 Mar 2024 03:30:20 +0000 Subject: [PATCH 6/6] Style tweak --- csrc/layernorm_kernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index f1223e780917d..ea30fa2747838 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -231,6 +231,7 @@ __global__ std::enable_if_t< } } + /* Generic fused_add_rms_norm_kernel The width field is not used here but necessary for other specializations. */