From d6142efafb0df1cce39635f25fec37b6a3bb4ec8 Mon Sep 17 00:00:00 2001
From: Andres Suarez
Date: Sun, 14 Apr 2024 11:28:32 -0700
Subject: [PATCH] Apply clang-format 18

Summary: Previously this code conformed to clang-format 12.

Reviewed By: igorsugak

Differential Revision: D56065247

fbshipit-source-id: f5a985dd8f8b84f2f9e1818b3719b43c5a1b05b3
---
 .../backend/cuda/groupnorm/layer_norm.cuh     | 113 +++++++++---------
 .../cuda/layernorm_sigmoid_mul/layer_norm.cuh |  29 ++---
 static/csrc/standalone.cpp                    |   6 +-
 static/include/cuda_device_functions.h        |  19 ++-
 static/include/kernels/kat_printf.h           |   9 +-
 static/include/rocm_device_functions.h        |  11 +-
 6 files changed, 90 insertions(+), 97 deletions(-)

diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index 387be808a..24aa2e865 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -593,7 +593,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -622,7 +622,7 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)      \
-  else if (cols <= (max_col)*kWarpSize) {      \
+  else if (cols <= (max_col) * kWarpSize) {    \
     return DispatchLayerNormWarpImplPadding<   \
         LOAD,                                  \
         STORE,                                 \
@@ -663,7 +663,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -691,17 +691,18 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(16)
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
-#define DEFINE_ONE_ELIF(max_col, min_col)                                    \
-  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) {  \
-    return DispatchLayerNormWarpImplPadding<                                 \
-        LOAD,                                                                \
-        STORE,                                                               \
-        ComputeType,                                                         \
-        pack_size,                                                           \
-        max_col,                                                             \
-        min_col,                                                             \
-        kWarpSize,                                                           \
-        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);    \
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if (                                                                 \
+      (cols <= (max_col) * kWarpSize) && (cols > (min_col) * kWarpSize)) {  \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
   }
 DEFINE_ONE_ELIF(4, 2)
 DEFINE_ONE_ELIF(8, 4)
@@ -1518,47 +1519,47 @@ DispatchLayerNormGradWarpImplCols(
   if (cols <= 0) {
     return cudaErrorInvalidValue;
   }
-#define DEFINE_ONE_ELIF(thread_group_width)           \
-  else if (cols <= (thread_group_width)*pack_size) {  \
-    if (rows % 2 == 0) {                              \
-      return DispatchLayerNormGradWarpImplPadding<    \
-          LOAD_X,                                     \
-          LOAD_SCALED_DY,                             \
-          STORE,                                      \
-          ComputeType,                                \
-          pack_size,                                  \
-          pack_size,                                  \
-          0,                                          \
-          thread_group_width,                         \
-          2>(                                         \
-          stream,                                     \
-          load_x,                                     \
-          load_scaled_dy,                             \
-          store,                                      \
-          mean,                                       \
-          inv_variance,                               \
-          rows,                                       \
-          cols);                                      \
-    } else {                                          \
-      return DispatchLayerNormGradWarpImplPadding<    \
-          LOAD_X,                                     \
-          LOAD_SCALED_DY,                             \
-          STORE,                                      \
-          ComputeType,                                \
-          pack_size,                                  \
-          pack_size,                                  \
-          0,                                          \
-          thread_group_width,                         \
-          1>(                                         \
-          stream,                                     \
-          load_x,                                     \
-          load_scaled_dy,                             \
-          store,                                      \
-          mean,                                       \
-          inv_variance,                               \
-          rows,                                       \
-          cols);                                      \
-    }                                                 \
+#define DEFINE_ONE_ELIF(thread_group_width)             \
+  else if (cols <= (thread_group_width) * pack_size) {  \
+    if (rows % 2 == 0) {                                \
+      return DispatchLayerNormGradWarpImplPadding<      \
+          LOAD_X,                                       \
+          LOAD_SCALED_DY,                               \
+          STORE,                                        \
+          ComputeType,                                  \
+          pack_size,                                    \
+          pack_size,                                    \
+          0,                                            \
+          thread_group_width,                           \
+          2>(                                           \
+          stream,                                       \
+          load_x,                                       \
+          load_scaled_dy,                               \
+          store,                                        \
+          mean,                                         \
+          inv_variance,                                 \
+          rows,                                         \
+          cols);                                        \
+    } else {                                            \
+      return DispatchLayerNormGradWarpImplPadding<      \
+          LOAD_X,                                       \
+          LOAD_SCALED_DY,                               \
+          STORE,                                        \
+          ComputeType,                                  \
+          pack_size,                                    \
+          pack_size,                                    \
+          0,                                            \
+          thread_group_width,                           \
+          1>(                                           \
+          stream,                                       \
+          load_x,                                       \
+          load_scaled_dy,                               \
+          store,                                        \
+          mean,                                         \
+          inv_variance,                                 \
+          rows,                                         \
+          cols);                                        \
+    }                                                   \
   }
 DEFINE_ONE_ELIF(4)
 DEFINE_ONE_ELIF(8)
@@ -1566,7 +1567,7 @@ DispatchLayerNormGradWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)          \
-  else if (cols <= (max_col)*kWarpSize) {          \
+  else if (cols <= (max_col) * kWarpSize) {        \
     return DispatchLayerNormGradWarpImplPadding<   \
         LOAD_X,                                    \
         LOAD_SCALED_DY,                            \
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
index e28fdd831..3ff5f9609 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
@@ -593,7 +593,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -622,7 +622,7 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)      \
-  else if (cols <= (max_col)*kWarpSize) {      \
+  else if (cols <= (max_col) * kWarpSize) {    \
     return DispatchLayerNormWarpImplPadding<   \
         LOAD,                                  \
         STORE,                                 \
@@ -663,7 +663,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -691,17 +691,18 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(16)
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
-#define DEFINE_ONE_ELIF(max_col, min_col)                                    \
-  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) {  \
-    return DispatchLayerNormWarpImplPadding<                                 \
-        LOAD,                                                                \
-        STORE,                                                               \
-        ComputeType,                                                         \
-        pack_size,                                                           \
-        max_col,                                                             \
-        min_col,                                                             \
-        kWarpSize,                                                           \
-        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);    \
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if (                                                                 \
+      (cols <= (max_col) * kWarpSize) && (cols > (min_col) * kWarpSize)) {  \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
   }
 DEFINE_ONE_ELIF(4, 2)
 DEFINE_ONE_ELIF(8, 4)
diff --git a/static/csrc/standalone.cpp b/static/csrc/standalone.cpp
index cb486304d..0ce97677b 100644
--- a/static/csrc/standalone.cpp
+++ b/static/csrc/standalone.cpp
@@ -433,11 +433,9 @@ struct AITStandaloneTestcase {
     inputs.push_back(AITData(gpu_data_owner.back().get(), shape, dtype));
   }
 
-  std::cout << "Finished loading testcase inputs."
-            << "\n";
+  std::cout << "Finished loading testcase inputs." << "\n";
   if (fh.peek() == std::ifstream::traits_type::eof()) {
-    std::cout << "No expected outputs in testcase."
-              << "\n";
+    std::cout << "No expected outputs in testcase." << "\n";
     return;
   }
   if (inputs.size() != num_inputs) {
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 71bf5a29c..18acfc218 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -85,8 +85,7 @@ inline std::string GetUUIDToString(const char bytes[16]) {
 
 inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -121,9 +120,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Peak global memory bandwidth (GByte/s): "
       << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
 
-      << "\n Thread limits: "
-      << "\n Warp size in threads: " << prop.warpSize
-      << "\n Maximum size of each dimension of a grid: "
+      << "\n Thread limits: " << "\n Warp size in threads: "
+      << prop.warpSize << "\n Maximum size of each dimension of a grid: "
       << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
       << prop.maxGridSize[2]
       << "\n Maximum size of each dimension of a block: "
@@ -145,8 +143,7 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Max clock frequency of the multiProcessors in khz: "
       << prop.clockRate
 
-      << "\n Device features: "
-      << "\n Device has ECC support enabled: "
+      << "\n Device features: " << "\n Device has ECC support enabled: "
       << (prop.ECCEnabled ? "yes" : "no")
       << "\n Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
       << (prop.canMapHostMemory ? "yes" : "no")
@@ -185,9 +182,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Device shares a unified address space with the host: "
      << (prop.unifiedAddressing ? "yes" : "no")
 
-      << "\n Texture limits: "
-      << "\n Maximum 1D surface size: " << prop.maxSurface1D
-      << "\n Maximum 1D layered surface dimensions: "
+      << "\n Texture limits: " << "\n Maximum 1D surface size: "
+      << prop.maxSurface1D << "\n Maximum 1D layered surface dimensions: "
       << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
       << "\n Maximum 2D surface dimensions: " << prop.maxSurface2D[0]
       << " " << prop.maxSurface2D[1]
@@ -234,8 +230,7 @@ inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
 
 inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
diff --git a/static/include/kernels/kat_printf.h b/static/include/kernels/kat_printf.h
index f4f122f6e..6fe1326b3 100644
--- a/static/include/kernels/kat_printf.h
+++ b/static/include/kernels/kat_printf.h
@@ -280,8 +280,8 @@ struct components {
 };
 } // namespace double_
 __attribute__((device)) static inline constexpr int get_sign_bit(double x) {
-  return (
-      int)(double_::with_bit_access::wrap(x).U >> (double_::size_in_bits - 1));
+  return (int)(double_::with_bit_access::wrap(x).U >>
+               (double_::size_in_bits - 1));
 }
 __attribute__((device)) static inline int get_exp2(double x) {
   return double_::with_bit_access::wrap(x).exp2();
@@ -472,8 +472,9 @@ __attribute__((device)) static void print_integer(
   } else {
     do {
       const char digit = (char)(value % base);
-      buf[len++] =
-          (char)(digit < 10 ? '0' + digit : (flags & flags::uppercase ? 'A' : 'a') + digit - 10);
+      buf[len++] = (char)(digit < 10 ? '0' + digit
+                                     : (flags & flags::uppercase ? 'A' : 'a') +
+                              digit - 10);
       value /= base;
     } while (value && (len < detail_::printf::integer_buffer_size));
   }
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 8fc7adf3c..92ecab14a 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -89,8 +89,7 @@ inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) {
 
 inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -113,8 +112,7 @@ inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
 
 inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -138,9 +136,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Peak global memory bandwidth (GByte/s): "
       << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
 
-      << "\n Thread limits: "
-      << "\n Warp size in threads: " << prop.warpSize
-      << "\n Maximum size of each dimension of a grid: "
+      << "\n Thread limits: " << "\n Warp size in threads: "
+      << prop.warpSize << "\n Maximum size of each dimension of a grid: "
       << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
       << prop.maxGridSize[2]
       << "\n Maximum size of each dimension of a block: "