From d6142efafb0df1cce39635f25fec37b6a3bb4ec8 Mon Sep 17 00:00:00 2001
From: Andres Suarez
Date: Sun, 14 Apr 2024 11:28:32 -0700
Subject: [PATCH] Apply clang-format 18

Summary: Previously this code conformed to clang-format 12.

Reviewed By: igorsugak

Differential Revision: D56065247

fbshipit-source-id: f5a985dd8f8b84f2f9e1818b3719b43c5a1b05b3
---
 .../backend/cuda/groupnorm/layer_norm.cuh     | 113 +++++++++---------
 .../cuda/layernorm_sigmoid_mul/layer_norm.cuh |  29 ++---
 static/csrc/standalone.cpp                    |   6 +-
 static/include/cuda_device_functions.h        |  19 ++-
 static/include/kernels/kat_printf.h           |   9 +-
 static/include/rocm_device_functions.h        |  11 +-
 6 files changed, 90 insertions(+), 97 deletions(-)

diff --git a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
index 387be808a..24aa2e865 100644
--- a/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh
@@ -593,7 +593,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -622,7 +622,7 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)      \
-  else if (cols <= (max_col)*kWarpSize) {      \
+  else if (cols <= (max_col) * kWarpSize) {    \
     return DispatchLayerNormWarpImplPadding<   \
         LOAD,                                  \
         STORE,                                 \
@@ -663,7 +663,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -691,17 +691,18 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(16)
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
-#define DEFINE_ONE_ELIF(max_col, min_col)                                    \
-  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) {  \
-    return DispatchLayerNormWarpImplPadding<                                 \
-        LOAD,                                                                \
-        STORE,                                                               \
-        ComputeType,                                                         \
-        pack_size,                                                           \
-        max_col,                                                             \
-        min_col,                                                             \
-        kWarpSize,                                                           \
-        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);    \
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if (                                                                 \
+      (cols <= (max_col) * kWarpSize) && (cols > (min_col) * kWarpSize)) {  \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
   }
 DEFINE_ONE_ELIF(4, 2)
 DEFINE_ONE_ELIF(8, 4)
@@ -1518,47 +1519,47 @@ DispatchLayerNormGradWarpImplCols(
   if (cols <= 0) {
     return cudaErrorInvalidValue;
   }
-#define DEFINE_ONE_ELIF(thread_group_width)           \
-  else if (cols <= (thread_group_width)*pack_size) {  \
-    if (rows % 2 == 0) {                              \
-      return DispatchLayerNormGradWarpImplPadding<    \
-          LOAD_X,                                     \
-          LOAD_SCALED_DY,                             \
-          STORE,                                      \
-          ComputeType,                                \
-          pack_size,                                  \
-          pack_size,                                  \
-          0,                                          \
-          thread_group_width,                         \
-          2>(                                         \
-          stream,                                     \
-          load_x,                                     \
-          load_scaled_dy,                             \
-          store,                                      \
-          mean,                                       \
-          inv_variance,                               \
-          rows,                                       \
-          cols);                                      \
-    } else {                                          \
-      return DispatchLayerNormGradWarpImplPadding<    \
-          LOAD_X,                                     \
-          LOAD_SCALED_DY,                             \
-          STORE,                                      \
-          ComputeType,                                \
-          pack_size,                                  \
-          pack_size,                                  \
-          0,                                          \
-          thread_group_width,                         \
-          1>(                                         \
-          stream,                                     \
-          load_x,                                     \
-          load_scaled_dy,                             \
-          store,                                      \
-          mean,                                       \
-          inv_variance,                               \
-          rows,                                       \
-          cols);                                      \
-    }                                                 \
+#define DEFINE_ONE_ELIF(thread_group_width)             \
+  else if (cols <= (thread_group_width) * pack_size) {  \
+    if (rows % 2 == 0) {                                \
+      return DispatchLayerNormGradWarpImplPadding<      \
+          LOAD_X,                                       \
+          LOAD_SCALED_DY,                               \
+          STORE,                                        \
+          ComputeType,                                  \
+          pack_size,                                    \
+          pack_size,                                    \
+          0,                                            \
+          thread_group_width,                           \
+          2>(                                           \
+          stream,                                       \
+          load_x,                                       \
+          load_scaled_dy,                               \
+          store,                                        \
+          mean,                                         \
+          inv_variance,                                 \
+          rows,                                         \
+          cols);                                        \
+    } else {                                            \
+      return DispatchLayerNormGradWarpImplPadding<      \
+          LOAD_X,                                       \
+          LOAD_SCALED_DY,                               \
+          STORE,                                        \
+          ComputeType,                                  \
+          pack_size,                                    \
+          pack_size,                                    \
+          0,                                            \
+          thread_group_width,                           \
+          1>(                                           \
+          stream,                                       \
+          load_x,                                       \
+          load_scaled_dy,                               \
+          store,                                        \
+          mean,                                         \
+          inv_variance,                                 \
+          rows,                                         \
+          cols);                                        \
+    }                                                   \
   }
 DEFINE_ONE_ELIF(4)
 DEFINE_ONE_ELIF(8)
@@ -1566,7 +1567,7 @@ DispatchLayerNormGradWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)          \
-  else if (cols <= (max_col)*kWarpSize) {          \
+  else if (cols <= (max_col) * kWarpSize) {        \
     return DispatchLayerNormGradWarpImplPadding<   \
         LOAD_X,                                    \
         LOAD_SCALED_DY,                            \
diff --git a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
index e28fdd831..3ff5f9609 100644
--- a/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
+++ b/python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh
@@ -593,7 +593,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -622,7 +622,7 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
 #define DEFINE_ONE_ELIF(max_col, min_col)      \
-  else if (cols <= (max_col)*kWarpSize) {      \
+  else if (cols <= (max_col) * kWarpSize) {    \
     return DispatchLayerNormWarpImplPadding<   \
         LOAD,                                  \
         STORE,                                 \
@@ -663,7 +663,7 @@ DispatchLayerNormWarpImplCols(
     return cudaErrorInvalidValue;
   }
 #define DEFINE_ONE_ELIF(thread_group_width)             \
-  else if (cols <= (thread_group_width)*pack_size) {    \
+  else if (cols <= (thread_group_width) * pack_size) {  \
     if (rows % 2 == 0) {                                \
       return DispatchLayerNormWarpImplPadding<          \
           LOAD,                                         \
@@ -691,17 +691,18 @@ DispatchLayerNormWarpImplCols(
   DEFINE_ONE_ELIF(16)
   DEFINE_ONE_ELIF(32)
 #undef DEFINE_ONE_ELIF
-#define DEFINE_ONE_ELIF(max_col, min_col)                                    \
-  else if ((cols <= (max_col)*kWarpSize) && (cols > (min_col)*kWarpSize)) {  \
-    return DispatchLayerNormWarpImplPadding<                                 \
-        LOAD,                                                                \
-        STORE,                                                               \
-        ComputeType,                                                         \
-        pack_size,                                                           \
-        max_col,                                                             \
-        min_col,                                                             \
-        kWarpSize,                                                           \
-        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);    \
+#define DEFINE_ONE_ELIF(max_col, min_col)                                   \
+  else if (                                                                 \
+      (cols <= (max_col) * kWarpSize) && (cols > (min_col) * kWarpSize)) {  \
+    return DispatchLayerNormWarpImplPadding<                                \
+        LOAD,                                                               \
+        STORE,                                                              \
+        ComputeType,                                                        \
+        pack_size,                                                          \
+        max_col,                                                            \
+        min_col,                                                            \
+        kWarpSize,                                                          \
+        1>(stream, load, store, rows, cols, epsilon, mean, inv_variance);   \
   }
 DEFINE_ONE_ELIF(4, 2)
 DEFINE_ONE_ELIF(8, 4)
diff --git a/static/csrc/standalone.cpp b/static/csrc/standalone.cpp
index cb486304d..0ce97677b 100644
--- a/static/csrc/standalone.cpp
+++ b/static/csrc/standalone.cpp
@@ -433,11 +433,9 @@ struct AITStandaloneTestcase {
     inputs.push_back(AITData(gpu_data_owner.back().get(), shape, dtype));
   }
 
-  std::cout << "Finished loading testcase inputs."
-            << "\n";
+  std::cout << "Finished loading testcase inputs." << "\n";
   if (fh.peek() == std::ifstream::traits_type::eof()) {
-    std::cout << "No expected outputs in testcase."
-              << "\n";
+    std::cout << "No expected outputs in testcase." << "\n";
     return;
   }
   if (inputs.size() != num_inputs) {
diff --git a/static/include/cuda_device_functions.h b/static/include/cuda_device_functions.h
index 71bf5a29c..18acfc218 100644
--- a/static/include/cuda_device_functions.h
+++ b/static/include/cuda_device_functions.h
@@ -85,8 +85,7 @@ inline std::string GetUUIDToString(const char bytes[16]) {
 
 inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -121,9 +120,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Peak global memory bandwidth (GByte/s): "
       << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
 
-      << "\n Thread limits: "
-      << "\n Warp size in threads: " << prop.warpSize
-      << "\n Maximum size of each dimension of a grid: "
+      << "\n Thread limits: " << "\n Warp size in threads: "
+      << prop.warpSize << "\n Maximum size of each dimension of a grid: "
       << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
       << prop.maxGridSize[2]
       << "\n Maximum size of each dimension of a block: "
@@ -145,8 +143,7 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Max clock frequency of the multiProcessors in khz: "
       << prop.clockRate
 
-      << "\n Device features: "
-      << "\n Device has ECC support enabled: "
+      << "\n Device features: " << "\n Device has ECC support enabled: "
       << (prop.ECCEnabled ? "yes" : "no")
       << "\n Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer: "
       << (prop.canMapHostMemory ? "yes" : "no")
@@ -185,9 +182,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Device shares a unified address space with the host: "
      << (prop.unifiedAddressing ? "yes" : "no")
 
-      << "\n Texture limits: "
-      << "\n Maximum 1D surface size: " << prop.maxSurface1D
-      << "\n Maximum 1D layered surface dimensions: "
+      << "\n Texture limits: " << "\n Maximum 1D surface size: "
+      << prop.maxSurface1D << "\n Maximum 1D layered surface dimensions: "
       << prop.maxSurface1DLayered[0] << " " << prop.maxSurface1DLayered[1]
       << "\n Maximum 2D surface dimensions: " << prop.maxSurface2D[0]
       << " " << prop.maxSurface2D[1]
@@ -234,8 +230,7 @@ inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
 
 inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
diff --git a/static/include/kernels/kat_printf.h b/static/include/kernels/kat_printf.h
index f4f122f6e..6fe1326b3 100644
--- a/static/include/kernels/kat_printf.h
+++ b/static/include/kernels/kat_printf.h
@@ -280,8 +280,8 @@ struct components {
 };
 } // namespace double_
 __attribute__((device)) static inline constexpr int get_sign_bit(double x) {
-  return (
-      int)(double_::with_bit_access::wrap(x).U >> (double_::size_in_bits - 1));
+  return (int)(double_::with_bit_access::wrap(x).U >>
+               (double_::size_in_bits - 1));
 }
 __attribute__((device)) static inline int get_exp2(double x) {
   return double_::with_bit_access::wrap(x).exp2();
@@ -472,8 +472,9 @@ __attribute__((device)) static void print_integer(
   } else {
     do {
       const char digit = (char)(value % base);
-      buf[len++] =
-          (char)(digit < 10 ? '0' + digit : (flags & flags::uppercase ? 'A' : 'a') + digit - 10);
+      buf[len++] = (char)(digit < 10 ? '0' + digit
+                                     : (flags & flags::uppercase ? 'A' : 'a') +
+                              digit - 10);
       value /= base;
     } while (value && (len < detail_::printf::integer_buffer_size));
   }
diff --git a/static/include/rocm_device_functions.h b/static/include/rocm_device_functions.h
index 8fc7adf3c..92ecab14a 100644
--- a/static/include/rocm_device_functions.h
+++ b/static/include/rocm_device_functions.h
@@ -89,8 +89,7 @@ inline std::string PrintArchFeatureFlags(const hipDeviceArch_t& arch) {
 
 inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -113,8 +112,7 @@ inline std::string PrintInfoDeviceProperties(const DevicePropertyType& prop) {
 
 inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
   std::ostringstream oss;
-  oss << "Hardware accelerator device properties: "
-      << "\n Device: "
+  oss << "Hardware accelerator device properties: " << "\n Device: "
       << "\n ASCII string identifying device: " << prop.name
       << "\n Major compute capability: " << prop.major
       << "\n Minor compute capability: " << prop.minor
@@ -138,9 +136,8 @@ inline std::string PrintDebugDeviceProperties(const DevicePropertyType& prop) {
       << "\n Peak global memory bandwidth (GByte/s): "
       << (prop.memoryClockRate / 1e6) * (prop.memoryBusWidth / 8) * 2
 
-      << "\n Thread limits: "
-      << "\n Warp size in threads: " << prop.warpSize
-      << "\n Maximum size of each dimension of a grid: "
+      << "\n Thread limits: " << "\n Warp size in threads: "
+      << prop.warpSize << "\n Maximum size of each dimension of a grid: "
       << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " "
       << prop.maxGridSize[2]
       << "\n Maximum size of each dimension of a block: "