From c04a3f8a3aeb29ea9e8997b9ccadadbd9da346fd Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 06:51:00 +0900 Subject: [PATCH 01/11] Support half precision sigmoid activation --- include/cutlass/epilogue/thread/activation.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6f53375c2a..adc4abef88 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -108,6 +108,14 @@ struct Sigmoid { } }; +template <> +struct Sigmoid { + CUTLASS_DEVICE + half_t operator()(half_t const &scalar) const { + return half_t(1) / (half_t(1) + half_t(::hexp(-scalar.to_half()))); + } +}; + template struct Sigmoid > { CUTLASS_HOST_DEVICE From 1c9bead13adc3ab2df6a926734c0ebc7c8a08a24 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 15:14:04 +0900 Subject: [PATCH 02/11] introduce a vectorized variant using fast_tanh --- include/cutlass/epilogue/thread/activation.h | 37 +++++++++++++++----- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index adc4abef88..f56e7fc4be 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -108,14 +108,6 @@ struct Sigmoid { } }; -template <> -struct Sigmoid { - CUTLASS_DEVICE - half_t operator()(half_t const &scalar) const { - return half_t(1) / (half_t(1) + half_t(::hexp(-scalar.to_half()))); - } -}; - template struct Sigmoid > { CUTLASS_HOST_DEVICE @@ -132,6 +124,35 @@ struct Sigmoid > { } }; +template <> +struct Sigmoid { + CUTLASS_HOST_DEVICE + half_t operator()(half_t const& scalar) const { + half_t exp_res; + #if defined(__CUDA_ARCH__) + exp_res = half_t(::hexp(-scalar.to_half())); + #else + exp_res = half_t(std::exp(float(-scalar))); + #endif + return half_t(1) / (half_t(1) + exp_res); + } +}; + 
+#if defined(CUTLASS_USE_FAST_MATH) +template +struct Sigmoid> { + CUTLASS_HOST_DEVICE + Array operator()(Array const& z) const { + using T = half_t; + multiplies> mul; + plus> add; + fast_tanh_op> tanh; + return mul(add(tanh(mul(z, cutlass::constants::half())), cutlass::constants::one()), + cutlass::constants::half()); + } +}; +#endif + // // GELU function definitions implemented as described by // Hendrycks, D., and Gimpel, K. in From 30e58d940f93ceb67cb26bee9aa8ecc662908f7e Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 20:51:10 +0900 Subject: [PATCH 03/11] refactored sigmoid using the new interface --- .../thread/linear_combination_generic.h | 207 ++++++++++++++++++ .../thread/linear_combination_sigmoid.h | 156 +------------ 2 files changed, 214 insertions(+), 149 deletions(-) create mode 100644 include/cutlass/epilogue/thread/linear_combination_generic.h diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h new file mode 100644 index 0000000000..75c211e760 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -0,0 +1,207 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
+/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + template class ActivationFunctor, + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +class LinearCombinationGeneric { +public: + + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha, + ElementCompute beta + ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr, + ElementCompute const *beta_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + + } + }; + +private: + + // + // Data members + // + + ElementCompute alpha_; + ElementCompute beta_; + +public: + + /// Constructs the function object, possibly loading from 
pointers in host memory + CUTLASS_HOST_DEVICE + LinearCombinationGeneric(Params const ¶ms) { + + alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return beta_ != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + if (k_partition) { + beta_ = ElementCompute(1); + } + } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator, + FragmentOutput const &source) const { + + // Convert source to interal compute numeric type + NumericArrayConverter source_converter; + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + ActivationFunctor activation; + + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X + + intermediate = activation(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + 
ComputeFragment intermediate; + + multiplies mul_add_accumulator; + ActivationFunctor activation; + + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + intermediate = activation(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 4716effaa6..0a8ae76e49 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -29,12 +29,8 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/array.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,150 +53,12 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -class LinearCombinationSigmoid { -public: - - using ElementOutput = ElementOutput_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - - static int const kCount = Count; - - using FragmentOutput = Array; - using FragmentAccumulator = Array; - using ComputeFragment = Array; - - static FloatRoundStyle const kRound = Round; - - /// Host-constructable parameters structure - struct Params { - - ElementCompute alpha; ///< scales accumulators - ElementCompute beta; ///< scales source tensor - 
ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory - ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params(): - alpha(ElementCompute(1)), - beta(ElementCompute(0)), - alpha_ptr(nullptr), - beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute alpha, - ElementCompute beta - ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { - - } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { - - } - }; - -private: - - // - // Data members - // - - ElementCompute alpha_; - ElementCompute beta_; - -public: - - /// Constructs the function object, possibly loading from pointers in host memory - CUTLASS_HOST_DEVICE - LinearCombinationSigmoid(Params const ¶ms) { - - alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); - beta_ = (params.beta_ptr ? 
*params.beta_ptr : params.beta); - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const { - return beta_ != ElementCompute(0); - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) { - if (k_partition) { - beta_ = ElementCompute(1); - } - } - - /// Computes linear scaling: D = alpha * accumulator + beta * source - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator, - FragmentOutput const &source) const { - - // Convert source to interal compute numeric type - NumericArrayConverter source_converter; - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_source = source_converter(source); - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_source; - multiply_add mul_add_accumulator; - Sigmoid sigmoid; - - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform - intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = sigmoid(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } - - /// Computes linear scaling: D = alpha * accumulator - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator) const { - - // Convert source to interal compute numeric type - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_accumulator; - Sigmoid sigmoid; - - intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum - - intermediate = 
sigmoid(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } -}; - +using LinearCombinationSigmoid = LinearCombinationGeneric; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From 6528a011c2d88e33b1b8976d884055a4156782d8 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 21:03:43 +0900 Subject: [PATCH 04/11] refactored gelu --- .../epilogue/thread/linear_combination_gelu.h | 154 +----------------- .../thread/linear_combination_generic.h | 4 +- .../thread/linear_combination_sigmoid.h | 8 +- 3 files changed, 8 insertions(+), 158 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 9eec618179..b3370974be 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -29,12 +29,8 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/array.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,153 +53,9 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -class LinearCombinationGELU { -public: - - using ElementOutput = ElementOutput_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - - static bool const kIsHeavy = true; - - static int const kCount = Count; - - using FragmentOutput = Array; - using FragmentAccumulator = Array; - using ComputeFragment = Array; - - static 
FloatRoundStyle const kRound = Round; - - /// Host-constructable parameters structure - struct Params { - - ElementCompute alpha; ///< scales accumulators - ElementCompute beta; ///< scales source tensor - ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory - ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params(): - alpha(ElementCompute(1)), - beta(ElementCompute(0)), - alpha_ptr(nullptr), - beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute alpha, - ElementCompute beta - ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { - - } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { - - } - }; - -private: - - // - // Data members - // - - ElementCompute alpha_; - ElementCompute beta_; - -public: - - /// Constructs the function object, possibly loading from pointers in host memory - CUTLASS_HOST_DEVICE - LinearCombinationGELU(Params const ¶ms) { - - alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); - beta_ = (params.beta_ptr ? 
*params.beta_ptr : params.beta); - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const { - return beta_ != ElementCompute(0); - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) { - if (k_partition) { - beta_ = ElementCompute(1); - } - - CUTLASS_UNUSED(k_partition_count); - } - - /// Computes: D = gelu( alpha * accumulator + beta * source ) - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator, - FragmentOutput const &source) const { - - // Convert source to interal compute numeric type - NumericArrayConverter source_converter; - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_source = source_converter(source); - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_source; - multiply_add mul_add_accumulator; - GELU gelu; - - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform - intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = gelu(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } - - /// Computes: D = gelu( alpha * accumulator ) - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator) const { - - // Convert source to interal compute numeric type - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_accumulator; - GELU gelu; - - intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum - - intermediate = 
gelu(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; +using LinearCombinationGELU = LinearCombinationGeneric; - return destination_converter(intermediate); - } -}; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h index 75c211e760..7ba87f3081 100644 --- a/include/cutlass/epilogue/thread/linear_combination_generic.h +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -54,7 +54,8 @@ template < ///< but we use 64 or 32 sometimes when there are not enough data to store typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination - FloatRoundStyle Round = FloatRoundStyle::round_to_nearest + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + bool IsHeavy = false > class LinearCombinationGeneric { public: @@ -63,6 +64,7 @@ class LinearCombinationGeneric { using ElementAccumulator = ElementAccumulator_; using ElementCompute = ElementCompute_; + static bool const kIsHeavy = IsHeavy; static int const kCount = Count; using FragmentOutput = Array; diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 0a8ae76e49..2afedb1c63 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -53,12 +53,8 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -using LinearCombinationSigmoid = LinearCombinationGeneric; +using LinearCombinationSigmoid = LinearCombinationGeneric; 
///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From ff6eafe85d0ecd5bcbcb2186e8410bf9be559549 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 21:36:50 +0900 Subject: [PATCH 05/11] add silu activation --- include/cutlass/epilogue/thread/activation.h | 19 ++++++ .../epilogue/thread/linear_combination_silu.h | 62 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 include/cutlass/epilogue/thread/linear_combination_silu.h diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6f53375c2a..b109cd0fe6 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -124,6 +124,25 @@ struct Sigmoid > { } }; +// SiLu (swish) operator +template +struct SiLu { + CUTLASS_HOST_DEVICE + T operator()(T const &scalar) const { + return scalar * Sigmoid(scalar); + } +}; + +template +struct SiLu> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Sigmoid> sigmoid_op; + multiplies> mul; + return mul(rhs, sigmoid_op(rhs)); + } +}; + // // GELU function definitions implemented as described by // Hendrycks, D., and Gimpel, K. in diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h new file mode 100644 index 0000000000..2573db9bc1 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -0,0 +1,62 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
+/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationSilu = LinearCombinationGeneric; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass From 2f27d8ce6eb94a30628628968a75683bf175b882 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 14 Dec 2021 10:06:24 +0900 Subject: [PATCH 06/11] add hardswish --- include/cutlass/epilogue/thread/activation.h | 26 ++++++++ .../thread/linear_combination_hardswish.h | 62 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 include/cutlass/epilogue/thread/linear_combination_hardswish.h diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 4357f3dd8d..b986a8b342 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -172,6 +172,32 @@ struct SiLu> { } }; +template +struct HardSwish { + CUTLASS_HOST_DEVICE + T operator()(T const &x) const { + minimum mn; + maximum mx; + T relu6 = mn(mx(x + T(3), T(0)), T(6)); + return x * (relu6 / T(6)); + } +}; + +template +struct HardSwish > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Array y; + HardSwish hardswish_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < int(rhs.size()); ++i) { + y[i] = hardswish_op(rhs[i]); + } + + return y; + } +}; // // GELU function definitions implemented as 
described by diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h new file mode 100644 index 0000000000..4239ee3a05 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -0,0 +1,62 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationHardSwish = LinearCombinationGeneric; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass From d7b5297f1c87edbbf41ae60346f6d432c940a4b2 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 14 Dec 2021 14:28:27 +0900 Subject: [PATCH 07/11] remove sigmoid for now --- include/cutlass/epilogue/thread/activation.h | 29 -------------------- 1 file changed, 29 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index b986a8b342..c6b2be2bed 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ 
b/include/cutlass/epilogue/thread/activation.h @@ -124,35 +124,6 @@ struct Sigmoid > { } }; -template <> -struct Sigmoid { - CUTLASS_HOST_DEVICE - half_t operator()(half_t const& scalar) const { - half_t exp_res; - #if defined(__CUDA_ARCH__) - exp_res = half_t(::hexp(-scalar.to_half())); - #else - exp_res = half_t(std::exp(float(-scalar))); - #endif - return half_t(1) / (half_t(1) + exp_res); - } -}; - -#if defined(CUTLASS_USE_FAST_MATH) -template -struct Sigmoid> { - CUTLASS_HOST_DEVICE - Array operator()(Array const& z) const { - using T = half_t; - multiplies> mul; - plus> add; - fast_tanh_op> tanh; - return mul(add(tanh(mul(z, cutlass::constants::half())), cutlass::constants::one()), - cutlass::constants::half()); - } -}; -#endif - // SiLu (swish) operator template struct SiLu { From 8ac1036c94c0251aae4ac9f1a9323903cf527ae4 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 15 Dec 2021 03:44:23 +0900 Subject: [PATCH 08/11] add description to silu and hardswish, and other doc update --- include/cutlass/epilogue/thread/activation.h | 11 ++++++++++- .../cutlass/epilogue/thread/linear_combination_gelu.h | 4 ++-- .../epilogue/thread/linear_combination_generic.h | 4 ++-- .../epilogue/thread/linear_combination_hardswish.h | 6 +++--- .../epilogue/thread/linear_combination_sigmoid.h | 6 +++--- .../cutlass/epilogue/thread/linear_combination_silu.h | 6 +++--- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index c6b2be2bed..6603e765c7 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -124,7 +124,11 @@ struct Sigmoid > { } }; -// SiLu (swish) operator +// SiLu (swish) operator introduced by Elfwing et al. 
in the following paper +// "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning" (2017) +// https://arxiv.org/pdf/1702.03118.pdf +// It is used in EfficientNet and YOLOv5, for example. +// Reference: https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html template struct SiLu { CUTLASS_HOST_DEVICE @@ -143,6 +147,11 @@ struct SiLu> { } }; +// Hardswish operator introduced by Howard et al. in the following paper +// "Searching for MobileNetV3" (2019) +// https://arxiv.org/pdf/1905.02244.pdf +// It is used in models based on MobilenetNetV3. +// Reference: https://pytorch.org/docs/stable/generated/torch.nn.Hardswish.html template struct HardSwish { CUTLASS_HOST_DEVICE diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index b3370974be..9a5fd6a633 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the GELU activation to an array of elements. 
/// -/// D = alpha * accumulator + beta * source + uniform +/// D = gelu(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h index 7ba87f3081..17f961e83b 100644 --- a/include/cutlass/epilogue/thread/linear_combination_generic.h +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -42,9 +42,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by an activation function to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = activation(alpha * accumulator + beta * source + uniform) /// template < template class ActivationFunctor, diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h index 4239ee3a05..a5dcabbff9 100644 --- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with HardSwish operations used by epilogues. */ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the HardSwish activation to an array of elements. 
/// -/// D = alpha * accumulator + beta * source + uniform +/// D = hardswish(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 2afedb1c63..622a961ca5 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with Sigmoid operations used by epilogues. */ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the Sigmoid activation, to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = sigmoid(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index 2573db9bc1..a933af1228 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with SiLU operations used by epilogues. 
*/ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the SiLU activation to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = silu(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors From 80e8fe70095041c5bbf880935d5b220368eb5441 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 10:36:14 +0900 Subject: [PATCH 09/11] Do not ignore Round --- include/cutlass/epilogue/thread/linear_combination_gelu.h | 2 +- include/cutlass/epilogue/thread/linear_combination_hardswish.h | 2 +- include/cutlass/epilogue/thread/linear_combination_sigmoid.h | 2 +- include/cutlass/epilogue/thread/linear_combination_silu.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 9a5fd6a633..2bf05b7b20 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationGELU = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h index a5dcabbff9..e6c37d506a 100644 --- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest >
using LinearCombinationHardSwish = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 622a961ca5..ea77a9537d 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSigmoid = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index a933af1228..92ad6a6e34 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSilu = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From 7b9dde37af05b00e956d881a4c69afe751cd0219 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 11:48:06 +0900 Subject: [PATCH 10/11] use constant N --- include/cutlass/epilogue/thread/activation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6603e765c7..7bc5042cc5 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -171,7 +171,7 @@ struct HardSwish > { HardSwish hardswish_op; CUTLASS_PRAGMA_UNROLL - 
for (int i = 0; i < int(rhs.size()); ++i) { + for (int i = 0; i < N; ++i) { y[i] = hardswish_op(rhs[i]); } From 0e0fe491d4758e104dab6056f784ce18f430e5cc Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 14:40:28 +0900 Subject: [PATCH 11/11] Set isHeavy = true in sigmoid and silu epilogue --- include/cutlass/epilogue/thread/linear_combination_sigmoid.h | 2 +- include/cutlass/epilogue/thread/linear_combination_silu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index ea77a9537d..e5ef55c80d 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSigmoid = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index 92ad6a6e34..e9a3e2c935 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSilu = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread