From c04a3f8a3aeb29ea9e8997b9ccadadbd9da346fd Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 06:51:00 +0900 Subject: [PATCH 01/11] Support half precision sigmoid activation --- include/cutlass/epilogue/thread/activation.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6f53375c2a..adc4abef88 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -108,6 +108,14 @@ struct Sigmoid { } }; +template <> +struct Sigmoid { + CUTLASS_DEVICE + half_t operator()(half_t const &scalar) const { + return half_t(1) / (half_t(1) + half_t(::hexp(-scalar.to_half()))); + } +}; + template struct Sigmoid > { CUTLASS_HOST_DEVICE From 1c9bead13adc3ab2df6a926734c0ebc7c8a08a24 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 15:14:04 +0900 Subject: [PATCH 02/11] introduce a vectorized variant using fast_tanh --- include/cutlass/epilogue/thread/activation.h | 37 +++++++++++++++----- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index adc4abef88..f56e7fc4be 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -108,14 +108,6 @@ struct Sigmoid { } }; -template <> -struct Sigmoid { - CUTLASS_DEVICE - half_t operator()(half_t const &scalar) const { - return half_t(1) / (half_t(1) + half_t(::hexp(-scalar.to_half()))); - } -}; - template struct Sigmoid > { CUTLASS_HOST_DEVICE @@ -132,6 +124,35 @@ struct Sigmoid > { } }; +template <> +struct Sigmoid { + CUTLASS_HOST_DEVICE + half_t operator()(half_t const& scalar) const { + half_t exp_res; + #if defined(__CUDA_ARCH__) + exp_res = half_t(::hexp(-scalar.to_half())); + #else + exp_res = half_t(std::exp(float(-scalar))); + #endif + return half_t(1) / (half_t(1) + exp_res); + } +}; + 
+#if defined(CUTLASS_USE_FAST_MATH) +template +struct Sigmoid> { + CUTLASS_HOST_DEVICE + Array operator()(Array const& z) const { + using T = half_t; + multiplies> mul; + plus> add; + fast_tanh_op> tanh; + return mul(add(tanh(mul(z, cutlass::constants::half())), cutlass::constants::one()), + cutlass::constants::half()); + } +}; +#endif + // // GELU function definitions implemented as described by // Hendrycks, D., and Gimpel, K. in From 30e58d940f93ceb67cb26bee9aa8ecc662908f7e Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 20:51:10 +0900 Subject: [PATCH 03/11] refactored sigmoid using the new interface --- .../thread/linear_combination_generic.h | 207 ++++++++++++++++++ .../thread/linear_combination_sigmoid.h | 156 +------------ 2 files changed, 214 insertions(+), 149 deletions(-) create mode 100644 include/cutlass/epilogue/thread/linear_combination_generic.h diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h new file mode 100644 index 0000000000..75c211e760 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -0,0 +1,207 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
+/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + template class ActivationFunctor, + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +class LinearCombinationGeneric { +public: + + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha, + ElementCompute beta + ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr, + ElementCompute const *beta_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + + } + }; + +private: + + // + // Data members + // + + ElementCompute alpha_; + ElementCompute beta_; + +public: + + /// Constructs the function object, possibly loading from 
pointers in host memory + CUTLASS_HOST_DEVICE + LinearCombinationGeneric(Params const ¶ms) { + + alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return beta_ != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + if (k_partition) { + beta_ = ElementCompute(1); + } + } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator, + FragmentOutput const &source) const { + + // Convert source to interal compute numeric type + NumericArrayConverter source_converter; + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + ActivationFunctor activation; + + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X + + intermediate = activation(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + 
ComputeFragment intermediate; + + multiplies mul_add_accumulator; + ActivationFunctor activation; + + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + intermediate = activation(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 4716effaa6..0a8ae76e49 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -29,12 +29,8 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/array.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,150 +53,12 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -class LinearCombinationSigmoid { -public: - - using ElementOutput = ElementOutput_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - - static int const kCount = Count; - - using FragmentOutput = Array; - using FragmentAccumulator = Array; - using ComputeFragment = Array; - - static FloatRoundStyle const kRound = Round; - - /// Host-constructable parameters structure - struct Params { - - ElementCompute alpha; ///< scales accumulators - ElementCompute beta; ///< scales source tensor - 
ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory - ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params(): - alpha(ElementCompute(1)), - beta(ElementCompute(0)), - alpha_ptr(nullptr), - beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute alpha, - ElementCompute beta - ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { - - } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { - - } - }; - -private: - - // - // Data members - // - - ElementCompute alpha_; - ElementCompute beta_; - -public: - - /// Constructs the function object, possibly loading from pointers in host memory - CUTLASS_HOST_DEVICE - LinearCombinationSigmoid(Params const ¶ms) { - - alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); - beta_ = (params.beta_ptr ? 
*params.beta_ptr : params.beta); - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const { - return beta_ != ElementCompute(0); - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) { - if (k_partition) { - beta_ = ElementCompute(1); - } - } - - /// Computes linear scaling: D = alpha * accumulator + beta * source - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator, - FragmentOutput const &source) const { - - // Convert source to interal compute numeric type - NumericArrayConverter source_converter; - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_source = source_converter(source); - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_source; - multiply_add mul_add_accumulator; - Sigmoid sigmoid; - - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform - intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = sigmoid(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } - - /// Computes linear scaling: D = alpha * accumulator - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator) const { - - // Convert source to interal compute numeric type - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_accumulator; - Sigmoid sigmoid; - - intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum - - intermediate = 
sigmoid(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } -}; - +using LinearCombinationSigmoid = LinearCombinationGeneric; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From 6528a011c2d88e33b1b8976d884055a4156782d8 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 21:03:43 +0900 Subject: [PATCH 04/11] refactored gelu --- .../epilogue/thread/linear_combination_gelu.h | 154 +----------------- .../thread/linear_combination_generic.h | 4 +- .../thread/linear_combination_sigmoid.h | 8 +- 3 files changed, 8 insertions(+), 158 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 9eec618179..b3370974be 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -29,12 +29,8 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" -#include "cutlass/array.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,153 +53,9 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -class LinearCombinationGELU { -public: - - using ElementOutput = ElementOutput_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - - static bool const kIsHeavy = true; - - static int const kCount = Count; - - using FragmentOutput = Array; - using FragmentAccumulator = Array; - using ComputeFragment = Array; - - static 
FloatRoundStyle const kRound = Round; - - /// Host-constructable parameters structure - struct Params { - - ElementCompute alpha; ///< scales accumulators - ElementCompute beta; ///< scales source tensor - ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory - ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params(): - alpha(ElementCompute(1)), - beta(ElementCompute(0)), - alpha_ptr(nullptr), - beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute alpha, - ElementCompute beta - ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { - - } - - CUTLASS_HOST_DEVICE - Params( - ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { - - } - }; - -private: - - // - // Data members - // - - ElementCompute alpha_; - ElementCompute beta_; - -public: - - /// Constructs the function object, possibly loading from pointers in host memory - CUTLASS_HOST_DEVICE - LinearCombinationGELU(Params const ¶ms) { - - alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); - beta_ = (params.beta_ptr ? 
*params.beta_ptr : params.beta); - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const { - return beta_ != ElementCompute(0); - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) { - if (k_partition) { - beta_ = ElementCompute(1); - } - - CUTLASS_UNUSED(k_partition_count); - } - - /// Computes: D = gelu( alpha * accumulator + beta * source ) - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator, - FragmentOutput const &source) const { - - // Convert source to interal compute numeric type - NumericArrayConverter source_converter; - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_source = source_converter(source); - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_source; - multiply_add mul_add_accumulator; - GELU gelu; - - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform - intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = gelu(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(intermediate); - } - - /// Computes: D = gelu( alpha * accumulator ) - CUTLASS_HOST_DEVICE - FragmentOutput operator()( - FragmentAccumulator const &accumulator) const { - - // Convert source to interal compute numeric type - NumericArrayConverter accumulator_converter; - - ComputeFragment converted_accumulator = accumulator_converter(accumulator); - - // Perform binary operations - - ComputeFragment intermediate; - - multiplies mul_add_accumulator; - GELU gelu; - - intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum - - intermediate = 
gelu(intermediate); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; +using LinearCombinationGELU = LinearCombinationGeneric; - return destination_converter(intermediate); - } -}; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h index 75c211e760..7ba87f3081 100644 --- a/include/cutlass/epilogue/thread/linear_combination_generic.h +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -54,7 +54,8 @@ template < ///< but we use 64 or 32 sometimes when there are not enough data to store typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination - FloatRoundStyle Round = FloatRoundStyle::round_to_nearest + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + bool IsHeavy = false > class LinearCombinationGeneric { public: @@ -63,6 +64,7 @@ class LinearCombinationGeneric { using ElementAccumulator = ElementAccumulator_; using ElementCompute = ElementCompute_; + static bool const kIsHeavy = IsHeavy; static int const kCount = Count; using FragmentOutput = Array; diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 0a8ae76e49..2afedb1c63 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -53,12 +53,8 @@ template < typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > -using LinearCombinationSigmoid = LinearCombinationGeneric; +using LinearCombinationSigmoid = LinearCombinationGeneric; 
///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From ff6eafe85d0ecd5bcbcb2186e8410bf9be559549 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 13 Dec 2021 21:36:50 +0900 Subject: [PATCH 05/11] add silu activation --- include/cutlass/epilogue/thread/activation.h | 19 ++++++ .../epilogue/thread/linear_combination_silu.h | 62 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 include/cutlass/epilogue/thread/linear_combination_silu.h diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6f53375c2a..b109cd0fe6 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -124,6 +124,25 @@ struct Sigmoid > { } }; +// SiLu (swish) operator +template +struct SiLu { + CUTLASS_HOST_DEVICE + T operator()(T const &scalar) const { + return scalar * Sigmoid(scalar); + } +}; + +template +struct SiLu> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Sigmoid> sigmoid_op; + multiplies> mul; + return mul(rhs, sigmoid_op(rhs)); + } +}; + // // GELU function definitions implemented as described by // Hendrycks, D., and Gimpel, K. in diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h new file mode 100644 index 0000000000..2573db9bc1 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -0,0 +1,62 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
+/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationSilu = LinearCombinationGeneric; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass From 2f27d8ce6eb94a30628628968a75683bf175b882 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 14 Dec 2021 10:06:24 +0900 Subject: [PATCH 06/11] add hardswish --- include/cutlass/epilogue/thread/activation.h | 26 ++++++++ .../thread/linear_combination_hardswish.h | 62 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 include/cutlass/epilogue/thread/linear_combination_hardswish.h diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 4357f3dd8d..b986a8b342 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -172,6 +172,32 @@ struct SiLu> { } }; +template +struct HardSwish { + CUTLASS_HOST_DEVICE + T operator()(T const &x) const { + minimum mn; + maximum mx; + T relu6 = mn(mx(x + T(3), T(0)), T(6)); + return x * (relu6 / T(6)); + } +}; + +template +struct HardSwish > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Array y; + HardSwish hardswish_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < int(rhs.size()); ++i) { + y[i] = hardswish_op(rhs[i]); + } + + return y; + } +}; // // GELU function definitions implemented as 
described by diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h new file mode 100644 index 0000000000..4239ee3a05 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -0,0 +1,62 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationHardSwish = LinearCombinationGeneric; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass From d7b5297f1c87edbbf41ae60346f6d432c940a4b2 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 14 Dec 2021 14:28:27 +0900 Subject: [PATCH 07/11] remove sigmoid for now --- include/cutlass/epilogue/thread/activation.h | 29 -------------------- 1 file changed, 29 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index b986a8b342..c6b2be2bed 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ 
b/include/cutlass/epilogue/thread/activation.h @@ -124,35 +124,6 @@ struct Sigmoid > { } }; -template <> -struct Sigmoid { - CUTLASS_HOST_DEVICE - half_t operator()(half_t const& scalar) const { - half_t exp_res; - #if defined(__CUDA_ARCH__) - exp_res = half_t(::hexp(-scalar.to_half())); - #else - exp_res = half_t(std::exp(float(-scalar))); - #endif - return half_t(1) / (half_t(1) + exp_res); - } -}; - -#if defined(CUTLASS_USE_FAST_MATH) -template -struct Sigmoid> { - CUTLASS_HOST_DEVICE - Array operator()(Array const& z) const { - using T = half_t; - multiplies> mul; - plus> add; - fast_tanh_op> tanh; - return mul(add(tanh(mul(z, cutlass::constants::half())), cutlass::constants::one()), - cutlass::constants::half()); - } -}; -#endif - // SiLu (swish) operator template struct SiLu { From 8ac1036c94c0251aae4ac9f1a9323903cf527ae4 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 15 Dec 2021 03:44:23 +0900 Subject: [PATCH 08/11] add description to silu and hardswish, and other doc update --- include/cutlass/epilogue/thread/activation.h | 11 ++++++++++- .../cutlass/epilogue/thread/linear_combination_gelu.h | 4 ++-- .../epilogue/thread/linear_combination_generic.h | 4 ++-- .../epilogue/thread/linear_combination_hardswish.h | 6 +++--- .../epilogue/thread/linear_combination_sigmoid.h | 6 +++--- .../cutlass/epilogue/thread/linear_combination_silu.h | 6 +++--- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index c6b2be2bed..6603e765c7 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -124,7 +124,11 @@ struct Sigmoid > { } }; -// SiLu (swish) operator +// SiLu (swish) operator introduced by Elfwing et al. 
in the following paper +// "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning" (2017) +// https://arxiv.org/pdf/1702.03118.pdf +// It is used in EfficientNet and YOLOv5, for example. +// Reference: https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html template struct SiLu { CUTLASS_HOST_DEVICE @@ -143,6 +147,11 @@ struct SiLu> { } }; +// Hardswish operator introduced by Howard et al. in the following paper +// "Searching for MobileNetV3" (2019) +// https://arxiv.org/pdf/1905.02244.pdf +// It is used in models based on MobilenetNetV3. +// Reference: https://pytorch.org/docs/stable/generated/torch.nn.Hardswish.html template struct HardSwish { CUTLASS_HOST_DEVICE diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index b3370974be..9a5fd6a633 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the GELU activation to an array of elements. 
/// -/// D = alpha * accumulator + beta * source + uniform +/// D = gelu(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h index 7ba87f3081..17f961e83b 100644 --- a/include/cutlass/epilogue/thread/linear_combination_generic.h +++ b/include/cutlass/epilogue/thread/linear_combination_generic.h @@ -42,9 +42,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by an activation function to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = activation(alpha * accumulator + beta * source + uniform) /// template < template class ActivationFunctor, diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h index 4239ee3a05..a5dcabbff9 100644 --- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with HardSwish operations used by epilogues. */ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the HardSwish activation to an array of elements. 
/// -/// D = alpha * accumulator + beta * source + uniform +/// D = hardswish(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 2afedb1c63..622a961ca5 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with Sigmoid operations used by epilogues. */ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the Sigmoid activation, to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = sigmoid(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index 2573db9bc1..a933af1228 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -23,7 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. + \brief Functor performing linear combination with SiLU operations used by epilogues. 
*/ #pragma once @@ -40,9 +40,9 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements. +/// Applies a linear combination operator followed by the SiLU activation to an array of elements. /// -/// D = alpha * accumulator + beta * source + uniform +/// D = silu(alpha * accumulator + beta * source + uniform) /// template < typename ElementOutput_, ///< Data type used to load and store tensors From 80e8fe70095041c5bbf880935d5b220368eb5441 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 10:36:14 +0900 Subject: [PATCH 09/11] Do not ignore Round --- include/cutlass/epilogue/thread/linear_combination_gelu.h | 2 +- include/cutlass/epilogue/thread/linear_combination_hardswish.h | 2 +- include/cutlass/epilogue/thread/linear_combination_sigmoid.h | 2 +- include/cutlass/epilogue/thread/linear_combination_silu.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 9a5fd6a633..2bf05b7b20 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationGELU = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h index a5dcabbff9..e6c37d506a 100644 --- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h +++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest >
using LinearCombinationHardSwish = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 622a961ca5..ea77a9537d 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSigmoid = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index a933af1228..92ad6a6e34 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSilu = LinearCombinationGeneric; + ElementCompute_, Round>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread From 7b9dde37af05b00e956d881a4c69afe751cd0219 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 11:48:06 +0900 Subject: [PATCH 10/11] use constant N --- include/cutlass/epilogue/thread/activation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 6603e765c7..7bc5042cc5 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -171,7 +171,7 @@ struct HardSwish > { HardSwish hardswish_op; CUTLASS_PRAGMA_UNROLL - 
for (int i = 0; i < int(rhs.size()); ++i) { + for (int i = 0; i < N; ++i) { y[i] = hardswish_op(rhs[i]); } From 0e0fe491d4758e104dab6056f784ce18f430e5cc Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 18 Dec 2021 14:40:28 +0900 Subject: [PATCH 11/11] Set isHeavy = true in sigmoid and silu epilogue --- include/cutlass/epilogue/thread/linear_combination_sigmoid.h | 2 +- include/cutlass/epilogue/thread/linear_combination_silu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index ea77a9537d..e5ef55c80d 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSigmoid = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h index 92ad6a6e34..e9a3e2c935 100644 --- a/include/cutlass/epilogue/thread/linear_combination_silu.h +++ b/include/cutlass/epilogue/thread/linear_combination_silu.h @@ -54,7 +54,7 @@ template < FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > using LinearCombinationSilu = LinearCombinationGeneric; + ElementCompute_, Round, true>; ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread