Merge pull request #1752 from su2code/vectorize_when_possible

Always use vectorization when the numerical scheme supports it
su2code · Sep 26, 2022 · bc6ef2a · bc6ef2a
2 parents d32ccec + c9af050
commit bc6ef2a
Show file tree

Hide file tree

Showing 58 changed files with 358 additions and 297 deletions.
diff --git a/Common/doc/docmain.hpp b/Common/doc/docmain.hpp
@@ -28,7 +28,7 @@
 /*! 
  * \mainpage SU2 version 7.4.0 "Blackbird"
  * SU2 suite is an open-source collection of C++ based software tools
- * to perform PDE analysis and PDE constrained optimization problems.  The toolset is designed with 
+ * to perform PDE analysis and PDE constrained optimization.  The toolset is designed with 
  * computational fluid dynamics and aerodynamic shape optimization in mind, but is extensible to
  * include other families of governing equations such as potential flow, electrodynamics, chemically reacting
  * flows, and many others.  SU2 is released under an
@@ -38,54 +38,53 @@
  */
 
 /*!
- * \defgroup Config Descriptions of Configuration Options.
+ * \defgroup Config Description of the Configuration Options
  * \brief Group of variables that can be set using the configuration file.
  */
 
 /*!
- * \defgroup ConvDiscr Discretization of the convective terms.
+ * \defgroup ConvDiscr Discretization of the convective terms
  * \brief Group of classes which define the numerical methods for 
  *		  discretizing the convective terms of a Partial Differential Equation. 
  *		  There are methods for solving the direct, adjoint and linearized 
  *		  systems of equations.
  */
 
 /*!
- * \defgroup ViscDiscr Discretization of the viscous terms.
+ * \defgroup ViscDiscr Discretization of the viscous terms
  * \brief Group of classes which define the numerical methods for 
  *		  discretizing the viscous terms of a Partial Differential Equation. 
  *		  There are methods for solving the direct, adjoint and linearized 
  *		  systems of equations.
  */
 
 /*!
- * \defgroup SourceDiscr Discretization of the source terms.
+ * \defgroup SourceDiscr Discretization of the source terms
  * \brief Group of classes which define the numerical methods for 
  *		  discretizing the source terms of a Partial Differential Equation. 
  *		  There are methods for solving the direct, adjoint and linearized 
  *		  systems of equations.
  */
 
 /*!
- * \defgroup Potential_Flow_Equation Solving the potential flow equation.
- * \brief Group of classes which define the system of Potential flow equation in 
- *		  three formulations: direct, adjoint, and linearized.
- */
-
-/*!
- * \defgroup Euler_Equations Solving the Euler's equations.
+ * \defgroup Euler_Equations Solving the Euler equations
  * \brief Group of classes which define the system of Euler equations in 
  *		  three formulations: direct, adjoint, and linearized.
  */
 
 /*!
- * \defgroup Navier_Stokes_Equations Solving the Navier-Stokes' equations.
+ * \defgroup Navier_Stokes_Equations Solving the Navier-Stokes equations
  * \brief Group of classes which define the system of Navier-Stokes equations in 
  *		  three formulations: direct, adjoint, and linearized.
  */
 
 /*!
- * \defgroup Turbulence_Model Solving the turbulence models.
+ * \defgroup Turbulence_Model Solving the turbulence model equations
  * \brief Group of classes which define the turbulence model in 
  *		  three formulations: direct, adjoint, and linearized.
  */
+
+/*!
+ * \defgroup Elasticity_Equations Solving the elasticity equations
+ * \brief Group of classes to solve solid deformation problems.
+ */
diff --git a/Common/include/basic_types/datatype_structure.hpp b/Common/include/basic_types/datatype_structure.hpp
@@ -114,6 +114,21 @@ namespace SU2_TYPE {
   FORCEINLINE void SetDerivative(su2double &, const passivedouble &) {}
 #endif
 
+  /*!
+   * \brief Get the passive value of any variable. For most types return directly,
+   * specialize for su2double to call GetValue.
+   * \note This is a struct instead of a function because the return type of the
+   * su2double specialization changes.
+   */
+  template <class T>
+  struct Passive {
+    FORCEINLINE static T Value(const T& val) {return val;}
+  };
+  template <>
+  struct Passive<su2double> {
+    FORCEINLINE static passivedouble Value(const su2double& val) {return GetValue(val);}
+  };
+
   /*!
    * \brief Casts the primitive value to int (uses GetValue, already implemented for each type).
    * \param[in] data - The non-primitive datatype.

diff --git a/Common/include/linear_algebra/vector_expressions.hpp b/Common/include/linear_algebra/vector_expressions.hpp
@@ -33,6 +33,7 @@
 #include <cassert>
 #include <cstdlib>
 #include <cmath>
+#include <cstdint>
 
 namespace VecExpr {
 
@@ -157,21 +158,28 @@ FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V,S>& v)                    \
   RETURNS( EXPR<Bcast<S>,V,S>(Bcast<S>(u), v.derived())                       \
 )                                                                             \
 
-/*--- std::max/min have issues (maybe because they return by reference).
- * For AD codi::max/min need to be used to avoid issues in debug builds. ---*/
-
-#if defined(CODI_REVERSE_TYPE) || defined(CODI_FORWARD_TYPE)
-#define max_impl math::max
-#define min_impl math::min
-#else
-#define max_impl(a,b) a<b? Scalar(b) : Scalar(a)
-#define min_impl(a,b) b<a? Scalar(b) : Scalar(a)
-#endif
-MAKE_BINARY_FUN(max, max_, max_impl)
-MAKE_BINARY_FUN(min, min_, min_impl)
+/*--- std::max/min have issues (because they return by reference).
+ * fmin and fmax return by value and thus are fine, but they would force
+ * conversions to double, to avoid that we provide integer overloads.
+ * We use int32/64 instead of int/long to avoid issues with Windows,
+ * where long is 32 bits (instead of 64 bits). ---*/
+
+#define MAKE_FMINMAX_OVERLOADS(TYPE)                          \
+FORCEINLINE TYPE fmax(TYPE a, TYPE b) { return a<b? b : a; }  \
+FORCEINLINE TYPE fmin(TYPE a, TYPE b) { return a<b? a : b; }
+MAKE_FMINMAX_OVERLOADS(int32_t)
+MAKE_FMINMAX_OVERLOADS(int64_t)
+MAKE_FMINMAX_OVERLOADS(uint32_t)
+MAKE_FMINMAX_OVERLOADS(uint64_t)
+/*--- Make the float and double versions of fmin/max available in this
+ * namespace to avoid ambiguous overloads. ---*/
+using std::fmax;
+using std::fmin;
+#undef MAKE_FMINMAX_OVERLOADS
+
+MAKE_BINARY_FUN(fmax, max_, fmax)
+MAKE_BINARY_FUN(fmin, min_, fmin)
 MAKE_BINARY_FUN(pow, pow_, math::pow)
-#undef max_impl
-#undef min_impl
 
 /*--- sts::plus and co. were tried, the code was horrendous (due to the forced
  * conversion between different types) and creating functions for these ops
@@ -190,20 +198,25 @@ MAKE_BINARY_FUN(operator/, div_, div_impl)
 #undef mul_impl
 #undef div_impl
 
-/*--- Relational operators need to be cast to the scalar type to allow vectorization. ---*/
-
-#define le_impl(a,b) Scalar(a<=b)
-#define ge_impl(a,b) Scalar(a>=b)
-#define eq_impl(a,b) Scalar(a==b)
-#define ne_impl(a,b) Scalar(a!=b)
-#define lt_impl(a,b) Scalar(a<b)
-#define gt_impl(a,b) Scalar(a>b)
+/*--- Relational operators need to be cast to the scalar type to allow vectorization.
+ * TO_PASSIVE is used to convert active scalars to passive, which CoDi will then capture
+ * by value in its expressions, and thus dangling references are avoided. No AD info
+ * is lost since these operators are non-differentiable. ---*/
+
+#define TO_PASSIVE(IMPL) SU2_TYPE::Passive<Scalar>::Value(IMPL)
+#define le_impl(a,b) TO_PASSIVE(a<=b)
+#define ge_impl(a,b) TO_PASSIVE(a>=b)
+#define eq_impl(a,b) TO_PASSIVE(a==b)
+#define ne_impl(a,b) TO_PASSIVE(a!=b)
+#define lt_impl(a,b) TO_PASSIVE(a<b)
+#define gt_impl(a,b) TO_PASSIVE(a>b)
 MAKE_BINARY_FUN(operator<=, le_, le_impl)
 MAKE_BINARY_FUN(operator>=, ge_, ge_impl)
 MAKE_BINARY_FUN(operator==, eq_, eq_impl)
 MAKE_BINARY_FUN(operator!=, ne_, ne_impl)
 MAKE_BINARY_FUN(operator<, lt_, lt_impl)
 MAKE_BINARY_FUN(operator>, gt_, gt_impl)
+#undef TO_PASSIVE
 #undef le_impl
 #undef ge_impl
 #undef eq_impl

diff --git a/Common/include/parallelization/special_vectorization.hpp b/Common/include/parallelization/special_vectorization.hpp
@@ -126,8 +126,8 @@ MAKE_BINARY_FUN(operator==, eq_p)
 MAKE_BINARY_FUN(operator!=, ne_p)
 MAKE_BINARY_FUN(operator<=, le_p)
 MAKE_BINARY_FUN(operator>=, ge_p)
-MAKE_BINARY_FUN(max, max_p)
-MAKE_BINARY_FUN(min, min_p)
+MAKE_BINARY_FUN(fmax, max_p)
+MAKE_BINARY_FUN(fmin, min_p)
 
 #undef MAKE_BINARY_FUN
 

diff --git a/Common/include/parallelization/vectorization.hpp b/Common/include/parallelization/vectorization.hpp
@@ -58,7 +58,17 @@ template<class T>
 constexpr size_t preferredLen() { return PREFERRED_SIZE / sizeof(T); }
 
 template<>
-constexpr size_t preferredLen<su2double>() { return PREFERRED_SIZE / sizeof(passivedouble); }
+constexpr size_t preferredLen<su2double>() {
+#ifdef CODI_REVERSE_TYPE
+  /*--- Use a SIMD size of 1 for reverse AD, larger sizes increase
+   * the pre-accumulation time with no performance benefit. ---*/
+  return 1;
+#else
+  /*--- For forward AD there is a performance benefit. This covers
+   * forward AD and primal mode (su2double == passivedouble). ---*/
+  return PREFERRED_SIZE / sizeof(passivedouble);
+#endif
+}
 
 /*!
  * \class Array

diff --git a/SU2_CFD/include/numerics/elasticity/CFEAElasticity.hpp b/SU2_CFD/include/numerics/elasticity/CFEAElasticity.hpp
@@ -32,11 +32,11 @@
 
 /*!
  * \class CFEAElasticity
+ * \ingroup Elasticity_Equations
  * \brief Abstract class for computing the tangent matrix and the residual for structural problems.
  * \note  At the next level of abstraction (linear or not) a class must define the constitutive term.
  *        The methods we override in this class with an empty implementation are here just to better
  *        document the public interface of this class hierarchy.
- * \ingroup FEM_Discr
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */

diff --git a/SU2_CFD/include/numerics/elasticity/CFEALinearElasticity.hpp b/SU2_CFD/include/numerics/elasticity/CFEALinearElasticity.hpp
@@ -33,7 +33,7 @@
 /*!
  * \class CFEALinearElasticity
  * \brief Class for computing the stiffness matrix of a linear, elastic problem.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */
@@ -88,7 +88,7 @@ class CFEALinearElasticity : public CFEAElasticity {
 /*!
  * \class CFEAMeshElasticity
  * \brief Particular case of linear elasticity used for mesh deformation.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */

diff --git a/SU2_CFD/include/numerics/elasticity/CFEANonlinearElasticity.hpp b/SU2_CFD/include/numerics/elasticity/CFEANonlinearElasticity.hpp
@@ -35,7 +35,7 @@
  *        This class does not implement a particular model, that will be done by its children.
  * \note  In addition to Compute_Constitutive_Matrix, derived classes MUST further implement
  *        Compute_Plane_Stress_Term and Compute_Stress_Tensor.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */

diff --git a/SU2_CFD/include/numerics/elasticity/nonlinear_models.hpp b/SU2_CFD/include/numerics/elasticity/nonlinear_models.hpp
@@ -33,7 +33,7 @@
 /*!
  * \class CFEM_NeoHookean_Comp
  * \brief Class for computing the constitutive and stress tensors for a neo-Hookean material model, compressible.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */
@@ -81,7 +81,7 @@ class CFEM_NeoHookean_Comp final : public CFEANonlinearElasticity {
 /*!
  * \class CFEM_NeoHookean_Comp
  * \brief Constitutive and stress tensors for a Knowles stored-energy function, nearly incompressible.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */
@@ -132,7 +132,7 @@ class CFEM_Knowles_NearInc final : public CFEANonlinearElasticity {
 /*!
  * \class CFEM_DielectricElastomer
  * \brief Class for computing the constitutive and stress tensors for a dielectric elastomer.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */
@@ -180,7 +180,7 @@ class CFEM_DielectricElastomer final : public CFEANonlinearElasticity {
 /*!
  * \class CFEM_IdealDE
  * \brief Class for computing the constitutive and stress tensors for a nearly-incompressible ideal DE.
- * \ingroup FEM_Discr
+ * \ingroup Elasticity_Equations
  * \author R.Sanchez
  * \version 7.4.0 "Blackbird"
  */

diff --git a/SU2_CFD/include/numerics_simd/CNumericsSIMD.cpp b/SU2_CFD/include/numerics_simd/CNumericsSIMD.cpp
@@ -130,9 +130,12 @@ CNumericsSIMD* createNumerics(const CConfig& config, int iMesh, const CVariable*
  * numerical methods.
  */
 CNumericsSIMD* CNumericsSIMD::CreateNumerics(const CConfig& config, int nDim, int iMesh, const CVariable* turbVars) {
+#ifndef CODI_REVERSE_TYPE
   if ((Double::Size < 4) && (SU2_MPI::GetRank() == MASTER_NODE)) {
-    cout << "WARNING: SU2 was not compiled for an AVX-capable architecture." << endl;
+    cout << "WARNING: SU2 was not compiled for an AVX-capable architecture. Performance could be better,\n"
+            "         see https://su2code.github.io/docs_v7/Build-SU2-Linux-MacOS/#compiler-optimizations" << endl;
   }
+#endif
   if (nDim == 2) return createNumerics<2>(config, iMesh, turbVars);
   if (nDim == 3) return createNumerics<3>(config, iMesh, turbVars);
 

diff --git a/SU2_CFD/include/numerics_simd/CNumericsSIMD.hpp b/SU2_CFD/include/numerics_simd/CNumericsSIMD.hpp
@@ -58,6 +58,7 @@ using SparseMatrixType = CSysMatrix<su2mixedfloat>;
 
 /*!
  * \class CNumericsSIMD
+ * \ingroup ConvDiscr
  * \brief Base class to define the interface.
  * \note See CNumericsEmptyDecorator.
  */

diff --git a/SU2_CFD/include/numerics_simd/flow/convection/centered.hpp b/SU2_CFD/include/numerics_simd/flow/convection/centered.hpp
@@ -36,6 +36,7 @@
 
 /*!
  * \class CCenteredBase
+ * \ingroup ConvDiscr
  * \brief Base class for Centered schemes, derived classes implement
  * the dissipation term in a const "finalizeFlux" method.
  * \note See CRoeBase for the role of Base.
@@ -186,6 +187,7 @@ class CCenteredBase : public Base {
 
 /*!
  * \class CJSTScheme
+ * \ingroup ConvDiscr
  * \brief Classical JST scheme with scalar dissipation.
  */
 template<class Decorator>
@@ -243,7 +245,7 @@ class CJSTScheme : public CCenteredBase<CJSTScheme<Decorator>,Decorator> {
     const auto si = gatherVariables(iPoint, solution.GetSensor());
     const auto sj = gatherVariables(jPoint, solution.GetSensor());
     const Double eps2 = kappa2 * 0.5*(si+sj) * sc2;
-    const Double eps4 = max(0.0, kappa4-eps2) * sc4;
+    const Double eps4 = fmax(0.0, kappa4-eps2) * sc4;
 
     /*--- Update flux and Jacobians with dissipation terms. ---*/
 
@@ -265,6 +267,7 @@ class CJSTScheme : public CCenteredBase<CJSTScheme<Decorator>,Decorator> {
 
 /*!
  * \class CJSTmatScheme
+ * \ingroup ConvDiscr
  * \brief JST scheme with matrix dissipation.
  */
 template<class Decorator>
@@ -321,7 +324,7 @@ class CJSTmatScheme : public CCenteredBase<CJSTmatScheme<Decorator>,Decorator> {
     const auto si = gatherVariables(iPoint, solution.GetSensor());
     const auto sj = gatherVariables(jPoint, solution.GetSensor());
     const Double eps2 = kappa2 * 0.5*(si+sj) * sc2;
-    const Double eps4 = max(0.0, kappa4-eps2) * sc4;
+    const Double eps4 = fmax(0.0, kappa4-eps2) * sc4;
 
     const auto lapl_i = gatherVariables<nVar>(iPoint, solution.GetUndivided_Laplacian());
     const auto lapl_j = gatherVariables<nVar>(jPoint, solution.GetUndivided_Laplacian());
@@ -357,10 +360,10 @@ class CJSTmatScheme : public CCenteredBase<CJSTmatScheme<Decorator>,Decorator> {
     lambda(nDim) = projVel + avgV.speedSound()*area;
     lambda(nDim+1) = projVel - avgV.speedSound()*area;
 
-    const Double maxLambda = max(lambda(nDim), -lambda(nDim+1));
+    const Double maxLambda = fmax(lambda(nDim), -lambda(nDim+1));
 
     for (size_t iVar = 0; iVar < nVar; ++iVar) {
-      lambda(iVar) = max(abs(lambda(iVar)), entropyFix*maxLambda);
+      lambda(iVar) = fmax(abs(lambda(iVar)), entropyFix*maxLambda);
     }
 
     /*--- Update flux and Jacobians with scaled dissipation terms. ---*/
@@ -391,6 +394,7 @@ class CJSTmatScheme : public CCenteredBase<CJSTmatScheme<Decorator>,Decorator> {
 
 /*!
  * \class CJSTkeScheme
+ * \ingroup ConvDiscr
  * \brief JST scheme without 4th order dissipation.
  */
 template<class Decorator>
@@ -461,6 +465,7 @@ class CJSTkeScheme : public CCenteredBase<CJSTkeScheme<Decorator>,Decorator> {
 
 /*!
  * \class CLaxScheme
+ * \ingroup ConvDiscr
  * \brief Lax–Friedrichs 1st order scheme.
  */
 template<class Decorator>