diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H
new file mode 100644
index 00000000000..f4dd41ca0c8
--- /dev/null
+++ b/Src/Base/AMReX_CTOParallelForImpl.H
@@ -0,0 +1,331 @@
+#ifndef AMREX_CTO_PARALLEL_FOR_H_
+#define AMREX_CTO_PARALLEL_FOR_H_
+
+#include <AMReX_BLassert.H>
+#include <AMReX_Box.H>
+#include <AMReX_Tuple.H>
+
+#include <array>
+#include <type_traits>
+
+/* This header is not for the users to include directly.  It's meant to be
+ * included in AMReX_GpuLaunch.H, which has included the headers needed
+ * here. */
+
+/* Thank Maikel Nadolski and Alex Sinn for the techniques used here! */
+
+namespace amrex {
+
+template <int... ctr>
+struct CompileTimeOptions {
+    // TypeList is defined in AMReX_Tuple.H
+    using list_type = TypeList<std::integral_constant<int, ctr>...>;
+};
+
+#if (__cplusplus >= 201703L)
+
+namespace meta
+{
+    template <typename... As, typename... Bs>
+    constexpr auto operator+ (TypeList<As...>, TypeList<Bs...>) {
+        return TypeList<As..., Bs...>{};
+    }
+
+    template <typename... Ls, typename A>
+    constexpr auto single_product (TypeList<Ls...>, A) {
+        return TypeList<decltype(Ls{} + TypeList<A>{})...>{};
+    }
+
+    template <typename LLs, typename... As>
+    constexpr auto operator* (LLs, TypeList<As...>) {
+        return (TypeList<>{} + ... + single_product(LLs{}, As{}));
+    }
+
+    template <typename... Ls>
+    constexpr auto cartesian_product_n (TypeList<Ls...>) {
+        return (TypeList<TypeList<>>{} * ... * Ls{});
+    }
+}
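+
+// For exposition: meta::cartesian_product_n takes a TypeList of TypeLists
+// and returns their n-ary Cartesian product.  For example,
+//     cartesian_product_n(TypeList<TypeList<A0,A1>, TypeList<B0,B1,B2>>{})
+// yields a TypeList of all 2*3 = 6 TypeList<Ai,Bj> combinations.  It is a
+// left fold over operator*, where single_product appends one option type
+// to every partial combination built so far.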
+
+namespace detail
+{
+    template <int MT, typename T, class F, typename... As>
+    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value, bool>
+    ParallelFor_helper2 (T const& N, F&& f, TypeList<As...>,
+                         std::array<int,sizeof...(As)> const& runtime_options)
+    {
+        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
+            if constexpr (std::is_integral<T>::value) {
+                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
+                {
+                    f(i, As{}...);
+                });
+            } else {
+                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+                {
+                    f(i, j, k, As{}...);
+                });
+            }
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    template <int MT, typename T, class F, typename... As>
+    std::enable_if_t<std::is_integral<T>::value, bool>
+    ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList<As...>,
+                         std::array<int,sizeof...(As)> const& runtime_options)
+    {
+        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
+            ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
+            {
+                f(i, j, k, n, As{}...);
+            });
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    template <int MT, typename T, class F, typename... PPs, typename RO>
+    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value>
+    ParallelFor_helper1 (T const& N, F&& f, TypeList<PPs...>,
+                         RO const& runtime_options)
+    {
+        bool found_option = (false || ... ||
+                             ParallelFor_helper2<MT>(N, std::forward<F>(f),
+                                                     PPs{}, runtime_options));
+        amrex::ignore_unused(found_option);
+        AMREX_ASSERT(found_option);
+    }
+
+    template <int MT, typename T, class F, typename... PPs, typename RO>
+    std::enable_if_t<std::is_integral<T>::value>
+    ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList<PPs...>,
+                         RO const& runtime_options)
+    {
+        bool found_option = (false || ...
+                             || ParallelFor_helper2<MT>(box, ncomp, std::forward<F>(f),
+                                                        PPs{}, runtime_options));
+        amrex::ignore_unused(found_option);
+        AMREX_ASSERT(found_option);
+    }
+}
+
+#endif
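+
+// Dispatch happens in two steps.  ParallelFor_helper1 receives the full
+// Cartesian product of the option lists and folds ParallelFor_helper2 over
+// it.  Each ParallelFor_helper2 instantiation compares the run time values
+// against one candidate combination and, on a match, launches the kernel
+// instantiated with that combination as integral_constant arguments.  At
+// most one combination can match, so at most one kernel is launched; the
+// number of kernels compiled (and thus compile time) grows with the
+// product of the option list sizes.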
+
+template <int MT, typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+             std::array<int,sizeof...(CTOs)> const& runtime_options,
+             T N, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(N, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
+
+template <int MT, class F, typename... CTOs>
+void ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+                  std::array<int,sizeof...(CTOs)> const& runtime_options,
+                  Box const& box, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(box, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
+
+template <int MT, typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+             std::array<int,sizeof...(CTOs)> const& runtime_options,
+             Box const& box, T ncomp, F&& f)
+{
+#if (__cplusplus >= 201703L)
+    using OptionsListList = TypeList<typename CTOs::list_type...>;
+    detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
+                                    meta::cartesian_product_n(OptionsListList{}),
+                                    runtime_options);
+#else
+    amrex::ignore_unused(box, ncomp, f, runtime_options);
+    static_assert(std::is_integral<F>::value, "This requires C++17");
+#endif
+}
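+
+// The documented overloads below are the user-facing entry points.  They
+// forward to the overloads above, supplying AMREX_GPU_MAX_THREADS for the
+// MT (max threads per block) template parameter.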
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses a fold expression to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param N an integer specifying the 1D for loop's range.
+ * \param f a callable object taking an integer and working on that iteration.
+ */
+template <typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> ctos,
+             std::array<int,sizeof...(CTOs)> const& option,
+             T N, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, N, std::forward<F>(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses a fold expression to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                            auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the 3D for loop's range.
+ * \param f a callable object taking three integers and working on the given cell.
+ */
+template <class F, typename... CTOs>
+void ParallelFor (TypeList<CTOs...> ctos,
+                  std::array<int,sizeof...(CTOs)> const& option,
+                  Box const& box, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, std::forward<F>(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses a fold expression to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                          CompileTimeOptions<B0,B1>>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
+                                                   auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if.  The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the iteration in 3D space.
+ * \param ncomp an integer specifying the range for iteration over components.
+ * \param f a callable object taking four integers and working on the given cell.
+ */
+template <typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral<T>::value>
+ParallelFor (TypeList<CTOs...> ctos,
+             std::array<int,sizeof...(CTOs)> const& option,
+             Box const& box, T ncomp, F&& f)
+{
+    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, ncomp, std::forward<F>(f));
+}
+
+}
+
+#endif
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index d1a9e352336..39fac18835e 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -443,4 +443,6 @@ namespace Gpu {
 
 #endif
 
+#include <AMReX_CTOParallelForImpl.H>
+
 #endif
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index c47fdcae706..38d45d4d4dc 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -223,6 +223,7 @@ target_sources( amrex
    AMReX_MFParallelForC.H
    AMReX_MFParallelForG.H
    AMReX_TagParallelFor.H
+   AMReX_CTOParallelForImpl.H
    AMReX_ParReduce.H
    # CUDA --------------------------------------------------------------------
    AMReX_CudaGraph.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index 79085ae70a1..5b1a0e7e267 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -100,6 +100,7 @@ C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H
 C$(AMREX_BASE)_headers += AMReX_MFParallelForG.H
 
 C$(AMREX_BASE)_headers += AMReX_TagParallelFor.H
+C$(AMREX_BASE)_headers += AMReX_CTOParallelForImpl.H
 
 C$(AMREX_BASE)_headers += AMReX_ParReduce.H
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 50cc2bb8cb2..8d318f918b8 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # List of subdirectories to search for CMakeLists.
 #
-set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser)
+set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser CTOParFor)
 
 if (AMReX_PARTICLES)
     list(APPEND AMREX_TESTS_SUBDIRS Particles)
diff --git a/Tests/CTOParFor/CMakeLists.txt b/Tests/CTOParFor/CMakeLists.txt
new file mode 100644
index 00000000000..57c1e7715e2
--- /dev/null
+++ b/Tests/CTOParFor/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(_sources main.cpp)
+set(_input_files)
+
+setup_test(_sources _input_files)
+
+unset(_sources)
+unset(_input_files)
diff --git a/Tests/CTOParFor/GNUmakefile b/Tests/CTOParFor/GNUmakefile
new file mode 100644
index 00000000000..0dbc65578af
--- /dev/null
+++ b/Tests/CTOParFor/GNUmakefile
@@ -0,0 +1,20 @@
+AMREX_HOME = ../../
+
+DEBUG = FALSE
+DIM = 3
+COMP = gcc
+
+USE_MPI = FALSE
+USE_OMP = FALSE
+USE_CUDA = FALSE
+
+TINY_PROFILE = FALSE
+
+CXXSTD = c++17
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/CTOParFor/Make.package b/Tests/CTOParFor/Make.package
new file mode 100644
index 00000000000..4497b0e25b9
--- /dev/null
+++ b/Tests/CTOParFor/Make.package
@@ -0,0 +1,4 @@
+CEXE_sources += main.cpp
+
+
+
diff --git a/Tests/CTOParFor/main.cpp b/Tests/CTOParFor/main.cpp
new file mode 100644
index 00000000000..0cf1d7ea35a
--- /dev/null
+++ b/Tests/CTOParFor/main.cpp
@@ -0,0 +1,64 @@
+#include <AMReX.H>
+#include <AMReX_IArrayBox.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc,argv);
+#if (__cplusplus >= 201703L)
+    {
+        enum A_options: int {
+            A0 = 0, A1
+        };
+
+        enum B_options: int {
+            B0 = 0, B1, B2
+        };
+
+        Box box(IntVect(0),IntVect(7));
+        IArrayBox fab(box,2);
+        fab.setVal(-10);
+
+        auto const& arr = fab.array();
+
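+        // This launch compiles 2*3 = 6 kernels, one per combination of
+        // A_options and B_options; the run time pair {ia, ib} selects the
+        // one that is launched.  Note that arr is copied into larr inside
+        // the lambda before any constexpr if, following the extended-lambda
+        // capture caveat documented in AMReX_CTOParallelForImpl.H.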
+        for (int ia = 0; ia < 2; ++ia) {
+            for (int ib = 0; ib < 3; ++ib) {
+                ParallelFor(TypeList<CompileTimeOptions<A0,A1>,
+                                     CompileTimeOptions<B0,B1,B2>>{},
+                            {ia, ib},
+                            box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                                       auto A_control,
+                                                       auto B_control)
+                {
+                    auto const& larr = arr;
+                    int a, b;
+                    if constexpr (A_control.value == 0) {
+                        a = 0;
+                    } else if constexpr (A_control.value == 1) {
+                        a = 1;
+                    } else {
+                        a = -1;
+                    }
+                    if constexpr (B_control.value == 0) {
+                        b = 0;
+                    } else if constexpr (B_control.value == 1) {
+                        b = 1;
+                    } else if constexpr (B_control.value == 2) {
+                        b = 2;
+                    } else if constexpr (B_control.value == 3) {
+                        b = 3;
+                    }
+                    larr(i,j,k) = a*10 + b;
+                });
+
+                auto s = fab.sum(0);
+                AMREX_ALWAYS_ASSERT(s == box.numPts()*(ia*10+ib));
+            }
+        }
+    }
+#else
+    amrex::Print() << "This test requires C++17." << std::endl;
+#endif
+    amrex::Finalize();
+}