Use Stride Kinds/IDs in Unstructured Nabla (#1799)

Slightly reduces the memory required for strides / kernel arguments. Gives speedups of up to almost 9% with Clang/Cray CUDA and around 3.5% with NVCC (measured for fn_unstructured_nabla_fused_tuple_of_fields, double precision, large domain).
GridTools · Aug 5, 2024 · ffcf790
1 parent b6e3dce
commit ffcf790
Show file tree

Hide file tree

Showing 9 changed files with 86,861 additions and 86,847 deletions.
diff --git a/pyutils/perftest/references/daint_cray/128.json b/pyutils/perftest/references/daint_cray/128.json
diff --git a/pyutils/perftest/references/daint_cray/256.json b/pyutils/perftest/references/daint_cray/256.json
diff --git a/pyutils/perftest/references/daint_nvcc_cray/128.json b/pyutils/perftest/references/daint_nvcc_cray/128.json
diff --git a/pyutils/perftest/references/daint_nvcc_cray/256.json b/pyutils/perftest/references/daint_nvcc_cray/256.json
diff --git a/pyutils/perftest/references/daint_nvcc_gcc/128.json b/pyutils/perftest/references/daint_nvcc_gcc/128.json
diff --git a/pyutils/perftest/references/daint_nvcc_gcc/256.json b/pyutils/perftest/references/daint_nvcc_gcc/256.json
diff --git a/pyutils/pyutils/runtools.py b/pyutils/pyutils/runtools.py
@@ -14,7 +14,7 @@ async def _run_async(command, log_output, **kwargs):
         stdout=asyncio.subprocess.PIPE,
         stderr=asyncio.subprocess.PIPE,
         env=env.env,
-        limit=2**20,
+        limit=2**24,
         **kwargs)
 
     async def read_output(stream):

diff --git a/tests/include/fn_mesh.hpp b/tests/include/fn_mesh.hpp
@@ -89,25 +89,34 @@ namespace gridtools {
         constexpr int nlevels() const { return m_nz; }
 
         template <class T = FloatType,
+            int Id = -1,
             class Init,
             class... Dims,
             std::enable_if_t<!(std::is_integral_v<Init> || is_integral_constant<Init>::value), int> = 0>
         auto make_storage(Init const &init, Dims... dims) const {
-            return storage::builder<StorageTraits>.dimensions(dims...).template type<T>().initializer(init).unknown_id().build();
+            auto builder = storage::builder<StorageTraits>.dimensions(dims...).template type<T>().initializer(init);
+            if constexpr (Id == -1)
+                return builder.unknown_id().build();
+            else
+                return builder.template id<Id>().build();
+            // disable incorrect warning "missing return statement at end of non-void function"
+            GT_NVCC_DIAG_PUSH_SUPPRESS(940)
         }
+        GT_NVCC_DIAG_POP_SUPPRESS(940)
 
         template <class T = FloatType,
+            int Id = -1,
             class... Dims,
             std::enable_if_t<std::conjunction_v<std::bool_constant<std::is_integral<Dims>::value ||
                                                                    is_integral_constant<Dims>::value>...>,
                 int> = 0>
         auto make_storage(Dims... dims) const {
-            return make_storage<T>([](int...) { return T(); }, dims...);
+            return make_storage<T, Id>([](int...) { return T(); }, dims...);
         }
 
-        template <class T = FloatType, class... Args>
+        template <class T = FloatType, int Id = -1, class... Args>
         auto make_const_storage(Args &&...args) const {
-            return make_storage<T const>(std::forward<Args>(args)...);
+            return make_storage<T const, Id>(std::forward<Args>(args)...);
         }
 
         auto v2e_table() const {

diff --git a/tests/regression/fn/fn_unstructured_nabla.cpp b/tests/regression/fn/fn_unstructured_nabla.cpp
@@ -184,51 +184,56 @@ namespace {
         apply_nabla_fused(vertex_backend.stencil_executor(), nabla, sign, vol, pp, s);
     };
 
+    static constexpr int vertex_field_id = 0;
+    static constexpr int edge_field_id = 1;
+
     constexpr inline auto make_comp = [](auto backend, auto const &mesh, auto &nabla) {
         using mesh_t = std::remove_reference_t<decltype(mesh)>;
         using float_t = typename mesh_t::float_t;
-        return [backend,
-                   &nabla,
-                   nvertices = mesh.nvertices(),
-                   nedges = mesh.nedges(),
-                   nlevels = mesh.nlevels(),
-                   v2e_table = mesh.v2e_table(),
-                   e2v_table = mesh.e2v_table(),
-                   pp = mesh.make_const_storage(pp, mesh.nvertices(), mesh.nlevels()),
-                   sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
-                   vol = mesh.make_const_storage(vol, mesh.nvertices()),
-                   s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
-            auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
-                integral_constant<int, 1>,
-                mesh_t::max_v2e_neighbors_t::value>(v2e_table);
-            auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
-                integral_constant<int, 1>,
-                mesh_t::max_e2v_neighbors_t::value>(e2v_table);
-            fencil(backend, nvertices, nedges, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
-        };
+        return
+            [backend,
+                &nabla,
+                nvertices = mesh.nvertices(),
+                nedges = mesh.nedges(),
+                nlevels = mesh.nlevels(),
+                v2e_table = mesh.v2e_table(),
+                e2v_table = mesh.e2v_table(),
+                pp = mesh.template make_const_storage<float_t, vertex_field_id>(pp, mesh.nvertices(), mesh.nlevels()),
+                sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
+                vol = mesh.make_const_storage(vol, mesh.nvertices()),
+                s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
+                auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
+                    integral_constant<int, 1>,
+                    mesh_t::max_v2e_neighbors_t::value>(v2e_table);
+                auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
+                    integral_constant<int, 1>,
+                    mesh_t::max_e2v_neighbors_t::value>(e2v_table);
+                fencil(backend, nvertices, nedges, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
+            };
     };
 
     constexpr inline auto make_comp_fused = [](auto backend, auto const &mesh, auto &nabla) {
         using mesh_t = std::remove_reference_t<decltype(mesh)>;
         using float_t = typename mesh_t::float_t;
-        return [backend,
-                   &nabla,
-                   nvertices = mesh.nvertices(),
-                   nlevels = mesh.nlevels(),
-                   v2e_table = mesh.v2e_table(),
-                   e2v_table = mesh.e2v_table(),
-                   pp = mesh.make_const_storage(pp, mesh.nvertices(), mesh.nlevels()),
-                   sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
-                   vol = mesh.make_const_storage(vol, mesh.nvertices()),
-                   s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
-            auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
-                integral_constant<int, 1>,
-                mesh_t::max_v2e_neighbors_t::value>(v2e_table);
-            auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
-                integral_constant<int, 1>,
-                mesh_t::max_e2v_neighbors_t::value>(e2v_table);
-            fencil_fused(backend, nvertices, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
-        };
+        return
+            [backend,
+                &nabla,
+                nvertices = mesh.nvertices(),
+                nlevels = mesh.nlevels(),
+                v2e_table = mesh.v2e_table(),
+                e2v_table = mesh.e2v_table(),
+                pp = mesh.template make_const_storage<float_t, vertex_field_id>(pp, mesh.nvertices(), mesh.nlevels()),
+                sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
+                vol = mesh.make_const_storage(vol, mesh.nvertices()),
+                s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
+                auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
+                    integral_constant<int, 1>,
+                    mesh_t::max_v2e_neighbors_t::value>(v2e_table);
+                auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
+                    integral_constant<int, 1>,
+                    mesh_t::max_e2v_neighbors_t::value>(e2v_table);
+                fencil_fused(backend, nvertices, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
+            };
     };
 
     constexpr inline auto make_expected = [](auto const &mesh) {
@@ -265,8 +270,8 @@ namespace {
 
     GT_REGRESSION_TEST(fn_unstructured_nabla_tuple_of_fields, test_environment<>, fn_backend_t) {
         auto mesh = TypeParam::fn_unstructured_mesh();
-        auto nabla0 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
-        auto nabla1 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
+        auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
+        auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
         auto nabla =
             sid::composite::keys<integral_constant<int, 0>, integral_constant<int, 1>>::make_values(nabla0, nabla1);
 
@@ -280,8 +285,8 @@ namespace {
 
     GT_REGRESSION_TEST(fn_unstructured_nabla_fused_tuple_of_fields, test_environment<>, fn_backend_t) {
         auto mesh = TypeParam::fn_unstructured_mesh();
-        auto nabla0 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
-        auto nabla1 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
+        auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
+        auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
         auto nabla =
             sid::composite::keys<integral_constant<int, 0>, integral_constant<int, 1>>::make_values(nabla0, nabla1);