Skip to content

Commit

Permalink
Use Stride Kinds/IDs in Unstructured Nabla (#1799)
Browse files Browse the repository at this point in the history
Slightly reduces the memory required for strides and kernel arguments.
Gives speedups of up to just under 9% on Clang/Cray CUDA and around 3.5% on NVCC
(measured for fn_unstructured_nabla_fused_tuple_of_fields, double precision,
large domain).
  • Loading branch information
fthaler authored and havogt committed Sep 30, 2024
1 parent 083bd1c commit 90b9175
Show file tree
Hide file tree
Showing 9 changed files with 86,861 additions and 86,847 deletions.
28,688 changes: 14,344 additions & 14,344 deletions pyutils/perftest/references/daint_cray/128.json

Large diffs are not rendered by default.

29,248 changes: 14,624 additions & 14,624 deletions pyutils/perftest/references/daint_cray/256.json

Large diffs are not rendered by default.

28,676 changes: 14,338 additions & 14,338 deletions pyutils/perftest/references/daint_nvcc_cray/128.json

Large diffs are not rendered by default.

29,032 changes: 14,516 additions & 14,516 deletions pyutils/perftest/references/daint_nvcc_cray/256.json

Large diffs are not rendered by default.

28,696 changes: 14,348 additions & 14,348 deletions pyutils/perftest/references/daint_nvcc_gcc/128.json

Large diffs are not rendered by default.

29,262 changes: 14,631 additions & 14,631 deletions pyutils/perftest/references/daint_nvcc_gcc/256.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyutils/pyutils/runtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async def _run_async(command, log_output, **kwargs):
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
env=env.env,
limit=2**20,
limit=2**24,
**kwargs)

async def read_output(stream):
Expand Down
17 changes: 13 additions & 4 deletions tests/include/fn_mesh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,25 +89,34 @@ namespace gridtools {
constexpr int nlevels() const { return m_nz; }

// Builds a storage through storage::builder<StorageTraits>, using `dims...` as dimensions,
// T as element type and `init` as initializer.
//
// Template parameters:
//   T    - element type (defaults to FloatType).
//   Id   - compile-time storage ID; the default -1 selects builder.unknown_id(), any other
//          value selects builder.id<Id>().
//   Init - initializer type; this overload is disabled (SFINAE) when Init is an integral type
//          or an integral_constant, so that dimension-only calls pick the other overload.
//
// NOTE(review): this span is rendered from a diff without +/- markers. The first `return`
// statement appears to be the previous one-line body and the `auto builder = ...` /
// `if constexpr` lines its replacement -- confirm against the repository before reusing.
// NOTE(review): presumably storages built with the same Id share a stride kind (per the commit
// title) -- confirm against the storage builder documentation.
template <class T = FloatType,
int Id = -1,
class Init,
class... Dims,
std::enable_if_t<!(std::is_integral_v<Init> || is_integral_constant<Init>::value), int> = 0>
auto make_storage(Init const &init, Dims... dims) const {
return storage::builder<StorageTraits>.dimensions(dims...).template type<T>().initializer(init).unknown_id().build();
auto builder = storage::builder<StorageTraits>.dimensions(dims...).template type<T>().initializer(init);
// Id == -1 is the sentinel for "no explicit ID requested".
if constexpr (Id == -1)
return builder.unknown_id().build();
else
return builder.template id<Id>().build();
// disable incorrect warning "missing return statement at end of non-void function"
GT_NVCC_DIAG_PUSH_SUPPRESS(940)
}
GT_NVCC_DIAG_POP_SUPPRESS(940)

// Dimension-only overload of make_storage: enabled only when every type in Dims is an integral
// type or an integral_constant. It delegates to the initializer overload with a lambda that
// ignores its (variadic int) arguments and returns a value-initialized T().
//
// Template parameters:
//   T  - element type (defaults to FloatType).
//   Id - compile-time storage ID, default -1; passed on to the initializer overload by the
//        second `return` statement below.
//
// NOTE(review): this span is rendered from a diff without +/- markers. The `make_storage<T>`
// return appears to be the previous line and `make_storage<T, Id>` its replacement -- confirm
// against the repository before reusing.
template <class T = FloatType,
int Id = -1,
class... Dims,
std::enable_if_t<std::conjunction_v<std::bool_constant<std::is_integral<Dims>::value ||
is_integral_constant<Dims>::value>...>,
int> = 0>
auto make_storage(Dims... dims) const {
return make_storage<T>([](int...) { return T(); }, dims...);
return make_storage<T, Id>([](int...) { return T(); }, dims...);
}

// Convenience wrapper: perfectly forwards `args...` to make_storage with the element type
// changed to `T const`.
//
// Template parameters:
//   T  - element type before const-qualification (defaults to FloatType).
//   Id - compile-time storage ID, default -1; present only in the second template header and
//        passed on to make_storage by the second `return` statement.
//
// NOTE(review): this span is rendered from a diff without +/- markers. The first template
// header and first `return` appear to be the previous version, the second of each the
// replacement -- confirm against the repository before reusing.
template <class T = FloatType, class... Args>
template <class T = FloatType, int Id = -1, class... Args>
auto make_const_storage(Args &&...args) const {
return make_storage<T const>(std::forward<Args>(args)...);
return make_storage<T const, Id>(std::forward<Args>(args)...);
}

auto v2e_table() const {
Expand Down
87 changes: 46 additions & 41 deletions tests/regression/fn/fn_unstructured_nabla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,51 +184,56 @@ namespace {
apply_nabla_fused(vertex_backend.stencil_executor(), nabla, sign, vol, pp, s);
};

// Compile-time storage IDs used as the `Id` template argument of make_storage /
// make_const_storage. vertex_field_id tags the storages dimensioned
// (mesh.nvertices(), mesh.nlevels()): `pp` and the nabla outputs.
// NOTE(review): edge_field_id is not referenced anywhere in the visible part of this file; the
// edge-sized storage `s` is still built without an explicit ID -- confirm whether that is
// intentional.
static constexpr int vertex_field_id = 0;
static constexpr int edge_field_id = 1;

// Factory for the non-fused nabla computation.
//
// Given a backend, a mesh and the output `nabla`, returns a nullary closure that:
//   - holds copies of the mesh sizes (nvertices, nedges, nlevels), the v2e/e2v tables and the
//     constant input storages pp, sign, vol and s, all created once at capture time;
//   - on each call wraps both tables with sid_neighbor_table::as_neighbor_table (index
//     dimension key 0, neighbor dimension key 1, max neighbor count taken from mesh_t) and
//     invokes fencil(...).
//
// `nabla` is captured by reference, so it must outlive the returned closure.
//
// NOTE(review): this span is rendered from a diff without +/- markers. The first
// `return [...] {...};` appears to be the previous version and the second (with `return` on its
// own line) the replacement; the two differ only in that `pp` is built with
// make_const_storage<float_t, vertex_field_id> -- confirm against the repository before reusing.
constexpr inline auto make_comp = [](auto backend, auto const &mesh, auto &nabla) {
using mesh_t = std::remove_reference_t<decltype(mesh)>;
using float_t = typename mesh_t::float_t;
return [backend,
&nabla,
nvertices = mesh.nvertices(),
nedges = mesh.nedges(),
nlevels = mesh.nlevels(),
v2e_table = mesh.v2e_table(),
e2v_table = mesh.e2v_table(),
pp = mesh.make_const_storage(pp, mesh.nvertices(), mesh.nlevels()),
sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
vol = mesh.make_const_storage(vol, mesh.nvertices()),
s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_v2e_neighbors_t::value>(v2e_table);
auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_e2v_neighbors_t::value>(e2v_table);
fencil(backend, nvertices, nedges, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
};
return
[backend,
&nabla,
nvertices = mesh.nvertices(),
nedges = mesh.nedges(),
nlevels = mesh.nlevels(),
v2e_table = mesh.v2e_table(),
e2v_table = mesh.e2v_table(),
pp = mesh.template make_const_storage<float_t, vertex_field_id>(pp, mesh.nvertices(), mesh.nlevels()),
sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
vol = mesh.make_const_storage(vol, mesh.nvertices()),
s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_v2e_neighbors_t::value>(v2e_table);
auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_e2v_neighbors_t::value>(e2v_table);
fencil(backend, nvertices, nedges, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
};
};

// Factory for the fused nabla computation.
//
// Given a backend, a mesh and the output `nabla`, returns a nullary closure that:
//   - holds copies of nvertices, nlevels, the v2e/e2v tables and the constant input storages
//     pp, sign, vol and s, all created once at capture time (unlike make_comp, nedges is not
//     captured, although `s` is still dimensioned with mesh.nedges());
//   - on each call wraps both tables with sid_neighbor_table::as_neighbor_table (index
//     dimension key 0, neighbor dimension key 1, max neighbor count taken from mesh_t) and
//     invokes fencil_fused(...).
//
// `nabla` is captured by reference, so it must outlive the returned closure.
//
// NOTE(review): this span is rendered from a diff without +/- markers. The first
// `return [...] {...};` appears to be the previous version and the second (with `return` on its
// own line) the replacement; the two differ only in that `pp` is built with
// make_const_storage<float_t, vertex_field_id> -- confirm against the repository before reusing.
constexpr inline auto make_comp_fused = [](auto backend, auto const &mesh, auto &nabla) {
using mesh_t = std::remove_reference_t<decltype(mesh)>;
using float_t = typename mesh_t::float_t;
return [backend,
&nabla,
nvertices = mesh.nvertices(),
nlevels = mesh.nlevels(),
v2e_table = mesh.v2e_table(),
e2v_table = mesh.e2v_table(),
pp = mesh.make_const_storage(pp, mesh.nvertices(), mesh.nlevels()),
sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
vol = mesh.make_const_storage(vol, mesh.nvertices()),
s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_v2e_neighbors_t::value>(v2e_table);
auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_e2v_neighbors_t::value>(e2v_table);
fencil_fused(backend, nvertices, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
};
return
[backend,
&nabla,
nvertices = mesh.nvertices(),
nlevels = mesh.nlevels(),
v2e_table = mesh.v2e_table(),
e2v_table = mesh.e2v_table(),
pp = mesh.template make_const_storage<float_t, vertex_field_id>(pp, mesh.nvertices(), mesh.nlevels()),
sign = mesh.template make_const_storage<array<float_t, 6>>(sign, mesh.nvertices()),
vol = mesh.make_const_storage(vol, mesh.nvertices()),
s = mesh.template make_const_storage<tuple<float_t, float_t>>(s, mesh.nedges(), mesh.nlevels())] {
auto v2e_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_v2e_neighbors_t::value>(v2e_table);
auto e2v_ptr = sid_neighbor_table::as_neighbor_table<integral_constant<int, 0>,
integral_constant<int, 1>,
mesh_t::max_e2v_neighbors_t::value>(e2v_table);
fencil_fused(backend, nvertices, nlevels, v2e_ptr, e2v_ptr, nabla, pp, s, sign, vol);
};
};

constexpr inline auto make_expected = [](auto const &mesh) {
Expand Down Expand Up @@ -265,8 +270,8 @@ namespace {

GT_REGRESSION_TEST(fn_unstructured_nabla_tuple_of_fields, test_environment<>, fn_backend_t) {
auto mesh = TypeParam::fn_unstructured_mesh();
auto nabla0 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla =
sid::composite::keys<integral_constant<int, 0>, integral_constant<int, 1>>::make_values(nabla0, nabla1);

Expand All @@ -280,8 +285,8 @@ namespace {

GT_REGRESSION_TEST(fn_unstructured_nabla_fused_tuple_of_fields, test_environment<>, fn_backend_t) {
auto mesh = TypeParam::fn_unstructured_mesh();
auto nabla0 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.make_storage(mesh.nvertices(), mesh.nlevels());
auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla =
sid::composite::keys<integral_constant<int, 0>, integral_constant<int, 1>>::make_values(nabla0, nabla1);

Expand Down

0 comments on commit 90b9175

Please sign in to comment.