Skip to content

Commit

Permalink
Yield from atomic_fence / atomic_ref operations to guarantee forward …
Browse files Browse the repository at this point in the history
…progress
  • Loading branch information
fknorr committed Jan 2, 2024
1 parent f89c565 commit 65d5e76
Show file tree
Hide file tree
Showing 12 changed files with 274 additions and 257 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ add_library(simsycl
include/simsycl/detail/config.hh
include/simsycl/detail/coordinate.hh
include/simsycl/detail/hash.hh
include/simsycl/detail/schedule.hh
include/simsycl/detail/subscript.hh
include/simsycl/detail/utils.hh
include/simsycl/detail/vec_swizzles.inc
Expand Down Expand Up @@ -119,7 +120,7 @@ add_library(simsycl
src/simsycl/check.cc
src/simsycl/context.cc
src/simsycl/device.cc
src/simsycl/handler.cc
src/simsycl/schedule.cc
src/simsycl/platform.cc
src/simsycl/queue.cc
src/simsycl/system.cc
Expand Down
2 changes: 1 addition & 1 deletion include/simsycl/detail/coordinate.hh
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ class coordinate {

#define SIMSYCL_DETAIL_DEFINE_COORDINATE_UNARY_POSTFIX_OPERATOR(op) \
friend constexpr Interface operator op(Interface &lhs, int) { \
Interface result = lhs = make_interface_type(); \
Interface result = lhs; \
for(int d = 0; d < Dimensions; ++d) { lhs[d] op; } \
return result; \
}
Expand Down
2 changes: 1 addition & 1 deletion include/simsycl/detail/group_operation_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ auto perform_group_operation(G g, group_operation_id id, const Spec &spec) {

// wait for all work items to enter this group operation
for(;;) {
this_concurrent_nd_item.yield_to_scheduler();
detail::yield_to_kernel_scheduler();
// we cannot preserve a reference into `operations` across a yield since it might be resized by another item
const auto &op = group_instance.operations[new_op_index];
if(op.num_work_items_participating == op.expected_num_work_items) break;
Expand Down
195 changes: 195 additions & 0 deletions include/simsycl/detail/schedule.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#pragma once

#include <cstddef>
#include <cstring>
#include <functional>
#include <memory>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include "../sycl/device.hh"
#include "../sycl/forward.hh"
#include "../sycl/id.hh"
#include "../sycl/item.hh"
#include "../sycl/nd_item.hh"
#include "../sycl/nd_range.hh"
#include "../sycl/range.hh"


namespace simsycl::detail {

// Tag type selecting the `sequential_for` overloads that iterate without a global offset.
// Pass the `no_offset` constant where an offset argument would otherwise go.
struct no_offset_t {
} inline constexpr no_offset;

// Invokes `func(item, args...)` once for every index of a 1-dimensional `range`, in ascending order.
// NOTE(review): `args` are forwarded on every iteration — presumably they are lvalue references
// (e.g. reducers); forwarding an rvalue here would move from it repeatedly. Verify callers.
template<typename Func, typename... Params>
void sequential_for(const sycl::range<1> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
    sycl::id<1> id;
    for(id[0] = 0; id[0] < range[0]; ++id[0]) { //
        func(make_item(id, range), std::forward<Params>(args)...);
    }
}

// Invokes `func(item, args...)` once for every index of a 2-dimensional `range`, in row-major order.
template<typename Func, typename... Params>
void sequential_for(const sycl::range<2> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
    const auto extent0 = range[0];
    const auto extent1 = range[1];
    sycl::id<2> idx;
    for(idx[0] = 0; idx[0] < extent0; ++idx[0]) {
        for(idx[1] = 0; idx[1] < extent1; ++idx[1]) {
            func(make_item(idx, range), std::forward<Params>(args)...);
        }
    }
}

// Invokes `func(item, args...)` once for every index of a 3-dimensional `range`, in row-major order.
template<typename Func, typename... Params>
void sequential_for(const sycl::range<3> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
    const auto extent0 = range[0];
    const auto extent1 = range[1];
    const auto extent2 = range[2];
    sycl::id<3> idx;
    for(idx[0] = 0; idx[0] < extent0; ++idx[0]) {
        for(idx[1] = 0; idx[1] < extent1; ++idx[1]) {
            for(idx[2] = 0; idx[2] < extent2; ++idx[2]) {
                func(make_item(idx, range), std::forward<Params>(args)...);
            }
        }
    }
}

// Invokes `func(item, args...)` once for every index of a 1-dimensional `range` shifted by `offset`
// (iterates [offset, offset + range)).
template<typename Func, typename... Params>
void sequential_for(const sycl::range<1> &range, const sycl::id<1> &offset, Func &&func, Params &&...args) {
    sycl::id<1> id;
    for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) { //
        func(make_item(id, range, offset), std::forward<Params>(args)...);
    }
}

// Invokes `func(item, args...)` once for every index of a 2-dimensional `range` shifted by `offset`,
// in row-major order.
template<typename Func, typename... Params>
void sequential_for(const sycl::range<2> &range, const sycl::id<2> &offset, Func &&func, Params &&...args) {
    const auto end0 = offset[0] + range[0];
    const auto end1 = offset[1] + range[1];
    sycl::id<2> idx;
    for(idx[0] = offset[0]; idx[0] < end0; ++idx[0]) {
        for(idx[1] = offset[1]; idx[1] < end1; ++idx[1]) {
            func(make_item(idx, range, offset), std::forward<Params>(args)...);
        }
    }
}

// Invokes `func(item, args...)` once for every index of a 3-dimensional `range` shifted by `offset`,
// in row-major order.
template<typename Func, typename... Params>
void sequential_for(const sycl::range<3> &range, const sycl::id<3> &offset, Func &&func, Params &&...args) {
    const auto end0 = offset[0] + range[0];
    const auto end1 = offset[1] + range[1];
    const auto end2 = offset[2] + range[2];
    sycl::id<3> idx;
    for(idx[0] = offset[0]; idx[0] < end0; ++idx[0]) {
        for(idx[1] = offset[1]; idx[1] < end1; ++idx[1]) {
            for(idx[2] = offset[2]; idx[2] < end2; ++idx[2]) {
                func(make_item(idx, range, offset), std::forward<Params>(args)...);
            }
        }
    }
}


// Decodes a flat row-major index into its multi-dimensional id within `range`:
// the last dimension varies fastest, matching the iteration order of `sequential_for`.
template<int Dimensions>
sycl::id<Dimensions> linear_index_to_id(const sycl::range<Dimensions> &range, size_t linear_index) {
    sycl::id<Dimensions> id;
    int d = Dimensions;
    while(d-- > 0) {
        const auto extent = range[d];
        id[d] = linear_index % extent;
        linear_index /= extent;
    }
    return id;
}


// Describes one local-memory allocation requested by a kernel.
struct local_memory_requirement {
    // Slot holding the pointer to the backing allocation. NOTE(review): this is a
    // pointer-to-pointer, presumably so the scheduler can patch in a per-work-group
    // allocation address that the kernel-side handle observes — verify against schedule.cc.
    std::unique_ptr<void *> ptr;
    size_t size = 0;  // allocation size in bytes
    size_t align = 1; // required alignment in bytes
};


// Type-erased kernel signature used by the nd-range scheduler.
template<int Dimensions>
using nd_kernel = std::function<void(const sycl::nd_item<Dimensions> &)>;

// Executes `kernel` for every work item of `range`, providing `local_memory` allocations.
// Declared here, defined out of line — presumably in src/simsycl/schedule.cc (added alongside
// this header); confirm explicit instantiations exist for Dimensions = 1, 2, 3.
template<int Dimensions>
void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
    const std::vector<local_memory_requirement> &local_memory, const nd_kernel<Dimensions> &kernel);

// Convenience overload: wraps an arbitrary callable plus trailing arguments in a type-erased
// `nd_kernel` and forwards to the out-of-line overload. The constraint keeps this overload from
// being chosen when the caller already passes an `nd_kernel`.
// NOTE(review): `args` are forwarded each time the lambda runs (once per work item) — presumably
// they are lvalue references; an rvalue would be moved from repeatedly. Verify callers.
template<int Dimensions, typename Func, typename... Params>
    requires(!std::is_same_v<std::remove_cvref_t<Func>, nd_kernel<Dimensions>>)
void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
    const std::vector<local_memory_requirement> &local_memory, Func &&func, Params &&...args) {
    const nd_kernel<Dimensions> kernel(
        [&](const sycl::nd_item<Dimensions> &item) { func(item, std::forward<Params>(args)...); });
    dispatch_for_nd_range(device, range, local_memory, kernel);
}

// Unpacks the caller's parameter tuple — elements at `ReductionIndices` plus the kernel functor at
// `KernelIndex` — and runs the kernel sequentially over `range` without an offset.
template<int Dimensions, typename ParamTuple, size_t... ReductionIndices, size_t KernelIndex>
void dispatch_for(const sycl::range<Dimensions> &range, ParamTuple &&params,
    std::index_sequence<ReductionIndices...> /* reduction_indices */,
    std::index_sequence<KernelIndex> /* kernel_index */) {
    const auto &kernel_func = std::get<KernelIndex>(params);
    detail::sequential_for(range, no_offset, kernel_func, std::get<ReductionIndices>(params)...);
}

// nd-range counterpart of `dispatch_for` above: unpacks the parameter tuple and hands the kernel
// (plus the elements at `ReductionIndices`) to the nd-range scheduler.
template<int Dimensions, typename ParamTuple, size_t... ReductionIndices, size_t KernelIndex>
void dispatch_for(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
    const std::vector<local_memory_requirement> &local_memory, ParamTuple &&params,
    std::index_sequence<ReductionIndices...> /* reduction_indices */,
    std::index_sequence<KernelIndex> /* kernel_index */) {
    const auto &kernel_func = std::get<KernelIndex>(params);
    detail::dispatch_for_nd_range(device, range, local_memory, kernel_func, std::get<ReductionIndices>(params)...);
}

// Simple-range parallel_for: the last element of `rest` is the kernel functor, any preceding
// elements are treated as reductions (see `dispatch_for`). Enabled only for non-empty `rest`.
template<int Dimensions, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
void parallel_for(sycl::range<Dimensions> num_work_items, Rest &&...rest) {
    dispatch_for(num_work_items, std::forward_as_tuple(std::forward<Rest>(rest)...),
        std::make_index_sequence<sizeof...(Rest) - 1>(), std::index_sequence<sizeof...(Rest) - 1>());
}

// Offset variant: runs `kernel_func` sequentially over `num_work_items` shifted by
// `work_item_offset`. This overload takes no reduction arguments.
template<typename KernelType, int Dimensions>
void parallel_for(
    sycl::range<Dimensions> num_work_items, sycl::id<Dimensions> work_item_offset, KernelType &&kernel_func) {
    detail::sequential_for(num_work_items, work_item_offset, kernel_func);
}

// nd-range parallel_for: the last element of `rest` is the kernel functor, preceding elements are
// reductions. `KernelName` is accepted for SYCL source compatibility but is not used here.
template<typename KernelName = unnamed_kernel, int Dimensions, typename... Rest,
    std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
void parallel_for(const sycl::device &device, sycl::nd_range<Dimensions> execution_range,
    const std::vector<local_memory_requirement> &local_memory, Rest &&...rest) {
    detail::dispatch_for(device, execution_range, local_memory, std::forward_as_tuple(std::forward<Rest>(rest)...),
        std::make_index_sequence<sizeof...(Rest) - 1>(), std::index_sequence<sizeof...(Rest) - 1>());
}

// Hierarchical parallelism, 1-dimensional: invokes `kernel_func` once per work group, giving each
// invocation a freshly constructed group backed by its own `concurrent_group` state.
template<typename WorkgroupFunctionType>
void parallel_for_work_group(sycl::range<1> num_work_groups, std::optional<sycl::range<1>> work_group_size,
    const WorkgroupFunctionType &kernel_func) {
    sycl::id<1> gid;
    for(gid[0] = 0; gid[0] < num_work_groups[0]; ++gid[0]) {
        concurrent_group state;
        sycl::group<1> group = make_hierarchical_group(make_item(gid, num_work_groups), work_group_size, &state);
        kernel_func(group);
    }
}

// Hierarchical parallelism, 2-dimensional: invokes `kernel_func` once per work group in row-major
// group order; each invocation gets a fresh `concurrent_group` state object.
template<typename WorkgroupFunctionType>
void parallel_for_work_group(sycl::range<2> num_work_groups, std::optional<sycl::range<2>> work_group_size,
    const WorkgroupFunctionType &kernel_func) {
    sycl::id<2> group_id;
    for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
        for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
            concurrent_group impl;
            sycl::group<2> group
                = make_hierarchical_group(make_item(group_id, num_work_groups), work_group_size, &impl);
            kernel_func(group);
        }
    }
}

// Hierarchical parallelism, 3-dimensional: invokes `kernel_func` once per work group in row-major
// group order; each invocation gets a fresh `concurrent_group` state object.
template<typename WorkgroupFunctionType>
void parallel_for_work_group(sycl::range<3> num_work_groups, std::optional<sycl::range<3>> work_group_size,
    const WorkgroupFunctionType &kernel_func) {
    sycl::id<3> group_id;
    for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
        for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
            for(group_id[2] = 0; group_id[2] < num_work_groups[2]; ++group_id[2]) {
                concurrent_group impl;
                sycl::group<3> group
                    = make_hierarchical_group(make_item(group_id, num_work_groups), work_group_size, &impl);
                kernel_func(group);
            }
        }
    }
}

} // namespace simsycl::detail
6 changes: 5 additions & 1 deletion include/simsycl/sycl/atomic_fence.hh
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#pragma once

#include "enums.hh"
#include "forward.hh"


namespace simsycl::sycl {

/// Implements sycl::atomic_fence. `order` and `scope` are intentionally ignored: no hardware
/// ordering is performed here; instead the call yields to the kernel scheduler so that other
/// work items spinning on atomics can make progress.
inline void atomic_fence(memory_order order, memory_scope scope) {
    (void)order;
    (void)scope;
    // Guarantee forward progress in kernels that use atomics for synchronization. It is somewhat unclear to me whether
    // that is strictly necessary if order == relaxed since that does not introduce any ordering.
    detail::maybe_yield_to_kernel_scheduler();
}

} // namespace simsycl::sycl
Loading

0 comments on commit 65d5e76

Please sign in to comment.