From a223af950688f8246414b3bc39328917e1c7e2a7 Mon Sep 17 00:00:00 2001 From: Fabian Knorr Date: Wed, 27 Dec 2023 10:11:05 +0100 Subject: [PATCH] Configure simulated system via env SIMSYCL_CONFIG=system.json --- CMakeLists.txt | 28 ++- cmake/simsycl-config.cmake.in | 2 + include/simsycl/detail/config.hh | 26 --- include/simsycl/sycl/device.hh | 6 +- include/simsycl/sycl/forward.hh | 4 +- include/simsycl/sycl/kernel.hh | 21 +- include/simsycl/sycl/platform.hh | 8 +- include/simsycl/sycl/sub_group.hh | 20 +- include/simsycl/sycl/vec.hh | 2 +- include/simsycl/system.hh | 31 ++- include/simsycl/templates.hh | 15 -- src/simsycl/device.cc | 29 ++- src/simsycl/platform.cc | 4 +- src/simsycl/schedule.cc | 43 +++- src/simsycl/system.cc | 233 ++++++++++++++++--- src/simsycl/system_config.cc | 370 ++++++++++++++++++++++++++++++ src/simsycl/templates.cc | 109 --------- test/CMakeLists.txt | 3 + test/group_op_tests.cc | 32 +-- test/launch_tests.cc | 66 ++++-- test/test_setup.cc | 15 ++ test/test_utils.hh | 15 ++ 22 files changed, 813 insertions(+), 269 deletions(-) delete mode 100644 include/simsycl/detail/config.hh delete mode 100644 include/simsycl/templates.hh create mode 100644 src/simsycl/system_config.cc delete mode 100644 src/simsycl/templates.cc create mode 100644 test/test_setup.cc create mode 100644 test/test_utils.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index e556a54..14040b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.13) +cmake_policy(SET CMP0135 NEW) # ExternalProject downloads will have archive timestamps set(SIMSYCL_VERSION 0.1) set(SIMSYCL_VERSION_MAJOR 0) @@ -26,6 +27,23 @@ endif() find_package(Boost 1.70 COMPONENTS context REQUIRED) +include(FetchContent) + +set(LIBENVPP_INSTALL ON CACHE BOOL "" FORCE) # If installation is desired. +FetchContent_Declare(libenvpp + GIT_REPOSITORY https://github.com/ph3at/libenvpp.git + GIT_TAG v1.4.0 +) +FetchContent_MakeAvailable(libenvpp) + +set(JSON_BuildTests OFF CACHE INTERNAL "") +set(JSON_Install ON CACHE INTERNAL "") +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz + URL_HASH SHA256=d6c65aca6b1ed68e7a182f4757257b107ae403032760ed6ef121c9d55e81757d +) +FetchContent_MakeAvailable(nlohmann_json) + include(CheckTypeSize) check_type_size(_Float16 FLOAT16 BUILTIN_TYPES_ONLY LANGUAGE CXX) if (HAVE_FLOAT16) @@ -64,7 +82,6 @@ add_library(simsycl include/simsycl/sycl.hh include/simsycl/detail/allocation.hh include/simsycl/detail/check.hh - include/simsycl/detail/config.hh include/simsycl/detail/coordinate.hh include/simsycl/detail/hash.hh include/simsycl/detail/schedule.hh @@ -115,7 +132,6 @@ add_library(simsycl include/simsycl/sycl/usm.hh include/simsycl/sycl/vec.hh include/simsycl/system.hh - include/simsycl/templates.hh ${CONFIG_PATH} src/simsycl/check.cc src/simsycl/context.cc @@ -124,9 +140,13 @@ add_library(simsycl src/simsycl/platform.cc src/simsycl/queue.cc src/simsycl/system.cc - src/simsycl/templates.cc + src/simsycl/system_config.cc +) +target_link_libraries(simsycl PRIVATE + Boost::context + nlohmann_json::nlohmann_json + libenvpp::libenvpp ) -target_link_libraries(simsycl Boost::context) target_include_directories(simsycl PUBLIC $ $ diff --git a/cmake/simsycl-config.cmake.in b/cmake/simsycl-config.cmake.in index 95164a4..a80bee9 100644 --- a/cmake/simsycl-config.cmake.in +++ b/cmake/simsycl-config.cmake.in @@ -15,6 +15,8 @@ set(SIMSYCL_ORIGINAL_CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}") set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${SIMSYCL_CMAKE_DIR}") find_dependency(Boost 1.70 COMPONENTS context REQUIRED) +find_dependency(nlohmann_json) +find_dependency(libenvpp) include("${CMAKE_CURRENT_LIST_DIR}/simsycl-targets.cmake") diff --git a/include/simsycl/detail/config.hh b/include/simsycl/detail/config.hh deleted file mode 100644 index aa8abac..0000000 --- a/include/simsycl/detail/config.hh +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - -namespace simsycl::detail { - -class config { - public: - inline static uint32_t max_sub_group_size = 32; -}; - -template -class configure_temporarily { - public: - configure_temporarily(T &to_configure, T new_value) : m_to_configure(to_configure) { - m_old_value = to_configure; - to_configure = new_value; - } - ~configure_temporarily() { m_to_configure = m_old_value; } - - private: - T &m_to_configure; - T m_old_value; -}; - -} // namespace simsycl::detail diff --git a/include/simsycl/sycl/device.hh b/include/simsycl/sycl/device.hh index 3aa2f30..24549d5 100644 --- a/include/simsycl/sycl/device.hh +++ b/include/simsycl/sycl/device.hh @@ -16,7 +16,8 @@ namespace simsycl { struct device_config; -sycl::device create_device(sycl::platform &platform, const device_config &config); +sycl::device make_device(sycl::platform &platform, const device_config &config); +void set_parent_device(sycl::device &device, const sycl::device &parent); } // namespace simsycl @@ -102,7 +103,8 @@ class device final : public detail::reference_type template friend class detail::weak_ref; - friend device simsycl::create_device(sycl::platform &platform, const device_config &config); + friend device simsycl::make_device(sycl::platform &platform, const device_config &config); + friend void simsycl::set_parent_device(sycl::device &device, const sycl::device &parent); device(const detail::device_selector &selector); device(std::shared_ptr &&state) : reference_type(std::move(state)) {} diff --git a/include/simsycl/sycl/forward.hh b/include/simsycl/sycl/forward.hh index debc048..c6187b7 100644 --- a/include/simsycl/sycl/forward.hh +++ b/include/simsycl/sycl/forward.hh @@ -147,8 +147,8 @@ struct concurrent_sub_group; using device_selector = std::function; -sycl::sub_group make_sub_group( - const sycl::id<1> &, const sycl::range<1> &, const sycl::id<1> &, const sycl::range<1> &, concurrent_sub_group *); +sycl::sub_group make_sub_group(const sycl::id<1> &, const sycl::range<1> &, const sycl::range<1> &, const sycl::id<1> &, + const sycl::range<1> &, concurrent_sub_group *); concurrent_sub_group &get_concurrent_group(const sycl::sub_group &g); template diff --git a/include/simsycl/sycl/kernel.hh b/include/simsycl/sycl/kernel.hh index a38648b..ef704be 100644 --- a/include/simsycl/sycl/kernel.hh +++ b/include/simsycl/sycl/kernel.hh @@ -12,9 +12,15 @@ namespace simsycl::detail { struct kernel_state {}; -struct kernel_id_state {}; + +struct kernel_id_state { + std::string name; +}; + struct kernel_bundle_state {}; +sycl::kernel_id make_kernel_id(std::string name); + } // namespace simsycl::detail @@ -50,7 +56,12 @@ class kernel_id : public detail::reference_type @@ -211,3 +222,9 @@ struct std::hash> : public std::hash< simsycl::detail::reference_type, simsycl::detail::kernel_bundle_state>> { }; + +namespace simsycl::detail { + +inline sycl::kernel_id make_kernel_id(std::string name) { return sycl::kernel_id(std::move(name)); } + +} // namespace simsycl::detail diff --git a/include/simsycl/sycl/platform.hh b/include/simsycl/sycl/platform.hh index 6e418c2..692afc0 100644 --- a/include/simsycl/sycl/platform.hh +++ b/include/simsycl/sycl/platform.hh @@ -16,8 +16,8 @@ namespace simsycl { // forward struct platform_config; -sycl::platform create_platform(const platform_config &config); -sycl::device create_device(sycl::platform &platform, const device_config &config); +sycl::platform make_platform(const platform_config &config); +sycl::device make_device(sycl::platform &platform, const device_config &config); } // namespace simsycl @@ -59,8 +59,8 @@ class platform final : public detail::reference_type friend class detail::weak_ref; - friend sycl::platform simsycl::create_platform(const platform_config &config); - friend device simsycl::create_device(platform &platform, const device_config &config); + friend sycl::platform simsycl::make_platform(const platform_config &config); + friend device simsycl::make_device(platform &platform, const device_config &config); platform(const detail::device_selector &selector); platform(std::shared_ptr &&state) : reference_type(std::move(state)) {} diff --git a/include/simsycl/sycl/sub_group.hh b/include/simsycl/sycl/sub_group.hh index efa5211..5cf3d78 100644 --- a/include/simsycl/sycl/sub_group.hh +++ b/include/simsycl/sycl/sub_group.hh @@ -6,7 +6,6 @@ #include "type_traits.hh" #include "simsycl/detail/check.hh" -#include "simsycl/detail/config.hh" #include "simsycl/detail/group_operation_impl.hh" namespace simsycl::sycl { @@ -25,7 +24,7 @@ class sub_group { range_type get_local_range() const { return m_local_range; } - range_type get_max_local_range() const { return range<1>(detail::config::max_sub_group_size); } + range_type get_max_local_range() const { return m_max_local_range; } id_type get_group_id() const { return m_group_id; } @@ -96,18 +95,20 @@ class sub_group { private: id_type m_local_id; range_type m_local_range; + range_type m_max_local_range; id_type m_group_id; range_type m_group_range; detail::concurrent_sub_group *m_concurrent_group; // NOLINT - sub_group(const id_type &local_id, const range_type &local_range, const id_type &group_id, - const range_type &group_range, detail::concurrent_sub_group *concurrent_group) - : m_local_id(local_id), m_local_range(local_range), m_group_id(group_id), m_group_range(group_range), - m_concurrent_group(concurrent_group) {} + sub_group(const id_type &local_id, const range_type &local_range, const range_type &max_local_range, + const id_type &group_id, const range_type &group_range, detail::concurrent_sub_group *concurrent_group) + : m_local_id(local_id), m_local_range(local_range), m_max_local_range(max_local_range), m_group_id(group_id), + m_group_range(group_range), m_concurrent_group(concurrent_group) {} friend sycl::sub_group detail::make_sub_group(const sycl::id<1> &local_id, const sycl::range<1> &local_range, - const sycl::id<1> &group_id, const sycl::range<1> &group_range, detail::concurrent_sub_group *impl); + const sycl::range<1> &max_local_range, const sycl::id<1> &group_id, const sycl::range<1> &group_range, + detail::concurrent_sub_group *impl); friend detail::concurrent_sub_group &detail::get_concurrent_group(const sycl::sub_group &g); }; @@ -123,8 +124,9 @@ template<> struct is_sub_group : std::true_type {}; inline sycl::sub_group make_sub_group(const sycl::id<1> &local_id, const sycl::range<1> &local_range, - const sycl::id<1> &group_id, const sycl::range<1> &group_range, detail::concurrent_sub_group *impl) { - return sycl::sub_group(local_id, local_range, group_id, group_range, impl); + const sycl::range<1> &max_local_range, const sycl::id<1> &group_id, const sycl::range<1> &group_range, + detail::concurrent_sub_group *impl) { + return sycl::sub_group(local_id, local_range, max_local_range, group_id, group_range, impl); } } // namespace simsycl::detail diff --git a/include/simsycl/sycl/vec.hh b/include/simsycl/sycl/vec.hh index e5125d7..1ee31e7 100644 --- a/include/simsycl/sycl/vec.hh +++ b/include/simsycl/sycl/vec.hh @@ -242,7 +242,7 @@ class swizzled_vec { static constexpr bool allow_assign = !std::is_const_v && no_repeat_indices_v; static constexpr int num_elements = sizeof...(Indices); - static constexpr index_list indices = {}; + static constexpr index_list indices{}; public: using element_type = std::remove_const_t; diff --git a/include/simsycl/system.hh b/include/simsycl/system.hh index 0ee67dc..b48f7bd 100644 --- a/include/simsycl/system.hh +++ b/include/simsycl/system.hh @@ -1,10 +1,6 @@ #pragma once -#include "detail/check.hh" - #include "sycl/device.hh" -#include "sycl/exception.hh" -#include "sycl/kernel.hh" #include "sycl/platform.hh" #include "sycl/range.hh" @@ -13,6 +9,10 @@ namespace simsycl { +using platform_id = std::string; +using device_id = std::string; +using system_id = std::string; + struct device_config { sycl::info::device_type device_type{}; uint32_t vendor_id{}; @@ -78,18 +78,17 @@ struct device_config { std::vector execution_capabilities{}; bool queue_profiling{}; std::vector built_in_kernels{}; - std::vector built_in_kernel_ids{}; + std::vector built_in_kernel_ids{}; + simsycl::platform_id platform_id{}; std::string name{}; std::string vendor{}; std::string driver_version{}; - std::string profile{}; std::string version{}; std::string backend_version{}; std::vector aspects{}; std::vector extensions{}; size_t printf_buffer_size{}; - bool preferred_interop_user_sync{}; - std::optional parent_device{}; + std::optional parent_device_id{}; uint32_t partition_max_sub_devices{}; std::vector partition_properties{}; std::vector partition_affinity_domains{}; @@ -106,17 +105,25 @@ struct platform_config { }; struct system_config { - std::vector platforms{}; - std::vector devices{}; + std::unordered_map platforms{}; + std::unordered_map devices{}; }; -const system_config &get_system_config(); -void configure_system(system_config system); +extern const platform_config builtin_platform; +extern const device_config builtin_device; +extern const system_config builtin_system; + +const system_config &get_default_system_config(); +system_config read_system_config(const std::string &path_to_json_file); +void write_system_config(const std::string &path_to_json_file, const system_config &config); +void configure_system(const system_config &system); } // namespace simsycl namespace simsycl::detail { +const std::vector &get_platforms(); +const std::vector &get_devices(); sycl::device select_device(const device_selector &selector); } // namespace simsycl::detail diff --git a/include/simsycl/templates.hh b/include/simsycl/templates.hh deleted file mode 100644 index c0a9997..0000000 --- a/include/simsycl/templates.hh +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "sycl/forward.hh" - -namespace simsycl::templates::platform { - -extern const platform_config cuda_12_2; - -} - -namespace simsycl::templates::device::nvidia { - -extern const device_config rtx_3090; - -} diff --git a/src/simsycl/device.cc b/src/simsycl/device.cc index 3a5f6e9..f0606a8 100644 --- a/src/simsycl/device.cc +++ b/src/simsycl/device.cc @@ -1,14 +1,19 @@ #include "simsycl/sycl/device.hh" +#include "simsycl/sycl/exception.hh" +#include "simsycl/sycl/kernel.hh" #include "simsycl/sycl/range.hh" #include "simsycl/system.hh" +#include #include namespace simsycl::detail { struct device_state { device_config config; + size_t bytes_free = 0; weak_ref platform; + weak_ref parent; }; int default_selector::operator()(const sycl::device &device) const { @@ -358,7 +363,9 @@ SIMSYCL_STOP_IGNORING_DEPRECATIONS template<> std::vector device::get_info() const { - return state().config.built_in_kernel_ids; + std::vector ids; + for(auto &string_id : state().config.built_in_kernel_ids) { ids.push_back(detail::make_kernel_id(string_id)); } + return ids; } template<> @@ -383,7 +390,7 @@ std::string device::get_info() const { template<> std::string device::get_info() const { - return state().config.profile; + throw exception(errc::invalid, "not an OpenCL backend"); } template<> @@ -415,12 +422,15 @@ size_t device::get_info() const { template<> bool device::get_info() const { - return state().config.preferred_interop_user_sync; + throw exception(errc::invalid, "not an OpenCL backend"); } template<> sycl::device device::get_info() const { - return state().config.parent_device.value(); + const auto parent_instance = state().parent.lock(); + assert(parent_instance.has_value() == state().config.parent_device_id.has_value()); + if(!parent_instance.has_value()) { throw exception(errc::invalid, "not a sub-device"); } + return *parent_instance; } template<> @@ -460,9 +470,9 @@ bool device::has(aspect asp) const { } std::vector device::get_devices(info::device_type type) { - auto &system = get_system_config(); + auto &devices = detail::get_devices(); std::vector result; - std::copy_if(system.devices.begin(), system.devices.end(), std::back_inserter(result), + std::copy_if(devices.begin(), devices.end(), std::back_inserter(result), [type](const device &dev) { return dev.get_info() == type; }); return result; @@ -472,13 +482,18 @@ std::vector device::get_devices(info::device_type type) { namespace simsycl { -sycl::device create_device(sycl::platform &platform, const device_config &config) { +sycl::device make_device(sycl::platform &platform, const device_config &config) { auto state = std::make_shared(); state->config = config; state->platform = detail::weak_ref(platform); + state->bytes_free = config.global_mem_size; sycl::device device(std::move(state)); platform.add_device(device); return device; } +void set_parent_device(sycl::device &device, const sycl::device &parent) { + device.state().parent = detail::weak_ref(parent); +} + } // namespace simsycl diff --git a/src/simsycl/platform.cc b/src/simsycl/platform.cc index 16df6c8..6225e16 100644 --- a/src/simsycl/platform.cc +++ b/src/simsycl/platform.cc @@ -65,7 +65,7 @@ bool platform::has_extension(const std::string &extension) const { != state().config.extensions.end(); } -std::vector platform::get_platforms() { return get_system_config().platforms; } +std::vector platform::get_platforms() { return detail::get_platforms(); } void platform::add_device(const device &dev) { state().devices.push_back(dev); } @@ -73,7 +73,7 @@ void platform::add_device(const device &dev) { state().devices.push_back(dev); } namespace simsycl { -sycl::platform create_platform(const platform_config &config) { +sycl::platform make_platform(const platform_config &config) { auto state = std::make_shared(); state->config = std::move(config); return sycl::platform(std::move(state)); diff --git a/src/simsycl/schedule.cc b/src/simsycl/schedule.cc index ee5469f..5ebac56 100644 --- a/src/simsycl/schedule.cc +++ b/src/simsycl/schedule.cc @@ -36,10 +36,14 @@ template void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range &range, const std::vector &local_memory, const nd_kernel &kernel) // { + if(Dimensions > device.get_info()) { + throw sycl::exception(sycl::errc::nd_range, "Work item dimensionality exceeds device limit"); + } + const auto required_local_memory = std::accumulate(local_memory.begin(), local_memory.end(), size_t{0}, [](size_t sum, const local_memory_requirement &req) { return sum + req.size; }); if(required_local_memory > device.get_info()) { - throw sycl::exception(sycl::errc::accessor, "total required local memory exceeds device limit"); + throw sycl::exception(sycl::errc::accessor, "Total required local memory exceeds device limit"); } const auto &global_range = range.get_global_range(); @@ -51,13 +55,23 @@ void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range 0); - const auto sub_group_local_linear_range = config::max_sub_group_size; - const auto sub_group_local_range = sycl::range<1>(sub_group_local_linear_range); - assert(sub_group_local_linear_range > 0); - const auto sub_group_linear_range_in_group = detail::div_ceil(local_linear_range, sub_group_local_linear_range); + + if(local_linear_range > device.get_info() + || !all_true(local_range <= device.get_info>())) { + throw sycl::exception(sycl::errc::nd_range, "Work group size exceeds device limit"); + } + + const auto sub_group_max_local_linear_range = device.get_info().at(0); + const auto sub_group_max_local_range = sycl::range<1>(sub_group_max_local_linear_range); + assert(sub_group_max_local_linear_range > 0); + const auto sub_group_linear_range_in_group = detail::div_ceil(local_linear_range, sub_group_max_local_linear_range); const sycl::range<1> sub_group_range_in_group{sub_group_linear_range_in_group}; assert(sub_group_linear_range_in_group > 0); + if(sub_group_linear_range_in_group > device.get_info()) { + throw sycl::exception(sycl::errc::nd_range, "Number of sub-groups in work group exceeds device limit"); + } + // limit the number of concurrent groups to avoid allocating excessive numbers of fibers const size_t max_num_concurrent_groups = device.get_info(); const auto num_concurrent_groups = std::min(group_linear_range, max_num_concurrent_groups); @@ -84,8 +98,8 @@ void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range(sub_group_linear_id_in_group); const auto thread_id_in_sub_group = sycl::id<1>(thread_linear_id_in_sub_group); @@ -105,9 +119,10 @@ void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range(local_range) + local_id; + // if sub-group range is not divisible by local range, the last sub-group will be smaller + const auto sub_group_local_linear_range = std::min(sub_group_max_local_linear_range, + local_linear_range - sub_group_linear_id_in_group * sub_group_max_local_linear_range); + const auto sub_group_local_range = sycl::range<1>(sub_group_local_linear_range); + SIMSYCL_START_IGNORING_DEPRECATIONS; const auto global_item = detail::make_item(global_id, range.get_global_range(), range.get_offset()); SIMSYCL_STOP_IGNORING_DEPRECATIONS @@ -140,7 +160,8 @@ void dispatch_for_nd_range(const sycl::device &device, const sycl::nd_range +#include // std::endian +#include +#include #include #include #include +#include -namespace simsycl::detail { -sycl::device select_device(const device_selector &selector) { - auto &system = simsycl::get_system_config(); - SIMSYCL_CHECK(!system.devices.empty()); - int max_rating = std::numeric_limits::lowest(); - for(const auto &device : system.devices) { - if(int rating = selector(device); rating > max_rating) { max_rating = rating; } - } - if(max_rating < 0) { throw sycl::exception(sycl::errc::runtime, "No suitable device found"); } - const auto device = std::find_if(system.devices.begin(), system.devices.end(), - [&](const auto &device) { return selector(device) == max_rating; }); - assert(device != system.devices.end()); - return *device; -} +namespace simsycl::detail { class error_category : public std::error_category { const char *name() const noexcept override { return "sycl"; } @@ -119,31 +110,55 @@ struct memory_state { }; struct system_state { - system_config config; + std::vector platforms; + std::vector devices; std::unordered_map device_bytes_free; std::set usm_allocations; - explicit system_state(system_config config) : config(std::move(config)) { - for(const auto &device : this->config.devices) { - device_bytes_free.emplace(device, device.get_info()); + explicit system_state(const system_config &config) { + std::unordered_map platforms_by_id; + for(const auto &[id, platform_config] : config.platforms) { + platforms_by_id.emplace(id, make_platform(platform_config)); } + std::unordered_map devices_by_id; + for(const auto &[id, device_config] : config.devices) { + auto &platform = platforms_by_id.at(device_config.platform_id); + devices_by_id.emplace(id, make_device(platform, device_config)); + } + for(const auto &[id, device_config] : config.devices) { + if(device_config.parent_device_id.has_value()) { + set_parent_device(devices_by_id.at(id), devices_by_id.at(*device_config.parent_device_id)); + } + } + for(auto &[_, platform] : platforms_by_id) { platforms.push_back(std::move(platform)); } + for(auto &[_, device] : devices_by_id) { devices.push_back(std::move(device)); } } }; std::optional system; system_state &get_system() { - if(!detail::system.has_value()) { - system_config config; - auto platform = config.platforms.emplace_back(create_platform(simsycl::templates::platform::cuda_12_2)); - for(int i = 0; i < 4; ++i) { - config.devices.push_back(create_device(platform, simsycl::templates::device::nvidia::rtx_3090)); - } - configure_system(std::move(config)); - } + if(!system.has_value()) { system.emplace(get_default_system_config()); } return system.value(); } +const std::vector &get_platforms() { return get_system().platforms; } +const std::vector &get_devices() { return get_system().devices; } + +sycl::device select_device(const device_selector &selector) { + auto &devices = get_devices(); + SIMSYCL_CHECK(!devices.empty()); + int max_rating = std::numeric_limits::lowest(); + for(const auto &device : devices) { + if(int rating = selector(device); rating > max_rating) { max_rating = rating; } + } + if(max_rating < 0) { throw sycl::exception(sycl::errc::runtime, "No suitable device found"); } + const auto device = std::find_if( + devices.begin(), devices.end(), [&](const sycl::device &device) { return selector(device) == max_rating; }); + assert(device != devices.end()); + return *device; +} + void *usm_alloc(const sycl::context &context, sycl::usm::alloc kind, std::optional device, size_t size_bytes, size_t alignment_bytes) { SIMSYCL_CHECK(kind != sycl::usm::alloc::unknown); @@ -160,6 +175,10 @@ void *usm_alloc(const sycl::context &context, sycl::usm::alloc kind, std::option throw sycl::exception(sycl::errc::invalid, "Device not associated with context"); } + if(size_bytes > device->get_info()) { + throw sycl::exception(sycl::errc::memory_allocation, "Allocation size exceeds device limit"); + } + bytes_free = &system.device_bytes_free.at(*device); if(*bytes_free < size_bytes) { throw sycl::exception(sycl::errc::memory_allocation, "Not enough memory available"); @@ -246,10 +265,164 @@ device get_pointer_device(const void *ptr, const context &sycl_context) { } // namespace simsycl::sycl +namespace simsycl::detail { + +bool g_environment_parsed = false; +std::optional g_env_config; + +void parse_environment() { + if(g_environment_parsed) return; + + auto prefix = env::prefix("SIMSYCL"); + const auto config = prefix.register_variable("CONFIG"); + if(const auto parsed = prefix.parse_and_validate(); parsed.ok()) { + g_env_config = parsed.get(config); + } else { + std::cerr << parsed.warning_message() << parsed.error_message(); + } + g_environment_parsed = true; +} + +std::optional g_default_system_config; + +} // namespace simsycl::detail + namespace simsycl { -const system_config &get_system_config() { return detail::get_system().config; } +const system_config &get_default_system_config() { + if(!detail::g_default_system_config.has_value()) { + detail::parse_environment(); + if(detail::g_env_config.has_value()) { + detail::g_default_system_config.emplace(read_system_config(*detail::g_env_config)); + } else { + detail::g_default_system_config.emplace(builtin_system); + } + } + return detail::g_default_system_config.value(); +} + +void configure_system(const system_config &system) { detail::system.emplace(system); } + +const platform_config builtin_platform{ + .version = "0.1", + .name = "SimSYCL", + .vendor = "SimSYCL", + .extensions = {}, +}; -void configure_system(system_config system) { detail::system.emplace(std::move(system)); } +// clang-format off +const device_config builtin_device { + .device_type = sycl::info::device_type::gpu, // + .vendor_id = 0, // + .max_compute_units = 16, // + .max_work_item_dimensions = 3, // + .max_work_item_sizes_1 = {1024}, // + .max_work_item_sizes_2 = {1024, 1024}, // + .max_work_item_sizes_3 = {1024, 1024, 1024}, // + .max_work_group_size = 1024, // + .max_num_sub_groups = 32, + .sub_group_sizes = {32}, + .preferred_vector_width_char = 4, + .preferred_vector_width_short = 2, + .preferred_vector_width_int = 1, + .preferred_vector_width_long = 1, + .preferred_vector_width_float = 1, + .preferred_vector_width_double = 1, + .preferred_vector_width_half = 2, + .native_vector_width_char = 4, + .native_vector_width_short = 2, + .native_vector_width_int = 1, + .native_vector_width_long = 1, + .native_vector_width_float = 1, + .native_vector_width_double = 1, + .native_vector_width_half = 2, + .max_clock_frequency = 1000, + .address_bits = 64, + .max_mem_alloc_size = std::numeric_limits::max(), + .image_support = false, + .max_read_image_args = 0, + .max_write_image_args = 0, + .image2d_max_height = 0, + .image2d_max_width = 0, + .image3d_max_height = 0, + .image3d_max_width = 0, + .image3d_max_depth = 0, + .image_max_buffer_size = 0, + .max_samplers = 0, + .max_parameter_size = std::numeric_limits::max(), + .mem_base_addr_align = 8 * sizeof(sycl::long16), + .half_fp_config +#if SIMSYCL_FEATURE_HALF_TYPE + = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, + sycl::info::fp_config::fma, sycl::info::fp_config::correctly_rounded_divide_sqrt}, +#else + = {}, +#endif + .single_fp_config = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, + sycl::info::fp_config::fma, sycl::info::fp_config::correctly_rounded_divide_sqrt}, + .double_fp_config = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, + sycl::info::fp_config::fma, sycl::info::fp_config::correctly_rounded_divide_sqrt}, + .global_mem_cache_type = sycl::info::global_mem_cache_type::read_write, + .global_mem_cache_line_size = 64, + .global_mem_cache_size = 16 << 20, + .global_mem_size = std::numeric_limits::max(), + .max_constant_buffer_size = 1 << 16, + .max_constant_args = std::numeric_limits::max(), + .local_mem_type = sycl::info::local_mem_type::local, + .local_mem_size = 64 << 10, + .error_correction_support = false, + .host_unified_memory = true, + .atomic_memory_order_capabilities = {sycl::memory_order::relaxed, sycl::memory_order::acquire, + sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}, + .atomic_fence_order_capabilities = {sycl::memory_order::relaxed, sycl::memory_order::acquire, + sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}, + .atomic_memory_scope_capabilities = {sycl::memory_scope::work_item, + sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, + sycl::memory_scope::system }, + .atomic_fence_scope_capabilities = {sycl::memory_scope::work_item, + sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, + sycl::memory_scope::system }, + .profiling_timer_resolution = 1, + .is_endian_little = std::endian::native == std::endian::little, + .is_available = true, + .is_compiler_available = false, + .is_linker_available = false, + .execution_capabilities = {sycl::info::execution_capability::exec_kernel}, + .queue_profiling = true, + .built_in_kernels = {}, + .platform_id = "SimSYCL", + .name = "SimSYCL virtual GPU", + .vendor = "SimSYCL", + .driver_version = "0.1", + .version = "0.1", + .aspects = { sycl::aspect::gpu, sycl::aspect::accelerator, sycl::aspect::emulated, + sycl::aspect::host_debuggable, +#if SIMSYCL_FEATURE_HALF_TYPE + sycl::aspect::fp16, +#endif + sycl::aspect::fp64, sycl::aspect::atomic64, sycl::aspect::queue_profiling, + sycl::aspect::usm_device_allocations, sycl::aspect::usm_host_allocations, + sycl::aspect::usm_atomic_host_allocations, sycl::aspect::usm_shared_allocations, + sycl::aspect::usm_atomic_shared_allocations, sycl::aspect::usm_system_allocations }, + .extensions = { + "cl_khr_int64_base_atomics", + "cl_khr_int64_extended_atomics", +#if SIMSYCL_FEATURE_HALF_TYPE + "cl_khr_fp16", +#endif + }, + .printf_buffer_size = std::numeric_limits::max(), + .partition_max_sub_devices = 0, + .partition_properties = {}, + .partition_affinity_domains = {sycl::info::partition_affinity_domain::not_applicable}, + .partition_type_property = sycl::info::partition_property::no_partition, + .partition_type_affinity_domain = sycl::info::partition_affinity_domain::not_applicable, +}; +// clang-format off + +const system_config builtin_system{ + .platforms = {{"SimSYCL", builtin_platform}}, + .devices = {{"GPU", builtin_device}}, +}; } // namespace simsycl diff --git a/src/simsycl/system_config.cc b/src/simsycl/system_config.cc new file mode 100644 index 0000000..08b964b --- /dev/null +++ b/src/simsycl/system_config.cc @@ -0,0 +1,370 @@ +#include + +#include +#include + +#include + + +namespace nlohmann { + +template +struct adl_serializer> { + static void to_json(json &j, const std::optional &opt) { + if(opt.has_value()) { + j = *opt; + } else { + j = nullptr; + } + } + + static void from_json(const json &j, std::optional &opt) { + if(j.is_null()) { + opt = std::nullopt; + } else { + opt = j.template get(); + } + } +}; + +} // namespace nlohmann + +namespace simsycl::sycl { // nlohmann_json requires us to invoke SERIALIZE_ENUM in this namespace + +NLOHMANN_JSON_SERIALIZE_ENUM(aspect, + { + {aspect::cpu, "cpu"}, + {aspect::gpu, "gpu"}, + {aspect::accelerator, "accelerator"}, + {aspect::custom, "custom"}, + {aspect::emulated, "emulated"}, + {aspect::host_debuggable, "host_debuggable"}, + {aspect::fp16, "fp16"}, + {aspect::fp64, "fp64"}, + {aspect::atomic64, "atomic64"}, + {aspect::image, "image"}, + {aspect::online_compiler, "online_compiler"}, + {aspect::online_linker, "online_linker"}, + {aspect::queue_profiling, "queue_profiling"}, + {aspect::usm_device_allocations, "usm_device_allocations"}, + {aspect::usm_host_allocations, "usm_host_allocations"}, + {aspect::usm_atomic_host_allocations, "usm_atomic_host_allocations"}, + {aspect::usm_shared_allocations, "usm_shared_allocations"}, + {aspect::usm_atomic_shared_allocations, "usm_atomic_shared_allocations"}, + {aspect::usm_system_allocations, "usm_system_allocations"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(memory_order, + { + {memory_order::relaxed, "relaxed"}, + {memory_order::acquire, "acquire"}, + {memory_order::release, "release"}, + {memory_order::acq_rel, "acq_rel"}, + {memory_order::seq_cst, "seq_cst"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(memory_scope, + { + {memory_scope::work_item, "work_item"}, + {memory_scope::sub_group, "sub_group"}, + {memory_scope::work_group, "work_group"}, + {memory_scope::device, "device"}, + {memory_scope::system, "system"}, + }) + +} // namespace simsycl::sycl + +namespace simsycl::sycl::info { // nlohmann_json requires us to invoke SERIALIZE_ENUM in this namespace + +NLOHMANN_JSON_SERIALIZE_ENUM(device_type, + { + {device_type::cpu, "cpu"}, + {device_type::gpu, "gpu"}, + {device_type::accelerator, "accelerator"}, + {device_type::custom, "custom"}, + {device_type::automatic, "automatic"}, + {device_type::host, "host"}, + {device_type::all, "all"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(partition_property, + { + {partition_property::no_partition, "no_partition"}, + {partition_property::partition_equally, "partition_equally"}, + {partition_property::partition_by_counts, "partition_by_counts"}, + {partition_property::partition_by_affinity_domain, "partition_by_affinity_domain"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(partition_affinity_domain, + { + {partition_affinity_domain::not_applicable, "not_applicable"}, + {partition_affinity_domain::numa, "numa"}, + {partition_affinity_domain::L4_cache, "L4_cache,"}, + {partition_affinity_domain::L3_cache, "L3_cache"}, + {partition_affinity_domain::L2_cache, "L2_cache"}, + {partition_affinity_domain::L1_cache, "L1_cache"}, + {partition_affinity_domain::next_partitionable, "next_partitionable"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(local_mem_type, + { + {local_mem_type::none, "none"}, + {local_mem_type::local, "local"}, + {local_mem_type::global, "global"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(fp_config, + { + {fp_config::denorm, "denorm"}, + {fp_config::inf_nan, "inf_nan"}, + {fp_config::round_to_nearest, "round_to_nearest"}, + {fp_config::round_to_zero, "round_to_zero"}, + {fp_config::round_to_inf, "round_to_inf"}, + {fp_config::fma, "fma"}, + {fp_config::correctly_rounded_divide_sqrt, "correctly_rounded_divide_sqrt"}, + {fp_config::soft_float, "soft_float"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(global_mem_cache_type, + { + {global_mem_cache_type::none, "none"}, + {global_mem_cache_type::read_only, "read_only"}, + {global_mem_cache_type::read_write, "read_write"}, + }) + +NLOHMANN_JSON_SERIALIZE_ENUM(execution_capability, + { + {execution_capability::exec_kernel, "exec_kernel"}, + {execution_capability::exec_native_kernel, "exec_native_kernel"}, + }) + +} // namespace simsycl::sycl::info + +namespace simsycl::detail { + +template +void to_json(nlohmann::json &json, const coordinate &coord) { + std::array array; + for(int d = 0; d < Dimensions; ++d) { array[d] = coord[d]; } + json = nlohmann::json(array); +} + +template +void from_json(const nlohmann::json &json, coordinate &coord) { + for(int d = 0; d < Dimensions; ++d) { coord[d] = json.at(d); } +} + +} // namespace simsycl::detail + +namespace simsycl { + +void to_json(nlohmann::json &json, const platform_config &platform) { + json = { + {"profile", platform.profile}, + {"version", platform.version}, + {"name", platform.name}, + {"vendor", platform.vendor}, + {"extensions", platform.extensions}, + }; +}; + +void from_json(const nlohmann::json &json, platform_config &platform) { + json.at("profile").get_to(platform.profile); + json.at("version").get_to(platform.version); + json.at("name").get_to(platform.name); + json.at("vendor").get_to(platform.vendor); + json.at("extensions").get_to(platform.extensions); +}; + +void to_json(nlohmann::json &json, const device_config &device) { + json = { + {"device_type", device.device_type}, + {"vendor_id", device.vendor_id}, + {"max_compute_units", device.max_compute_units}, + {"max_work_item_dimensions", device.max_work_item_dimensions}, + {"max_work_item_sizes<1>", device.max_work_item_sizes_1}, + {"max_work_item_sizes<2>", device.max_work_item_sizes_2}, + {"max_work_item_sizes<3>", device.max_work_item_sizes_3}, + {"max_work_group_size", device.max_work_group_size}, + {"max_num_sub_groups", device.max_num_sub_groups}, + {"sub_group_sizes", device.sub_group_sizes}, + {"preferred_vector_width_char", device.preferred_vector_width_char}, + {"preferred_vector_width_short", device.preferred_vector_width_short}, + {"preferred_vector_width_int", device.preferred_vector_width_int}, + {"preferred_vector_width_long", device.preferred_vector_width_long}, + {"preferred_vector_width_float", device.preferred_vector_width_float}, + {"preferred_vector_width_double", device.preferred_vector_width_double}, + {"preferred_vector_width_half", device.preferred_vector_width_half}, + {"native_vector_width_char", device.native_vector_width_char}, + {"native_vector_width_short", device.native_vector_width_short}, + {"native_vector_width_int", device.native_vector_width_int}, + {"native_vector_width_long", device.native_vector_width_long}, + {"native_vector_width_float", device.native_vector_width_float}, + {"native_vector_width_double", device.native_vector_width_double}, + {"native_vector_width_half", device.native_vector_width_half}, + {"max_clock_frequency", device.max_clock_frequency}, + {"address_bits", device.address_bits}, + {"max_mem_alloc_size", device.max_mem_alloc_size}, + {"image_support", device.image_support}, + {"max_read_image_args", device.max_read_image_args}, + {"max_write_image_args", device.max_write_image_args}, + {"image2d_max_height", device.image2d_max_height}, + {"image2d_max_width", device.image2d_max_width}, + {"image3d_max_height", device.image3d_max_height}, + {"image3d_max_width", device.image3d_max_width}, + {"image3d_max_depth", device.image3d_max_depth}, + {"image_max_buffer_size", device.image_max_buffer_size}, + {"max_samplers", device.max_samplers}, + {"max_parameter_size", device.max_parameter_size}, + {"mem_base_addr_align", device.mem_base_addr_align}, + {"half_fp_config", device.half_fp_config}, + {"single_fp_config", device.single_fp_config}, + {"double_fp_config", device.double_fp_config}, + {"global_mem_cache_type", device.global_mem_cache_type}, + {"global_mem_cache_line_size", device.global_mem_cache_line_size}, + {"global_mem_cache_size", device.global_mem_cache_size}, + {"global_mem_size", device.global_mem_size}, + {"max_constant_buffer_size", device.max_constant_buffer_size}, + {"max_constant_args", device.max_constant_args}, + {"local_mem_type", device.local_mem_type}, + {"local_mem_size", device.local_mem_size}, + {"error_correction_support", device.error_correction_support}, + {"host_unified_memory", device.host_unified_memory}, + {"atomic_memory_order_capabilities", device.atomic_memory_order_capabilities}, + {"atomic_fence_order_capabilities", device.atomic_fence_order_capabilities}, + {"atomic_memory_scope_capabilities", device.atomic_memory_scope_capabilities}, + {"atomic_fence_scope_capabilities", device.atomic_fence_scope_capabilities}, + {"profiling_timer_resolution", device.profiling_timer_resolution}, + {"is_endian_little", device.is_endian_little}, + {"is_available", device.is_available}, + {"is_compiler_available", device.is_compiler_available}, + {"is_linker_available", device.is_linker_available}, + {"execution_capabilities", device.execution_capabilities}, + {"queue_profiling", device.queue_profiling}, + {"built_in_kernels", device.built_in_kernels}, + {"built_in_kernel_ids", device.built_in_kernel_ids}, + {"platform_id", device.platform_id}, + {"name", device.name}, + {"vendor", device.vendor}, + {"driver_version", device.driver_version}, + {"version", device.version}, + {"backend_version", device.backend_version}, + {"aspects", device.aspects}, + {"extensions", device.extensions}, + {"printf_buffer_size", device.printf_buffer_size}, + {"parent_device_id", device.parent_device_id}, + {"partition_max_sub_devices", device.partition_max_sub_devices}, + {"partition_properties", device.partition_properties}, + {"partition_affinity_domains", device.partition_affinity_domains}, + {"partition_type_property", device.partition_type_property}, + {"partition_type_affinity_domain", device.partition_type_affinity_domain}, + }; +}; + +void from_json(const nlohmann::json &json, device_config &device) { + json.at("device_type").get_to(device.device_type); + json.at("vendor_id").get_to(device.vendor_id); + json.at("max_compute_units").get_to(device.max_compute_units); + json.at("max_work_item_dimensions").get_to(device.max_work_item_dimensions); + json.at("max_work_item_sizes<1>").get_to(device.max_work_item_sizes_1); + json.at("max_work_item_sizes<2>").get_to(device.max_work_item_sizes_2); + json.at("max_work_item_sizes<3>").get_to(device.max_work_item_sizes_3); + json.at("max_work_group_size").get_to(device.max_work_group_size); + json.at("max_num_sub_groups").get_to(device.max_num_sub_groups); + json.at("sub_group_sizes").get_to(device.sub_group_sizes); + json.at("preferred_vector_width_char").get_to(device.preferred_vector_width_char); + json.at("preferred_vector_width_short").get_to(device.preferred_vector_width_short); + json.at("preferred_vector_width_int").get_to(device.preferred_vector_width_int); + json.at("preferred_vector_width_long").get_to(device.preferred_vector_width_long); + json.at("preferred_vector_width_float").get_to(device.preferred_vector_width_float); + json.at("preferred_vector_width_double").get_to(device.preferred_vector_width_double); + json.at("preferred_vector_width_half").get_to(device.preferred_vector_width_half); + json.at("native_vector_width_char").get_to(device.native_vector_width_char); + json.at("native_vector_width_short").get_to(device.native_vector_width_short); + json.at("native_vector_width_int").get_to(device.native_vector_width_int); + json.at("native_vector_width_long").get_to(device.native_vector_width_long); + json.at("native_vector_width_float").get_to(device.native_vector_width_float); + json.at("native_vector_width_double").get_to(device.native_vector_width_double); + json.at("native_vector_width_half").get_to(device.native_vector_width_half); + json.at("max_clock_frequency").get_to(device.max_clock_frequency); + json.at("address_bits").get_to(device.address_bits); + json.at("max_mem_alloc_size").get_to(device.max_mem_alloc_size); + json.at("image_support").get_to(device.image_support); + json.at("max_read_image_args").get_to(device.max_read_image_args); + json.at("max_write_image_args").get_to(device.max_write_image_args); + json.at("image2d_max_height").get_to(device.image2d_max_height); + json.at("image2d_max_width").get_to(device.image2d_max_width); + json.at("image3d_max_height").get_to(device.image3d_max_height); + json.at("image3d_max_width").get_to(device.image3d_max_width); + json.at("image3d_max_depth").get_to(device.image3d_max_depth); + json.at("image_max_buffer_size").get_to(device.image_max_buffer_size); + json.at("max_samplers").get_to(device.max_samplers); + json.at("max_parameter_size").get_to(device.max_parameter_size); + json.at("mem_base_addr_align").get_to(device.mem_base_addr_align); + json.at("half_fp_config").get_to(device.half_fp_config); + json.at("single_fp_config").get_to(device.single_fp_config); + json.at("double_fp_config").get_to(device.double_fp_config); + json.at("global_mem_cache_type").get_to(device.global_mem_cache_type); + json.at("global_mem_cache_line_size").get_to(device.global_mem_cache_line_size); + json.at("global_mem_cache_size").get_to(device.global_mem_cache_size); + json.at("global_mem_size").get_to(device.global_mem_size); + json.at("max_constant_buffer_size").get_to(device.max_constant_buffer_size); + json.at("max_constant_args").get_to(device.max_constant_args); + json.at("local_mem_type").get_to(device.local_mem_type); + json.at("local_mem_size").get_to(device.local_mem_size); + json.at("error_correction_support").get_to(device.error_correction_support); + json.at("host_unified_memory").get_to(device.host_unified_memory); + json.at("atomic_memory_order_capabilities").get_to(device.atomic_memory_order_capabilities); + json.at("atomic_fence_order_capabilities").get_to(device.atomic_fence_order_capabilities); + json.at("atomic_memory_scope_capabilities").get_to(device.atomic_memory_scope_capabilities); + json.at("atomic_fence_scope_capabilities").get_to(device.atomic_fence_scope_capabilities); + json.at("profiling_timer_resolution").get_to(device.profiling_timer_resolution); + json.at("is_endian_little").get_to(device.is_endian_little); + json.at("is_available").get_to(device.is_available); + json.at("is_compiler_available").get_to(device.is_compiler_available); + json.at("is_linker_available").get_to(device.is_linker_available); + json.at("execution_capabilities").get_to(device.execution_capabilities); + json.at("queue_profiling").get_to(device.queue_profiling); + json.at("built_in_kernels").get_to(device.built_in_kernels); + json.at("built_in_kernel_ids").get_to(device.built_in_kernel_ids); + json.at("platform_id").get_to(device.platform_id); + json.at("name").get_to(device.name); + json.at("vendor").get_to(device.vendor); + json.at("driver_version").get_to(device.driver_version); + json.at("version").get_to(device.version); + json.at("backend_version").get_to(device.backend_version); + json.at("aspects").get_to(device.aspects); + json.at("extensions").get_to(device.extensions); + json.at("printf_buffer_size").get_to(device.printf_buffer_size); + json.at("parent_device_id").get_to(device.parent_device_id); + json.at("partition_max_sub_devices").get_to(device.partition_max_sub_devices); + json.at("partition_properties").get_to(device.partition_properties); + json.at("partition_affinity_domains").get_to(device.partition_affinity_domains); + json.at("partition_type_property").get_to(device.partition_type_property); + json.at("partition_type_affinity_domain").get_to(device.partition_type_affinity_domain); +}; + +void to_json(nlohmann::json &json, const system_config &system) { + json = { + {"platforms", system.platforms}, + {"devices", system.devices}, + }; +}; + +void from_json(const nlohmann::json &json, system_config &system) { + json.at("platforms").get_to(system.platforms); + json.at("devices").get_to(system.devices); +}; + +system_config read_system_config(const std::string &path_to_json_file) { + std::ifstream ifs(path_to_json_file); + return nlohmann::json::parse(ifs).get(); +} + +void write_system_config(const std::string &path_to_json_file, const system_config &config) { + nlohmann::json json; + to_json(json, config); + std::ofstream(path_to_json_file) << std::setw(4) << json; +} + +} // namespace simsycl diff --git a/src/simsycl/templates.cc b/src/simsycl/templates.cc deleted file mode 100644 index 75d8653..0000000 --- a/src/simsycl/templates.cc +++ /dev/null @@ -1,109 +0,0 @@ -#include "simsycl/templates.hh" -#include "simsycl/sycl/device.hh" -#include "simsycl/sycl/platform.hh" -#include "simsycl/system.hh" - - -namespace simsycl::templates::platform { - -const platform_config cuda_12_2{ - .version = "12.2.0", - .name = "CUDA", - .vendor = "NVIDIA", - .extensions = {}, -}; - -} - -namespace simsycl::templates::device::nvidia { - -const device_config rtx_3090{ - .device_type = sycl::info::device_type::gpu, - .vendor_id = 4318, - .max_compute_units = 82, - .max_work_item_dimensions = 3, - .max_work_item_sizes_1 = {1024}, - .max_work_item_sizes_2 = {1024, 1024}, - .max_work_item_sizes_3 = {64, 1024, 1024}, - .max_work_group_size = 1024, - .max_num_sub_groups = 32, - .sub_group_sizes = {32}, - .preferred_vector_width_char = 4, - .preferred_vector_width_short = 2, - .preferred_vector_width_int = 1, - .preferred_vector_width_long = 1, - .preferred_vector_width_float = 1, - .preferred_vector_width_double = 1, - .preferred_vector_width_half = 2, - .native_vector_width_char = 4, - .native_vector_width_short = 2, - .native_vector_width_int = 1, - .native_vector_width_long = 1, - .native_vector_width_float = 1, - .native_vector_width_double = 1, - .native_vector_width_half = 2, - .max_clock_frequency = 1695, - .address_bits = 64, - .max_mem_alloc_size = 25438126080ull, - .image_support = false, - .max_read_image_args = 0, - .max_write_image_args = 0, - .image2d_max_height = 0, - .image2d_max_width = 0, - .image3d_max_height = 0, - .image3d_max_width = 0, - .image3d_max_depth = 0, - .image_max_buffer_size = 0, - .max_samplers = 0, - .max_parameter_size = 18446744073709551615ull, - .mem_base_addr_align = 8, - .half_fp_config - = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, sycl::info::fp_config::round_to_nearest, - sycl::info::fp_config::round_to_zero, sycl::info::fp_config::round_to_inf, sycl::info::fp_config::fma, - sycl::info::fp_config::correctly_rounded_divide_sqrt}, - .single_fp_config - = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, sycl::info::fp_config::round_to_nearest, - sycl::info::fp_config::round_to_zero, sycl::info::fp_config::round_to_inf, sycl::info::fp_config::fma, - sycl::info::fp_config::correctly_rounded_divide_sqrt}, - .double_fp_config - = {sycl::info::fp_config::denorm, sycl::info::fp_config::inf_nan, sycl::info::fp_config::round_to_nearest, - sycl::info::fp_config::round_to_zero, sycl::info::fp_config::round_to_inf, sycl::info::fp_config::fma, - sycl::info::fp_config::correctly_rounded_divide_sqrt}, - .global_mem_cache_type = sycl::info::global_mem_cache_type::read_write, - .global_mem_cache_line_size = 128, - .global_mem_cache_size = 6291456, - .global_mem_size = 25438126080, - .max_constant_buffer_size = 65536, - .max_constant_args = 4294967295, - .local_mem_type = sycl::info::local_mem_type::local, - .local_mem_size = 49152, - .error_correction_support = false, - .host_unified_memory = false, - .profiling_timer_resolution = 1, - .is_endian_little = true, - .is_available = true, - .is_compiler_available = true, - .is_linker_available = true, - .execution_capabilities = {sycl::info::execution_capability::exec_kernel}, - .queue_profiling = true, - .built_in_kernels = {}, - .name = "NVIDIA GeForce RTX 3090", - .vendor = "NVIDIA", - .driver_version = "12010", - .profile = "FULL_PROFILE", - .version = "sm_86", - .aspects - = { sycl::aspect::gpu, sycl::aspect::accelerator, sycl::aspect::fp64, sycl::aspect::atomic64, - sycl::aspect::queue_profiling, sycl::aspect::usm_device_allocations, sycl::aspect::usm_host_allocations, - sycl::aspect::usm_shared_allocations, }, - .extensions = {}, - .printf_buffer_size = 18446744073709551615ull, - .preferred_interop_user_sync = true, - .partition_max_sub_devices = 0, - .partition_properties = {}, - .partition_affinity_domains = {sycl::info::partition_affinity_domain::not_applicable}, - .partition_type_property = sycl::info::partition_property::no_partition, - .partition_type_affinity_domain = sycl::info::partition_affinity_domain::not_applicable, -}; - -} // namespace simsycl::templates::device::nvidia diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fdbfc1f..d6604c1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -9,6 +9,8 @@ FetchContent_Declare( FetchContent_MakeAvailable(Catch2) add_executable(tests + test_utils.hh + test_setup.cc ambiguity_tests.cc group_op_tests.cc hierarchical_tests.cc @@ -18,6 +20,7 @@ add_executable(tests reduction_tests.cc vec_tests.cc ) + target_link_libraries(tests PRIVATE Catch2::Catch2WithMain simsycl) set_simsycl_target_options(tests) diff --git a/test/group_op_tests.cc b/test/group_op_tests.cc index 38a9bd2..b993632 100644 --- a/test/group_op_tests.cc +++ b/test/group_op_tests.cc @@ -2,6 +2,8 @@ #include +#include "test_utils.hh" + using namespace simsycl; template @@ -58,7 +60,7 @@ TEST_CASE("Group barriers behave as expected", "[group_op]") { } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 2u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {2u}; }); sycl::queue().submit([&actual](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{4, 4}, [&actual](sycl::nd_item<1> it) { const auto &sg = it.get_sub_group(); @@ -105,7 +107,7 @@ TEST_CASE("Group broadcasts behave as expected", "[group_op][broadcast]") { } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&actual](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&actual](sycl::nd_item<1> it) { actual[it.get_global_linear_id()] @@ -134,7 +136,7 @@ TEST_CASE("Group joint_any_of behaves as expected", "[group_op][joint_any_of]") } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { CHECK(sycl::joint_any_of(it.get_sub_group(), inputs, inputs + 4, [](int i) { return i == 3; })); @@ -173,7 +175,7 @@ TEST_CASE("Group any_of_group behaves as expected", "[group_op][any_of_group]") } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { auto id = it.get_sub_group().get_local_linear_id(); @@ -201,7 +203,7 @@ TEST_CASE("Group joint_all_of behaves as expected", "[group_op][joint_all_of]") } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { CHECK(sycl::joint_all_of(it.get_sub_group(), inputs, inputs + 4, [](int i) { return i <= 4; })); @@ -240,7 +242,7 @@ TEST_CASE("Group all_of_group behaves as expected", "[group_op][all_of_group]") } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { auto id = it.get_sub_group().get_local_linear_id(); @@ -268,7 +270,7 @@ TEST_CASE("Group joint_none_of behaves as expected", "[group_op][joint_none_of]" } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { CHECK(sycl::joint_none_of(it.get_sub_group(), inputs, inputs + 4, [](int i) { return i > 4; })); @@ -307,7 +309,7 @@ TEST_CASE("Group none_of_group behaves as expected", "[group_op][none_of_group]" } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { auto id = it.get_sub_group().get_local_linear_id(); @@ -322,7 +324,7 @@ TEST_CASE("Group none_of_group behaves as expected", "[group_op][none_of_group]" TEST_CASE("Group shift operation behave as expected", "[group_op][shift]") { int inputs[4] = {1, 2, 3, 4}; - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); SECTION("Left shift") { sycl::queue().submit([&inputs](sycl::handler &cgh) { @@ -356,7 +358,7 @@ TEST_CASE("Group shift operation behave as expected", "[group_op][shift]") { TEST_CASE("Group permute behaves as expected", "[group_op][permute]") { int inputs[4] = {1, 2, 3, 4}; - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { @@ -375,7 +377,7 @@ TEST_CASE("Group permute behaves as expected", "[group_op][permute]") { TEST_CASE("Group select behaves as expected", "[group_op][select]") { int inputs[4] = {1, 2, 3, 4}; - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { @@ -404,7 +406,7 @@ TEST_CASE("Group joint_reduce behaves as expected", "[group_op][joint_reduce]") } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { CHECK(sycl::joint_reduce(it.get_sub_group(), inputs, inputs + 4, sycl::plus{}) == 10); @@ -436,7 +438,7 @@ TEST_CASE("Group reduce_over_group behaves as expected", "[group_op][reduce_over } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { auto id = it.get_sub_group().get_local_linear_id(); @@ -476,7 +478,7 @@ TEST_CASE("Group joint scans behave as expected", "[group_op][joint_exclusive_sc } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { std::vector outputs = {0, 0, 0, 0}; @@ -546,7 +548,7 @@ TEST_CASE("Group scans behave as expected", "[group_op][exclusive_scan_over_grou } SECTION("For subgroups") { - detail::configure_temporarily cfg{detail::config::max_sub_group_size, 4u}; + test::configure_device_with([](device_config &dev) { dev.sub_group_sizes = {4u}; }); sycl::queue().submit([&inputs](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<1>{8, 8}, [&inputs](sycl::nd_item<1> it) { const auto id = it.get_sub_group().get_local_linear_id(); diff --git a/test/launch_tests.cc b/test/launch_tests.cc index 4c1ad03..a85d342 100644 --- a/test/launch_tests.cc +++ b/test/launch_tests.cc @@ -1,9 +1,11 @@ -#include -#include +#include "test_utils.hh" +#include #include -#include -#include + +#include +#include +#include using namespace simsycl; @@ -81,25 +83,53 @@ TEMPLATE_TEST_CASE_SIG( TEST_CASE( "parallel_for(nd_range) correctly will re-use fibers and local allocations when the number of groups is large", "[launch]") { - simsycl::system_config system; - simsycl::device_config device = simsycl::templates::device::nvidia::rtx_3090; - device.max_compute_units = 2; // we currently allocate #max_compute_units groups worth of fibers - system.platforms.push_back(simsycl::create_platform(simsycl::templates::platform::cuda_12_2)); - system.devices.push_back(simsycl::create_device(system.platforms[0], device)); - simsycl::configure_system(std::move(system)); + // we currently allocate #max_compute_units groups worth of fibers + test::configure_device_with([](simsycl::device_config &device) { device.max_compute_units = 2; }); sycl::range<1> global_range(256); sycl::range<1> local_range(16); std::vector visited(global_range.size(), false); - sycl::queue() - .submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range(global_range, local_range), [=, &visited](sycl::nd_item<1> it) { - CHECK(!visited[it.get_global_id()]); - visited[it.get_global_id()] = true; - }); - }) - .wait(); + sycl::queue().parallel_for(sycl::nd_range(global_range, local_range), [=, &visited](sycl::nd_item<1> it) { + CHECK(!visited[it.get_global_id()]); + visited[it.get_global_id()] = true; + }); for(size_t i = 0; i < global_range.size(); ++i) { CHECK(visited[i]); } } + +TEST_CASE("partial sub-groups are generated if local size is not divisible by device sub-group size", "[launch]") { + const size_t local_size = GENERATE(values({3, 10})); + const size_t num_groups = 3; + const size_t global_size = num_groups * local_size; + const size_t max_sub_group_size = 4; + + CAPTURE(local_size); + + // this is the default, but better be safe and forward-compatible + test::configure_device_with([=](simsycl::device_config &device) { device.sub_group_sizes = {max_sub_group_size}; }); + + std::vector visited(global_size); + sycl::queue().parallel_for(sycl::nd_range<1>(global_size, local_size), [&](sycl::nd_item<1> it) { + auto group = it.get_group(); + CHECK(group.get_local_range() == local_size); + + auto sg = it.get_sub_group(); + if(sg.get_group_linear_id() == sg.get_group_linear_range() - 1) { + CHECK(sg.get_local_range().size() == local_size % max_sub_group_size); + } else { + CHECK(sg.get_local_range().size() == max_sub_group_size); + } + CHECK(sg.get_local_linear_id() == it.get_local_linear_id() % max_sub_group_size); + CHECK(sg.get_max_local_range().size() == max_sub_group_size); + + // barriers should continue to work when partial sub-groups are involved + sycl::group_barrier(sg); + sycl::group_barrier(group); + + CHECK(it.get_global_linear_id() < global_size); + CHECK(it.get_global_linear_id() == it.get_group_linear_id() * local_size + it.get_local_linear_id()); + CHECK(!visited[it.get_global_linear_id()]); + visited[it.get_global_linear_id()] = true; + }); +} diff --git a/test/test_setup.cc b/test/test_setup.cc new file mode 100644 index 0000000..4919360 --- /dev/null +++ b/test/test_setup.cc @@ -0,0 +1,15 @@ +#include + +#include +#include + + +struct global_setup_and_teardown : Catch::EventListenerBase { + using EventListenerBase::EventListenerBase; + + void testCasePartialStarting(const Catch::TestCaseInfo & /* test_info */, uint64_t /* part_number */) override { + simsycl::configure_system(simsycl::builtin_system); + } +}; + +CATCH_REGISTER_LISTENER(global_setup_and_teardown); diff --git a/test/test_utils.hh b/test/test_utils.hh new file mode 100644 index 0000000..9af42a4 --- /dev/null +++ b/test/test_utils.hh @@ -0,0 +1,15 @@ +#pragma once + +#include + + +namespace simsycl::test { + +template +void configure_device_with(DeviceSetup &&setup_device) { + auto system = builtin_system; + setup_device(system.devices.at("GPU")); + configure_system(system); +} + +}; // namespace simsycl::test