diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index ce63f0b8..a57092d9 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -188,7 +188,7 @@ void enqueue_raw_kernel_launch_in_current_context( // look at things. detail_::collect_argument_addresses(argument_ptrs, ::std::forward(parameters)...); #if CUDA_VERSION >= 11000 - kernel::handle_t kernel_function_handle = kernel::detail_::get_handle( (const void*) kernel_function); + kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function); auto status = cuLaunchCooperativeKernel( kernel_function_handle, launch_configuration.dimensions.grid.x, @@ -248,7 +248,7 @@ struct raw_kernel_typegen { template typename detail_::raw_kernel_typegen::type -unwrap(const apriori_compiled_kernel_t& kernel) +unwrap(const kernel::apriori_compiled_t& kernel) { using raw_kernel_t = typename detail_::raw_kernel_typegen::type; return reinterpret_cast(const_cast(kernel.ptr())); @@ -259,9 +259,9 @@ unwrap(const apriori_compiled_kernel_t& kernel) namespace detail_ { template -struct enqueue_launch_helper { +struct enqueue_launch_helper { void operator()( - const apriori_compiled_kernel_t& wrapped_kernel, + const kernel::apriori_compiled_t& wrapped_kernel, const stream_t & stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const; diff --git a/src/cuda/api/kernels/apriori_compiled.hpp b/src/cuda/api/kernels/apriori_compiled.hpp index e491647e..c9b9a935 100644 --- a/src/cuda/api/kernels/apriori_compiled.hpp +++ b/src/cuda/api/kernels/apriori_compiled.hpp @@ -18,12 +18,13 @@ namespace cuda { +namespace kernel { + ///@cond -class device_t; -class apriori_compiled_kernel_t; +class apriori_compiled_t; ///@nocond -namespace kernel { +namespace apriori_compiled { namespace detail_ { @@ -46,16 +47,16 @@ inline handle_t get_handle(const void *kernel_function_ptr, const char* name = n } #endif -apriori_compiled_kernel_t wrap( +apriori_compiled_t wrap( device::id_t device_id, context::handle_t primary_context_handle, kernel::handle_t f, const void* ptr, bool hold_primary_context_refcount_unit = false); - } // namespace detail_ + #if ! CAN_GET_APRIORI_KERNEL_HANDLE /** * @brief a wrapper around `cudaFuncAttributes`, offering @@ -299,13 +300,13 @@ inline grid::dimension_t max_active_blocks_per_multiprocessor( } // namespace occupancy -} // namespace kernel +} // namespace apriori_compiled /** * @brief A subclass of the @ref `kernel_t` interface for kernels being * functions marked as __global__ in source files and compiled apriori. */ -class apriori_compiled_kernel_t final : public kernel_t { +class apriori_compiled_t final : public kernel_t { public: // getters const void *ptr() const noexcept { return ptr_; } const void *get() const noexcept { return ptr_; } @@ -316,7 +317,7 @@ class apriori_compiled_kernel_t final : public kernel_t { public: // non-mutators #if ! CAN_GET_APRIORI_KERNEL_HANDLE - kernel::attributes_t attributes() const; + apriori_compiled::attributes_t attributes() const; void set_cache_preference(multiprocessor_cache_preference_t preference) const override; void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const override; @@ -346,7 +347,7 @@ class apriori_compiled_kernel_t final : public kernel_t { { auto shared_memory_size_determiner = [dynamic_shared_memory_size](int) -> size_t { return dynamic_shared_memory_size; }; - return kernel::occupancy::detail_::min_grid_params_for_max_occupancy( + return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy( ptr(), device_id(), shared_memory_size_determiner, block_size_limit, disable_caching_override); @@ -357,7 +358,7 @@ class apriori_compiled_kernel_t final : public kernel_t { grid::block_dimension_t block_size_limit = 0, bool disable_caching_override = false) const override { - return kernel::occupancy::detail_::min_grid_params_for_max_occupancy( + return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy( ptr(), device_id(), shared_memory_size_determiner, block_size_limit, disable_caching_override); @@ -387,7 +388,7 @@ class apriori_compiled_kernel_t final : public kernel_t { memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override = false) const override { - return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor( + return apriori_compiled::occupancy::detail_::max_active_blocks_per_multiprocessor( ptr(), block_size_in_threads, dynamic_shared_memory_per_block, @@ -396,7 +397,7 @@ class apriori_compiled_kernel_t final : public kernel_t { #endif // ! CAN_GET_APRIORI_KERNEL_HANDLE protected: // ctors & dtor - apriori_compiled_kernel_t(device::id_t device_id, context::handle_t primary_context_handle, + apriori_compiled_t(device::id_t device_id, context::handle_t primary_context_handle, kernel::handle_t handle, const void *f, bool hold_pc_refcount_unit) : kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f) { // TODO: Consider checking whether this actually is a device function, at all and in this context @@ -404,34 +405,35 @@ class apriori_compiled_kernel_t final : public kernel_t { assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer"); #endif } - apriori_compiled_kernel_t( + apriori_compiled_t( device::id_t device_id, context::handle_t primary_context_handle, const void *f, bool hold_primary_context_refcount_unit) - : apriori_compiled_kernel_t( + : apriori_compiled_t( device_id, primary_context_handle, - kernel::detail_::get_handle(f), + apriori_compiled::detail_::get_handle(f), f, hold_primary_context_refcount_unit) { } public: // ctors & dtor - apriori_compiled_kernel_t(const apriori_compiled_kernel_t&) = default; - apriori_compiled_kernel_t(apriori_compiled_kernel_t&&) = default; + apriori_compiled_t(const apriori_compiled_t&) = default; + apriori_compiled_t(apriori_compiled_t&&) = default; public: // friends - friend apriori_compiled_kernel_t kernel::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool); + friend apriori_compiled_t apriori_compiled::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool); protected: // data members const void *const ptr_; -}; +}; // class apriori_compiled_t + +namespace apriori_compiled { -namespace kernel { namespace detail_ { -inline apriori_compiled_kernel_t wrap( +inline apriori_compiled_t wrap( device::id_t device_id, context::handle_t primary_context_handle, kernel::handle_t f, @@ -442,7 +444,7 @@ inline apriori_compiled_kernel_t wrap( } #if ! CAN_GET_APRIORI_KERNEL_HANDLE -inline ::std::string identify(const apriori_compiled_kernel_t& kernel) +inline ::std::string identify(const apriori_compiled_t& kernel) { return "apriori-compiled kernel " + cuda::detail_::ptr_as_hex(kernel.ptr()) + " in " + context::detail_::identify(kernel.context()); @@ -461,7 +463,7 @@ inline attribute_value_t get_attribute(const void* function_ptr, attribute_t att inline void set_attribute(const void* function_ptr, attribute_t attribute, attribute_value_t value) { auto handle = detail_::get_handle(function_ptr); - return detail_::set_attribute_in_current_context(handle, attribute, value); + return kernel::detail_::set_attribute_in_current_context(handle, attribute, value); } inline attribute_value_t get_attribute( @@ -484,15 +486,17 @@ inline void set_attribute( } #endif // CAN_GET_APRIORI_KERNEL_HANDLE +} // namespace apriori_compiled + /** * @note The returned kernel proxy object will keep the device's primary * context active while the kernel exists. */ template -apriori_compiled_kernel_t get(const device_t& device, KernelFunctionPtr function_ptr); +apriori_compiled_t get(const device_t& device, KernelFunctionPtr function_ptr); template -apriori_compiled_kernel_t get(context_t context, KernelFunctionPtr function_ptr); +apriori_compiled_t get(context_t context, KernelFunctionPtr function_ptr); } // namespace kernel diff --git a/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp b/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp index b076b3ff..b4b5b734 100644 --- a/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp +++ b/src/cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp @@ -15,23 +15,25 @@ namespace cuda { +namespace kernel { + #if ! CAN_GET_APRIORI_KERNEL_HANDLE #if defined(__CUDACC__) // Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy // from code compiled with a host-side-only compiler! See cuda_runtime.h for details -inline kernel::attributes_t apriori_compiled_kernel_t::attributes() const +inline apriori_compiled::attributes_t apriori_compiled_t::attributes() const { // Note: assuming the primary context is active CAW_SET_SCOPE_CONTEXT(context_handle_); - kernel::attributes_t function_attributes; + apriori_compiled::attributes_t function_attributes; auto status = cudaFuncGetAttributes(&function_attributes, ptr_); throw_if_error_lazy(status, "Failed obtaining attributes for a CUDA device function"); return function_attributes; } -inline void apriori_compiled_kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference) const +inline void apriori_compiled_t::set_cache_preference(multiprocessor_cache_preference_t preference) const { // Note: assuming the primary context is active CAW_SET_SCOPE_CONTEXT(context_handle_); @@ -41,7 +43,7 @@ inline void apriori_compiled_kernel_t::set_cache_preference(multiprocessor_cache "CUDA device function"); } -inline void apriori_compiled_kernel_t::set_shared_memory_bank_size( +inline void apriori_compiled_t::set_shared_memory_bank_size( multiprocessor_shared_memory_bank_size_option_t config) const { // Note: assuming the primary context is active @@ -50,7 +52,7 @@ inline void apriori_compiled_kernel_t::set_shared_memory_bank_size( throw_if_error_lazy(result, "Failed setting shared memory bank size to " + ::std::to_string(config)); } -inline void apriori_compiled_kernel_t::set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const +inline void apriori_compiled_t::set_attribute(attribute_t attribute, attribute_value_t value) const { // Note: assuming the primary context is active CAW_SET_SCOPE_CONTEXT(context_handle_); @@ -70,9 +72,9 @@ inline void apriori_compiled_kernel_t::set_attribute(kernel::attribute_t attribu throw_if_error_lazy(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value)); } -inline kernel::attribute_value_t apriori_compiled_kernel_t::get_attribute(kernel::attribute_t attribute) const +inline attribute_value_t apriori_compiled_t::get_attribute(attribute_t attribute) const { - kernel::attributes_t attrs = attributes(); + apriori_compiled::attributes_t attrs = attributes(); switch(attribute) { case CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: return attrs.maxThreadsPerBlock; @@ -98,9 +100,9 @@ inline kernel::attribute_value_t apriori_compiled_kernel_t::get_attribute(kernel throw cuda::runtime_error(status::not_supported, ::std::string("Attribute ") + #ifdef NDEBUG - ::std::to_string(static_cast<::std::underlying_type::type>(attribute)) + ::std::to_string(static_cast<::std::underlying_type::type>(attribute)) #else - kernel::detail_::attribute_name(attribute) + detail_::attribute_name(attribute) #endif + " cannot be obtained for apriori-compiled kernels before CUDA version 11.0" ); @@ -110,16 +112,15 @@ inline kernel::attribute_value_t apriori_compiled_kernel_t::get_attribute(kernel #endif // defined(__CUDACC__) #endif // ! CAN_GET_APRIORI_KERNEL_HANDLE -namespace kernel { - +namespace apriori_compiled { namespace detail_ { template -apriori_compiled_kernel_t get( - ::cuda::device::id_t device_id, - context::handle_t & primary_context_handle, - KernelFunctionPtr function_ptr) +apriori_compiled_t get( + device::id_t device_id, + context::handle_t & primary_context_handle, + KernelFunctionPtr function_ptr) { static_assert( ::std::is_pointer::value @@ -137,6 +138,8 @@ apriori_compiled_kernel_t get( } // namespace detail_ +} // namespace apriori_compiled + /** * @brief Obtain a wrapped kernel object corresponding to a "raw" kernel function @@ -148,10 +151,10 @@ apriori_compiled_kernel_t get( * context active while the kernel exists. */ template -apriori_compiled_kernel_t get(const device_t& device, KernelFunctionPtr function_ptr) +apriori_compiled_t get(const device_t &device, KernelFunctionPtr function_ptr) { auto primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device.id()); - return detail_::get(device.id(), primary_context_handle, function_ptr); + return apriori_compiled::detail_::get(device.id(), primary_context_handle, function_ptr); } } // namespace kernel @@ -160,9 +163,9 @@ namespace detail_ { template<> inline ::cuda::device::primary_context_t -get_implicit_primary_context(apriori_compiled_kernel_t kernel) +get_implicit_primary_context(kernel::apriori_compiled_t kernel) { - const kernel_t& kernel_ = kernel; + const kernel_t &kernel_ = kernel; return get_implicit_primary_context(kernel_); } diff --git a/src/cuda/api/multi_wrapper_impls/kernel.hpp b/src/cuda/api/multi_wrapper_impls/kernel.hpp index eea3bbaf..22106e16 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel.hpp @@ -15,7 +15,6 @@ #include "../pointer.hpp" #include "../primary_context.hpp" #include "../kernel.hpp" -#include "../current_context.hpp" namespace cuda { diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp index 523fc90f..e546430a 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp @@ -101,8 +101,8 @@ inline void validate_block_dimension_compatibility( } template -void enqueue_launch_helper::operator()( - const apriori_compiled_kernel_t& wrapped_kernel, +void enqueue_launch_helper::operator()( + const kernel::apriori_compiled_t& wrapped_kernel, const stream_t & stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const @@ -334,7 +334,7 @@ inline grid::composite_dimensions_t min_grid_params_for_max_occupancy( } // namespace detail_ inline grid::composite_dimensions_t min_grid_params_for_max_occupancy( - const apriori_compiled_kernel_t& kernel, + const kernel::apriori_compiled_t& kernel, memory::shared::size_t dynamic_shared_memory_size, grid::block_dimension_t block_size_limit, bool disable_caching_override) @@ -345,7 +345,7 @@ inline grid::composite_dimensions_t min_grid_params_for_max_occupancy( template grid::composite_dimensions_t min_grid_params_for_max_occupancy( - const apriori_compiled_kernel_t& kernel, + const kernel::apriori_compiled_t& kernel, UnaryFunction block_size_to_dynamic_shared_mem_size, grid::block_dimension_t block_size_limit, bool disable_caching_override)