From dd533971f32b9b4bc2bf8e1e9a8aaf5d22e95cae Mon Sep 17 00:00:00 2001
From: Nikita Grigorian
Date: Wed, 25 Dec 2024 21:28:25 -0800
Subject: [PATCH 1/9] Refactor `size_t` to `std::size_t` and include `cstddef` everywhere it is used

Maintains stylistic consistency and removes reliance on the
implementation-defined availability of unqualified `size_t` in the
global namespace.
---
 dpctl/_host_task_util.hpp | 1 +
 dpctl/apis/include/dpctl4pybind11.hpp | 6 +-
 .../include/kernels/accumulators.hpp | 1 +
 .../libtensor/include/kernels/alignment.hpp | 2 +-
 .../kernels/boolean_advanced_indexing.hpp | 37 +-
 .../tensor/libtensor/include/kernels/clip.hpp | 35 +-
 .../include/kernels/constructors.hpp | 17 +-
 .../include/kernels/copy_and_cast.hpp | 90 +-
 .../include/kernels/copy_as_contiguous.hpp | 54 +-
 .../kernels/elementwise_functions/abs.hpp | 4 +-
 .../kernels/elementwise_functions/acos.hpp | 4 +-
 .../kernels/elementwise_functions/acosh.hpp | 4 +-
 .../kernels/elementwise_functions/add.hpp | 20 +-
 .../kernels/elementwise_functions/angle.hpp | 4 +-
 .../kernels/elementwise_functions/asin.hpp | 4 +-
 .../kernels/elementwise_functions/asinh.hpp | 4 +-
 .../kernels/elementwise_functions/atan.hpp | 4 +-
 .../kernels/elementwise_functions/atan2.hpp | 4 +-
 .../kernels/elementwise_functions/atanh.hpp | 4 +-
 .../elementwise_functions/bitwise_and.hpp | 8 +-
 .../elementwise_functions/bitwise_invert.hpp | 4 +-
 .../bitwise_left_shift.hpp | 8 +-
 .../elementwise_functions/bitwise_or.hpp | 8 +-
 .../bitwise_right_shift.hpp | 8 +-
 .../elementwise_functions/bitwise_xor.hpp | 8 +-
 .../kernels/elementwise_functions/cbrt.hpp | 4 +-
 .../kernels/elementwise_functions/ceil.hpp | 4 +-
 .../kernels/elementwise_functions/common.hpp | 177 +--
 .../elementwise_functions/common_inplace.hpp | 82 +-
 .../kernels/elementwise_functions/conj.hpp | 4 +-
 .../elementwise_functions/copysign.hpp | 4 +-
 .../kernels/elementwise_functions/cos.hpp | 4 +-
 .../kernels/elementwise_functions/cosh.hpp | 4 +-
 .../kernels/elementwise_functions/equal.hpp | 4 +-
 .../kernels/elementwise_functions/exp.hpp | 4 +-
 .../kernels/elementwise_functions/exp2.hpp | 4 +-
 .../kernels/elementwise_functions/expm1.hpp | 4 +-
 .../kernels/elementwise_functions/floor.hpp | 4 +-
 .../elementwise_functions/floor_divide.hpp | 8 +-
 .../kernels/elementwise_functions/greater.hpp | 4 +-
 .../elementwise_functions/greater_equal.hpp | 4 +-
 .../kernels/elementwise_functions/hypot.hpp | 4 +-
 .../kernels/elementwise_functions/imag.hpp | 4 +-
 .../elementwise_functions/isfinite.hpp | 4 +-
 .../kernels/elementwise_functions/isinf.hpp | 4 +-
 .../kernels/elementwise_functions/isnan.hpp | 4 +-
 .../kernels/elementwise_functions/less.hpp | 4 +-
 .../elementwise_functions/less_equal.hpp | 4 +-
 .../kernels/elementwise_functions/log.hpp | 4 +-
 .../kernels/elementwise_functions/log10.hpp | 4 +-
 .../kernels/elementwise_functions/log1p.hpp | 4 +-
 .../kernels/elementwise_functions/log2.hpp | 4 +-
 .../elementwise_functions/logaddexp.hpp | 4 +-
 .../elementwise_functions/logical_and.hpp | 4 +-
 .../elementwise_functions/logical_not.hpp | 4 +-
 .../elementwise_functions/logical_or.hpp | 4 +-
 .../elementwise_functions/logical_xor.hpp | 4 +-
 .../kernels/elementwise_functions/maximum.hpp | 4 +-
 .../kernels/elementwise_functions/minimum.hpp | 4 +-
 .../elementwise_functions/multiply.hpp | 20 +-
 .../elementwise_functions/negative.hpp | 4 +-
 .../elementwise_functions/nextafter.hpp | 4 +-
 .../elementwise_functions/not_equal.hpp | 4 +-
 .../elementwise_functions/positive.hpp | 4 +-
 .../kernels/elementwise_functions/pow.hpp | 8 +-
 .../kernels/elementwise_functions/proj.hpp | 4 +-
 .../kernels/elementwise_functions/real.hpp | 4 +-
 .../elementwise_functions/reciprocal.hpp | 4 +-
 .../elementwise_functions/remainder.hpp | 8 +-
 .../kernels/elementwise_functions/round.hpp | 4 +-
 .../kernels/elementwise_functions/rsqrt.hpp | 4 +-
 .../kernels/elementwise_functions/sign.hpp | 4 +-
 .../kernels/elementwise_functions/signbit.hpp | 4 +-
 .../kernels/elementwise_functions/sin.hpp | 4 +-
 .../kernels/elementwise_functions/sinh.hpp | 4 +-
 .../kernels/elementwise_functions/sqrt.hpp | 4 +-
 .../kernels/elementwise_functions/square.hpp | 4 +-
 .../elementwise_functions/subtract.hpp | 20 +-
 .../kernels/elementwise_functions/tan.hpp | 4 +-
 .../kernels/elementwise_functions/tanh.hpp | 4 +-
 .../elementwise_functions/true_divide.hpp | 20 +-
 .../kernels/integer_advanced_indexing.hpp | 29 +-
 .../kernels/linalg_functions/dot_product.hpp | 244 ++--
 .../include/kernels/linalg_functions/gemm.hpp | 1042 +++++++++--------
 .../libtensor/include/kernels/reductions.hpp | 518 ++++----
 .../libtensor/include/kernels/repeat.hpp | 45 +-
 .../include/kernels/sorting/merge_sort.hpp | 145 +--
 .../include/kernels/sorting/radix_sort.hpp | 1 +
 .../include/kernels/sorting/searchsorted.hpp | 25 +-
 .../kernels/sorting/sort_impl_fn_ptr_t.hpp | 5 +-
 .../libtensor/include/kernels/where.hpp | 35 +-
 .../include/utils/indexing_utils.hpp | 3 +-
 .../libtensor/include/utils/offset_utils.hpp | 51 +-
 .../libtensor/include/utils/strided_iters.hpp | 9 +-
 .../include/utils/sycl_alloc_utils.hpp | 3 +-
 .../libtensor/include/utils/sycl_utils.hpp | 16 +-
 .../libtensor/include/utils/type_utils.hpp | 1 +
 .../tensor/libtensor/source/accumulators.cpp | 34 +-
 .../tensor/libtensor/source/accumulators.hpp | 18 +-
 .../accumulators/accumulate_over_axis.hpp | 17 +-
 .../source/boolean_advanced_indexing.cpp | 25 +-
 dpctl/tensor/libtensor/source/clip.cpp | 5 +-
 .../source/copy_and_cast_usm_to_usm.cpp | 5 +-
 .../libtensor/source/copy_as_contig.cpp | 1 +
 .../tensor/libtensor/source/copy_for_roll.cpp | 7 +-
 .../copy_numpy_ndarray_into_usm_ndarray.cpp | 11 +-
 .../elementwise_functions.hpp | 25 +-
 .../elementwise_functions/true_divide.cpp | 9 +-
 dpctl/tensor/libtensor/source/eye_ctor.cpp | 3 +-
 dpctl/tensor/libtensor/source/full_ctor.cpp | 11 +-
 .../source/integer_advanced_indexing.cpp | 23 +-
 .../libtensor/source/linalg_functions/dot.cpp | 32 +-
 .../libtensor/source/linear_sequences.cpp | 15 +-
 .../source/reductions/reduction_over_axis.hpp | 71 +-
 dpctl/tensor/libtensor/source/repeat.cpp | 35 +-
 .../source/simplify_iteration_space.cpp | 81 +-
 .../source/sorting/py_argsort_common.hpp | 9 +-
 .../source/sorting/py_sort_common.hpp | 9 +-
 .../source/sorting/radix_argsort.cpp | 5 +-
 .../libtensor/source/sorting/radix_sort.cpp | 5 +-
 .../libtensor/source/sorting/searchsorted.cpp | 7 +-
 dpctl/tensor/libtensor/source/triul_ctor.cpp | 5 +-
 dpctl/tensor/libtensor/source/where.cpp | 5 +-
 dpctl/tensor/libtensor/source/zeros_ctor.cpp | 7 +-
 dpctl/utils/src/order_keeper.cpp | 1 +
 dpctl/utils/src/sequential_order_keeper.hpp | 17 +-
 .../helper/source/dpctl_error_handlers.cpp | 1 +
 .../source/dpctl_sycl_context_interface.cpp | 1 +
 .../source/dpctl_sycl_device_interface.cpp | 1 +
 .../source/dpctl_sycl_device_manager.cpp | 1 +
 .../dpctl_sycl_kernel_bundle_interface.cpp | 1 +
 .../source/dpctl_sycl_kernel_interface.cpp | 1 +
 .../source/dpctl_sycl_platform_interface.cpp | 1 +
 .../source/dpctl_sycl_platform_manager.cpp | 1 +
 .../source/dpctl_sycl_queue_interface.cpp | 1 +
 .../source/dpctl_sycl_usm_interface.cpp | 1 +
 libsyclinterface/source/dpctl_utils.cpp | 1 +
.../source/dpctl_vector_templ.cpp | 1 + libsyclinterface/tests/test_service.cpp | 2 +- .../tests/test_sycl_context_interface.cpp | 1 + .../tests/test_sycl_device_aspects.cpp | 3 +- .../tests/test_sycl_device_interface.cpp | 1 + .../tests/test_sycl_device_manager.cpp | 1 + .../tests/test_sycl_device_subdevices.cpp | 1 + .../test_sycl_kernel_bundle_interface.cpp | 1 + .../tests/test_sycl_kernel_interface.cpp | 1 + .../tests/test_sycl_queue_interface.cpp | 1 + .../tests/test_sycl_queue_submit.cpp | 1 + ...t_sycl_queue_submit_local_accessor_arg.cpp | 1 + .../tests/test_sycl_usm_interface.cpp | 1 + 150 files changed, 1857 insertions(+), 1718 deletions(-) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 166fa814b1..5fc984e792 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -31,6 +31,7 @@ #pragma once #include +#include #include #include "Python.h" diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 1fe8b91ab1..d506fc1d85 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -27,8 +27,10 @@ #include "dpctl_capi.h" #include +#include // for std::size_t for C++ linkage #include #include +#include // for size_t for C linkage #include #include #include @@ -759,7 +761,7 @@ class usm_memory : public py::object * lifetime of the USM allocation. */ usm_memory(void *usm_ptr, - size_t nbytes, + std::size_t nbytes, const sycl::queue &q, std::shared_ptr shptr) { @@ -819,7 +821,7 @@ class usm_memory : public py::object return reinterpret_cast(MRef); } - size_t get_nbytes() const + std::size_t get_nbytes() const { auto const &api = ::dpctl::detail::dpctl_capi::get(); Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 589208b63b..86fb745eaf 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp index 9ec14dd027..b712e70ce1 100644 --- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp +++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp @@ -30,7 +30,7 @@ namespace kernels namespace alignment_utils { -static constexpr size_t required_alignment = 64UL; +static constexpr std::size_t required_alignment = 64UL; template bool is_aligned(Ptr p) { diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index f20d269bd0..4857142ae4 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -23,6 +23,7 @@ //===---------------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -55,7 +56,7 @@ struct MaskedExtractStridedFunctor MaskedExtractStridedFunctor(const dataT *src_data_p, const indT *cumsum_data_p, dataT *dst_data_p, - size_t masked_iter_size, + std::size_t masked_iter_size, const OrthogIndexerT &orthog_src_dst_indexer_, const MaskedSrcIndexerT &masked_src_indexer_, const MaskedDstIndexerT &masked_dst_indexer_, @@ -81,7 +82,7 @@ struct MaskedExtractStridedFunctor const std::size_t max_offset = masked_nelems + 1; for 
(std::uint32_t i = l_i; i < lacc.size(); i += lws) { - const size_t offset = masked_block_start + i; + const std::size_t offset = masked_block_start + i; lacc[i] = (offset == 0) ? indT(0) : (offset < max_offset) ? cumsum[offset - 1] : cumsum[masked_nelems - 1] + 1; @@ -99,9 +100,10 @@ struct MaskedExtractStridedFunctor if (mask_set && (masked_i < masked_nelems)) { const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i); - const size_t total_src_offset = masked_src_indexer(masked_i) + - orthog_offsets.get_first_offset(); - const size_t total_dst_offset = + const std::size_t total_src_offset = + masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_dst_offset = masked_dst_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); @@ -113,7 +115,7 @@ struct MaskedExtractStridedFunctor const dataT *src = nullptr; const indT *cumsum = nullptr; dataT *dst = nullptr; - const size_t masked_nelems = 0; + const std::size_t masked_nelems = 0; // has nd, shape, src_strides, dst_strides for // dimensions that ARE NOT masked const OrthogIndexerT orthog_src_dst_indexer; @@ -136,7 +138,7 @@ struct MaskedPlaceStridedFunctor MaskedPlaceStridedFunctor(dataT *dst_data_p, const indT *cumsum_data_p, const dataT *rhs_data_p, - size_t masked_iter_size, + std::size_t masked_iter_size, const OrthogIndexerT &orthog_dst_rhs_indexer_, const MaskedDstIndexerT &masked_dst_indexer_, const MaskedRhsIndexerT &masked_rhs_indexer_, @@ -157,12 +159,12 @@ struct MaskedPlaceStridedFunctor const std::uint32_t l_i = ndit.get_local_id(1); const std::uint32_t lws = ndit.get_local_range(1); - const size_t masked_i = ndit.get_global_id(1); - const size_t masked_block_start = masked_i - l_i; + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; const std::size_t max_offset = masked_nelems + 1; for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { - const size_t offset = masked_block_start + i; + const std::size_t offset = masked_block_start + i; lacc[i] = (offset == 0) ? indT(0) : (offset < max_offset) ? 
cumsum[offset - 1] : cumsum[masked_nelems - 1] + 1; @@ -180,9 +182,10 @@ struct MaskedPlaceStridedFunctor if (mask_set && (masked_i < masked_nelems)) { const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i); - const size_t total_dst_offset = masked_dst_indexer(masked_i) + - orthog_offsets.get_first_offset(); - const size_t total_rhs_offset = + const std::size_t total_dst_offset = + masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_rhs_offset = masked_rhs_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); @@ -194,7 +197,7 @@ struct MaskedPlaceStridedFunctor dataT *dst = nullptr; const indT *cumsum = nullptr; const dataT *rhs = nullptr; - const size_t masked_nelems = 0; + const std::size_t masked_nelems = 0; // has nd, shape, dst_strides, rhs_strides for // dimensions that ARE NOT masked const OrthogIndexerT orthog_dst_rhs_indexer; @@ -450,8 +453,8 @@ sycl::event masked_extract_some_slices_strided_impl( const std::size_t lws = get_lws(masked_extent); - const size_t n_groups = ((masked_extent + lws - 1) / lws); - const size_t orthog_extent = static_cast(orthog_nelems); + const std::size_t n_groups = ((masked_extent + lws - 1) / lws); + const std::size_t orthog_extent = static_cast(orthog_nelems); sycl::range<2> gRange{orthog_extent, n_groups * lws}; sycl::range<2> lRange{1, lws}; @@ -809,7 +812,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q, const std::size_t masked_block_start = group_i * lws; for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { - const size_t offset = masked_block_start + i; + const std::size_t offset = masked_block_start + i; lacc[i] = (offset == 0) ? indT1(0) : (offset - 1 < masked_extent) ? cumsum_data[offset - 1] diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index 66bedfd1cd..f89d345b42 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -85,14 +86,14 @@ template ::value || !enable_sg_loadstore) { const std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; - const size_t gid = ndit.get_global_linear_id(); + const std::size_t gid = ndit.get_global_linear_id(); const uint16_t nelems_per_sg = sgSize * nelems_per_wi; - const size_t start = + const std::size_t start = (gid / sgSize) * (nelems_per_sg - sgSize) + gid; - const size_t end = std::min(nelems, start + nelems_per_sg); + const std::size_t end = std::min(nelems, start + nelems_per_sg); - for (size_t offset = start; offset < end; offset += sgSize) { + for (std::size_t offset = start; offset < end; offset += sgSize) { dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]); } } @@ -125,7 +126,7 @@ class ClipContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); @@ -133,7 +134,7 @@ class ClipContigFunctor sycl::vec dst_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - const size_t idx = base + it * sgSize; + const std::size_t idx = base + it * sgSize; auto x_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&x_p[idx]); @@ -162,8 +163,8 @@ class ClipContigFunctor } } else { - const size_t lane_id = 
sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { dst_p[k] = clip(x_p[k], min_p[k], max_p[k]); } } @@ -175,7 +176,7 @@ template class clip_contig_kernel; typedef sycl::event (*clip_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, const char *, const char *, @@ -184,7 +185,7 @@ typedef sycl::event (*clip_contig_impl_fn_ptr_t)( template sycl::event clip_contig_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, const char *x_cp, const char *min_cp, const char *max_cp, @@ -199,10 +200,10 @@ sycl::event clip_contig_impl(sycl::queue &q, sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - size_t lws = 64; + std::size_t lws = 64; constexpr std::uint8_t vec_sz = 4; constexpr std::uint8_t n_vecs = 2; - const size_t n_groups = + const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -258,7 +259,7 @@ template class ClipStridedFunctor void operator()(sycl::id<1> id) const { - size_t gid = id[0]; + std::size_t gid = id[0]; auto offsets = indexer(static_cast(gid)); dst_p[offsets.get_fourth_offset()] = clip( x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], @@ -270,7 +271,7 @@ template class clip_strided_kernel; typedef sycl::event (*clip_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const char *, const char *, @@ -285,7 +286,7 @@ typedef sycl::event (*clip_strided_impl_fn_ptr_t)( template sycl::event clip_strided_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, int nd, const char *x_cp, const char *min_cp, diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index 30731f82c7..d458c28495 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -29,6 +29,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" #include +#include #include namespace dpctl @@ -97,7 +98,7 @@ template class LinearSequenceStepFunctor */ template sycl::event lin_space_step_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, Ty start_v, Ty step_v, char *array_data, @@ -123,10 +124,10 @@ template class LinearSequenceAffineFunctor Ty *p = nullptr; Ty start_v; Ty end_v; - size_t n; + std::size_t n; public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den) + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), n((den == 0) ? 
1 : den) { @@ -188,7 +189,7 @@ template class LinearSequenceAffineFunctor */ template sycl::event lin_space_affine_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, Ty start_v, Ty end_v, bool include_endpoint, @@ -238,7 +239,7 @@ sycl::event lin_space_affine_impl(sycl::queue &exec_q, */ template sycl::event full_contig_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, dstTy fill_v, char *dst_p, const std::vector &depends) @@ -294,7 +295,7 @@ template class FullStridedFunctor template sycl::event full_strided_impl(sycl::queue &q, int nd, - size_t nelems, + std::size_t nelems, const ssize_t *shape_strides, dstTy fill_v, char *dst_p, @@ -321,7 +322,7 @@ sycl::event full_strided_impl(sycl::queue &q, /* ================ Eye ================== */ typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, - size_t nelems, // num_elements + std::size_t nelems, // num_elements ssize_t start, ssize_t end, ssize_t step, @@ -375,7 +376,7 @@ template class EyeFunctor */ template sycl::event eye_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const ssize_t start, const ssize_t end, const ssize_t step, diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index a4e7fceca1..84cc59f3db 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -113,7 +114,7 @@ class GenericCopyFunctor */ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -159,7 +160,7 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( template sycl::event copy_and_cast_generic_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *src_p, @@ -217,12 +218,14 @@ template ::value) { std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; - const size_t gid = ndit.get_global_linear_id(); + const std::size_t gid = ndit.get_global_linear_id(); // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) const std::uint16_t elems_per_sg = sgSize * elems_per_wi; - const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid; - const size_t end = std::min(nelems, start + elems_per_sg); - for (size_t offset = start; offset < end; offset += sgSize) { + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { dst_p[offset] = fn(src_p[offset]); } } else { auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); @@ -260,7 +264,7 @@ class ContigCopyFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto src_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&src_p[offset]); @@ -278,8 +282,8 @@ class ContigCopyFunctor } } else { - const size_t start = base + sg.get_local_id()[0]; - for (size_t k = start; k < nelems; k += sgSize) { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k 
+= sgSize) { dst_p[k] = fn(src_p[k]); } } @@ -292,7 +296,7 @@ class ContigCopyFunctor */ typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, char *, const std::vector &); @@ -318,7 +322,7 @@ typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( */ template sycl::event copy_and_cast_contig_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, const char *src_cp, char *dst_cp, const std::vector &depends) @@ -332,10 +336,10 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, const srcTy *src_tp = reinterpret_cast(src_cp); dstTy *dst_tp = reinterpret_cast(dst_cp); - size_t lws = 64; + std::size_t lws = 64; constexpr std::uint32_t vec_sz = 4; constexpr std::uint32_t n_vecs = 2; - const size_t n_groups = + const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -393,7 +397,7 @@ template struct CopyAndCastContigFactory */ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const std::array &, const std::array &, const std::array &, @@ -409,7 +413,7 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( */ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const std::array &, const std::array &, const std::array &, @@ -451,7 +455,7 @@ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( template sycl::event copy_and_cast_nd_specialized_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, const std::array &shape, const std::array &src_strides, const std::array &dst_strides, @@ -544,7 +548,7 @@ class GenericCopyFromHostFunctor typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, ssize_t *, const char *, @@ -597,7 +601,7 @@ typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( template void copy_and_cast_from_host_impl( sycl::queue &q, - size_t nelems, + std::size_t nelems, int nd, ssize_t *shape_and_strides, const char *host_src_p, @@ -663,7 +667,7 @@ struct CopyAndCastFromHostFactory typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( sycl::queue &, - size_t, /* nelems */ + std::size_t, /* nelems */ const char *, /* src_pointer */ ssize_t, /* src_offset */ char *, /* dst_pointer */ @@ -699,7 +703,7 @@ typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( template void copy_and_cast_from_host_contig_impl( sycl::queue &q, - size_t nelems, + std::size_t nelems, const char *host_src_p, ssize_t src_offset, char *dst_p, @@ -792,7 +796,7 @@ class GenericCopyForReshapeFunctor // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue &, - size_t, // num_elements + std::size_t, // num_elements int, // src_nd int, // dst_nd ssize_t *, // packed shapes and strides @@ -824,7 +828,7 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( template sycl::event copy_for_reshape_generic_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, int src_nd, int dst_nd, ssize_t *packed_shapes_and_strides, @@ -881,21 +885,21 @@ template struct CopyForReshapeGenericFactory /*! 
@brief Functor to cyclically roll global_id to the left */ struct LeftRolled1DTransformer { - LeftRolled1DTransformer(size_t offset, size_t size) + LeftRolled1DTransformer(std::size_t offset, std::size_t size) : offset_(offset), size_(size) { } - size_t operator()(size_t gid) const + std::size_t operator()(std::size_t gid) const { - const size_t shifted_gid = + const std::size_t shifted_gid = ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); return shifted_gid; } private: - size_t offset_ = 0; - size_t size_ = 1; + std::size_t offset_ = 0; + std::size_t size_ = 1; }; /*! @brief Indexer functor to compose indexer and transformer */ @@ -903,7 +907,7 @@ template struct CompositionIndexer { CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} - auto operator()(size_t gid) const { return f_(t_(gid)); } + auto operator()(std::size_t gid) const { return f_(t_(gid)); } private: IndexerT f_; @@ -924,7 +928,7 @@ struct RolledNDIndexer { } - ssize_t operator()(size_t gid) const { return compute_offset(gid); } + ssize_t operator()(std::size_t gid) const { return compute_offset(gid); } private: int nd_ = -1; @@ -973,7 +977,7 @@ class StridedCopyForRollFunctor void operator()(sycl::id<1> wiid) const { - const size_t gid = wiid.get(0); + const std::size_t gid = wiid.get(0); const ssize_t src_offset = src_indexer_(gid); const ssize_t dst_offset = dst_indexer_(gid); @@ -985,8 +989,8 @@ class StridedCopyForRollFunctor // define function type typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( sycl::queue &, - size_t, // shift - size_t, // num_elements + std::size_t, // shift + std::size_t, // num_elements int, // common_nd const ssize_t *, // packed shapes and strides const char *, // src_data_ptr @@ -1021,8 +1025,8 @@ typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( */ template sycl::event copy_for_roll_strided_impl(sycl::queue &q, - size_t shift, - size_t nelems, + std::size_t shift, + std::size_t nelems, int nd, const ssize_t *packed_shapes_and_strides, const char *src_p, @@ -1073,8 +1077,8 @@ sycl::event copy_for_roll_strided_impl(sycl::queue &q, // define function type typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( sycl::queue &, - size_t, // shift - size_t, // num_elements + std::size_t, // shift + std::size_t, // num_elements const char *, // src_data_ptr ssize_t, // src_offset char *, // dst_data_ptr @@ -1106,8 +1110,8 @@ template class copy_for_roll_contig_kernel; */ template sycl::event copy_for_roll_contig_impl(sycl::queue &q, - size_t shift, - size_t nelems, + std::size_t shift, + std::size_t nelems, const char *src_p, ssize_t src_offset, char *dst_p, @@ -1176,7 +1180,7 @@ class copy_for_roll_ndshift_strided_kernel; // define function type typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( sycl::queue &, - size_t, // num_elements + std::size_t, // num_elements int, // common_nd const ssize_t *, // packed shape, strides, shifts const char *, // src_data_ptr @@ -1188,7 +1192,7 @@ typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( template sycl::event copy_for_roll_ndshift_strided_impl( sycl::queue &q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *packed_shapes_and_strides_and_shifts, const char *src_p, diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index 1a44946cc4..82447a3298 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -24,6 +24,7 
@@ #pragma once #include +#include #include #include #include @@ -53,13 +54,13 @@ template (&dst_p[block_start_id]); - const size_t elem_id0 = block_start_id + sg.get_local_id(); + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); sycl::vec dst_vec; #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { - const size_t elem_id = elem_id0 + k * sgSize; + const std::size_t elem_id = elem_id0 + k * sgSize; const ssize_t src_offset = src_indexer(elem_id); dst_vec[k] = src_p[src_offset]; } @@ -121,9 +123,9 @@ class CopyAsCContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - const size_t k0 = base + lane_id; - for (size_t k = k0; k < nelems; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { const ssize_t src_offset = src_indexer(k); dst_p[k] = src_p[src_offset]; } @@ -139,7 +141,7 @@ template sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const T *src, T *dst, const IndexerT &src_indexer, @@ -167,8 +169,8 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; - const size_t nelems_per_group = nelems_per_wi * lws; - const size_t n_groups = + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = (nelems + nelems_per_group - 1) / (nelems_per_group); sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -196,7 +198,7 @@ class as_contig_krn; template sycl::event as_c_contiguous_array_generic_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *src_p, @@ -243,7 +245,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -270,9 +272,9 @@ namespace detail template sycl::event as_c_contiguous_batch_of_square_matrices_impl( sycl::queue &exec_q, - size_t batch_nelems, + std::size_t batch_nelems, const BatchIndexerT &batch_two_offsets_indexer, - size_t n, + std::size_t n, const char *src_p, ssize_t src_ld, char *dst_p, @@ -320,7 +322,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { // 1. 
Read block from source array into SLM const std::uint32_t lid_lin = nd_it.get_local_linear_id(); - const size_t gr_id_lin = nd_it.get_group_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); @@ -523,10 +525,10 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl( template sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( sycl::queue &exec_q, - size_t batch_nelems, + std::size_t batch_nelems, ssize_t src_batch_step, ssize_t dst_batch_step, - size_t n, + std::size_t n, const char *src_p, ssize_t src_ld, char *dst_p, @@ -554,10 +556,10 @@ sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( typedef sycl::event ( *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( sycl::queue &, /* execution queue */ - size_t, /* number of batch elements */ + std::size_t, /* number of batch elements */ ssize_t, /* distance between batches in source array */ ssize_t, /* distance between batches in destination array */ - size_t, /* size of square matrices in the batch */ + std::size_t, /* size of square matrices in the batch */ const char *, ssize_t, /* untyped pointer to F-contig source array, and matrix leading dimension */ @@ -575,11 +577,11 @@ struct AsCContig1DBatchOfSquareMatricesFactory template sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( sycl::queue &exec_q, - size_t batch_nelems, + std::size_t batch_nelems, int batch_nd, const ssize_t *src_batch_shape_strides, const ssize_t dst_batch_step, - size_t n, + std::size_t n, const char *src_p, ssize_t src_ld, char *dst_p, @@ -610,12 +612,12 @@ sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( typedef sycl::event ( *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( sycl::queue &, /* execution queue */ - size_t, /* number of matrices in the batch */ + std::size_t, /* number of matrices in the batch */ int, const ssize_t *, /* dimensionality, and packed [shape, src_strides] describing iteration over batch in source array */ ssize_t, /* distance between batches in destination array */ - size_t, /* matrix size */ + std::size_t, /* matrix size */ const char *, ssize_t, /* untyped pointer to source array of F-contig matrices, and leading dimension of the matrix */ diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 0dd315fc9d..d235515556 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -147,7 +147,7 @@ class abs_contig_kernel; template sycl::event abs_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -193,7 +193,7 @@ template class abs_strided_kernel; template sycl::event abs_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index 47c69d5190..a2ce2c12c6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -181,7 +181,7 @@ class acos_contig_kernel; template sycl::event acos_contig_impl(sycl::queue 
&exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -224,7 +224,7 @@ template class acos_strided_kernel; template sycl::event acos_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index f199be5a7e..2ed61244ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -209,7 +209,7 @@ class acosh_contig_kernel; template sycl::event acosh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -252,7 +252,7 @@ template class acosh_strided_kernel; template sycl::event acosh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 69f63b53c0..7138a8afcc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -262,7 +262,7 @@ class add_contig_kernel; template sycl::event add_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -310,7 +310,7 @@ class add_strided_kernel; template sycl::event add_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -358,8 +358,8 @@ template sycl::event add_contig_matrix_contig_row_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row @@ -407,8 +407,8 @@ template sycl::event add_contig_row_contig_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -542,7 +542,7 @@ struct AddInplaceTypeMapFactory template sycl::event add_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -579,7 +579,7 @@ class add_inplace_strided_kernel; template sycl::event add_inplace_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -625,8 +625,8 @@ template sycl::event add_inplace_row_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 670b9c10f8..dc2f455fde 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -125,7 +125,7 @@ class angle_contig_kernel; template sycl::event angle_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -168,7 +168,7 @@ template class angle_strided_kernel; template sycl::event angle_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index db7ec5723e..bf466f80d4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -202,7 +202,7 @@ class asin_contig_kernel; template sycl::event asin_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -245,7 +245,7 @@ template class asin_strided_kernel; template sycl::event asin_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index 9b58d7ad19..e1237f0252 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -185,7 +185,7 @@ class asinh_contig_kernel; template sycl::event asinh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -228,7 +228,7 @@ template class asinh_strided_kernel; template sycl::event asinh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 3f96f95526..103a22ddee 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -195,7 +195,7 @@ class atan_contig_kernel; template sycl::event atan_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -238,7 +238,7 @@ template class atan_strided_kernel; template sycl::event atan_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 37bd66fb54..2160531d67 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -133,7 +133,7 @@ class atan2_contig_kernel; template sycl::event atan2_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -185,7 +185,7 @@ class atan2_strided_kernel; template sycl::event atan2_strided_impl(sycl::queue &exec_q, 
- size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 25c15ef614..790afc8a00 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -186,7 +186,7 @@ class atanh_contig_kernel; template sycl::event atanh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -229,7 +229,7 @@ template class atanh_strided_kernel; template sycl::event atanh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index 45a03c913d..ad1d27a11a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -191,7 +191,7 @@ class bitwise_and_contig_kernel; template sycl::event bitwise_and_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -245,7 +245,7 @@ class bitwise_and_strided_kernel; template sycl::event bitwise_and_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -381,7 +381,7 @@ struct BitwiseAndInplaceTypeMapFactory template sycl::event bitwise_and_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -421,7 +421,7 @@ class bitwise_and_inplace_strided_kernel; template sycl::event bitwise_and_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 582da57c29..3954dbaac6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -142,7 +142,7 @@ class bitwise_invert_contig_kernel; template sycl::event bitwise_invert_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -189,7 +189,7 @@ class bitwise_invert_strided_kernel; template sycl::event bitwise_invert_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 8cb0dcc9d0..23cc878727 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -201,7 +201,7 @@ class bitwise_left_shift_contig_kernel; template sycl::event bitwise_left_shift_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t 
nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -256,7 +256,7 @@ class bitwise_left_shift_strided_kernel; template sycl::event bitwise_left_shift_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -395,7 +395,7 @@ struct BitwiseLeftShiftInplaceTypeMapFactory template sycl::event bitwise_left_shift_inplace_contig_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -437,7 +437,7 @@ class bitwise_left_shift_inplace_strided_kernel; template sycl::event bitwise_left_shift_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index e1de5be474..3415ea6255 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -190,7 +190,7 @@ class bitwise_or_contig_kernel; template sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -243,7 +243,7 @@ class bitwise_or_strided_kernel; template sycl::event bitwise_or_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -376,7 +376,7 @@ struct BitwiseOrInplaceTypeMapFactory template sycl::event bitwise_or_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -416,7 +416,7 @@ class bitwise_or_inplace_strided_kernel; template sycl::event bitwise_or_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 35d3352c41..e58361eca2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -203,7 +203,7 @@ class bitwise_right_shift_contig_kernel; template sycl::event bitwise_right_shift_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -258,7 +258,7 @@ class bitwise_right_shift_strided_kernel; template sycl::event bitwise_right_shift_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -399,7 +399,7 @@ struct BitwiseRightShiftInplaceTypeMapFactory template sycl::event bitwise_right_shift_inplace_contig_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -442,7 +442,7 @@ class bitwise_right_shift_inplace_strided_kernel; template sycl::event bitwise_right_shift_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index fb18128cc1..2167adf40c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -192,7 +192,7 @@ class bitwise_xor_contig_kernel; template sycl::event bitwise_xor_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -246,7 +246,7 @@ class bitwise_xor_strided_kernel; template sycl::event bitwise_xor_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -382,7 +382,7 @@ struct BitwiseXorInplaceTypeMapFactory template sycl::event bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -422,7 +422,7 @@ class bitwise_xor_inplace_strided_kernel; template sycl::event bitwise_xor_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index a071558a5f..085367d136 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -117,7 +117,7 @@ class cbrt_contig_kernel; template sycl::event cbrt_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -160,7 +160,7 @@ template class cbrt_strided_kernel; template sycl::event cbrt_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index ab7610088f..39ed463d24 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -138,7 +138,7 @@ class ceil_contig_kernel; template sycl::event ceil_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -181,7 +181,7 @@ template class ceil_strided_kernel; template sycl::event ceil_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 7efd4b02ee..f6ac74ce13 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -64,10 +64,10 @@ struct UnaryContigFunctor private: const argT *in = nullptr; resT *out = nullptr; - const size_t nelems_; + const std::size_t nelems_; public: - UnaryContigFunctor(const argT *inp, resT *res, const size_t n_elems) + UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems) : in(inp), out(res), nelems_(n_elems) { } @@ -87,14 +87,14 @@ struct UnaryContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = 
sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { constexpr sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto out_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&out[offset]); @@ -103,8 +103,8 @@ struct UnaryContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = const_val; } } @@ -116,13 +116,13 @@ struct UnaryContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto in_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&in[offset]); @@ -137,8 +137,8 @@ struct UnaryContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { // scalar call out[k] = op(in[k]); } @@ -152,14 +152,14 @@ struct UnaryContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto in_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&in[offset]); @@ -177,8 +177,8 @@ struct UnaryContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in[k]); } } @@ -190,14 +190,14 @@ struct UnaryContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto in_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&in[offset]); @@ -216,8 +216,8 @@ struct UnaryContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { 
+ const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in[k]); } } @@ -225,12 +225,13 @@ struct UnaryContigFunctor else { const std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; - const size_t gid = ndit.get_global_linear_id(); + const std::size_t gid = ndit.get_global_linear_id(); const std::uint16_t elems_per_sg = sgSize * elems_per_wi; - const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid; - const size_t end = std::min(nelems_, start + elems_per_sg); - for (size_t offset = start; offset < end; offset += sgSize) { + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { out[offset] = op(in[offset]); } } @@ -293,16 +294,17 @@ template sycl::event unary_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) { constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; - const size_t n_work_items_needed = nelems / elems_per_wi; - const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed); + const std::size_t n_work_items_needed = nelems / elems_per_wi; + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); - const size_t n_groups = + const std::size_t n_groups = ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -351,7 +353,7 @@ template sycl::event unary_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -394,13 +396,13 @@ struct BinaryContigFunctor const argT1 *in1 = nullptr; const argT2 *in2 = nullptr; resT *out = nullptr; - const size_t nelems_; + const std::size_t nelems_; public: BinaryContigFunctor(const argT1 *inp1, const argT2 *inp2, resT *res, - const size_t n_elems) + const std::size_t n_elems) : in1(inp1), in2(inp2), out(res), nelems_(n_elems) { } @@ -419,7 +421,7 @@ struct BinaryContigFunctor auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); @@ -428,7 +430,7 @@ struct BinaryContigFunctor #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - size_t offset = base + it * sgSize; + std::size_t offset = base + it * sgSize; auto in1_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&in1[offset]); @@ -449,7 +451,7 @@ struct BinaryContigFunctor } else { const std::size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in1[k], in2[k]); } } @@ -460,14 +462,14 @@ struct BinaryContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto 
in1_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&in1[offset]); @@ -494,19 +496,21 @@ struct BinaryContigFunctor } else { const std::size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in1[k], in2[k]); } } } else { - const size_t sgSize = ndit.get_sub_group().get_local_range()[0]; - const size_t gid = ndit.get_global_linear_id(); - const size_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = sgSize * elems_per_wi; - const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid; - const size_t end = std::min(nelems_, start + elems_per_sg); - for (size_t offset = start; offset < end; offset += sgSize) { + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { out[offset] = op(in1[offset], in2[offset]); } } @@ -560,15 +564,15 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor const argT1 *mat; const argT2 *padded_vec; resT *res; - size_t n_elems; - size_t n1; + std::size_t n_elems; + std::size_t n1; public: BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp, const argT2 *row_tp, resT *res_tp, - size_t n_elems_in_mat, - size_t n_elems_in_row) + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat), n1(n_elems_in_row) { @@ -582,10 +586,10 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor static_assert(BinaryOperatorT::supports_sg_loadstore::value); const auto &sg = ndit.get_sub_group(); - const size_t gid = ndit.get_global_linear_id(); + const std::size_t gid = ndit.get_global_linear_id(); - const size_t sgSize = sg.get_max_local_range()[0]; - const size_t base = gid - sg.get_local_id()[0]; + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { auto in1_multi_ptr = sycl::address_space_cast< @@ -608,8 +612,8 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor sub_group_store(sg, res_el, out_multi_ptr); } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < n_elems; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { res[k] = op(mat[k], padded_vec[k % n1]); } } @@ -626,15 +630,15 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor const argT1 *padded_vec; const argT2 *mat; resT *res; - size_t n_elems; - size_t n1; + std::size_t n_elems; + std::size_t n1; public: BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp, const argT2 *mat_tp, resT *res_tp, - size_t n_elems_in_mat, - size_t n_elems_in_row) + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat), n1(n_elems_in_row) { @@ -647,10 +651,10 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor static_assert(BinaryOperatorT::supports_sg_loadstore::value); const auto &sg = ndit.get_sub_group(); - size_t gid = ndit.get_global_linear_id(); + std::size_t gid = ndit.get_global_linear_id(); - const size_t sgSize = 
sg.get_max_local_range()[0]; - const size_t base = gid - sg.get_local_id()[0]; + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { auto in1_multi_ptr = sycl::address_space_cast< @@ -673,8 +677,8 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor sub_group_store(sg, res_el, out_multi_ptr); } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < n_elems; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { res[k] = op(padded_vec[k % n1], mat[k]); } } @@ -685,14 +689,14 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor typedef sycl::event (*unary_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, char *, const std::vector &); typedef sycl::event (*unary_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -704,7 +708,7 @@ typedef sycl::event (*unary_strided_impl_fn_ptr_t)( typedef sycl::event (*binary_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, ssize_t, const char *, @@ -715,7 +719,7 @@ typedef sycl::event (*binary_contig_impl_fn_ptr_t)( typedef sycl::event (*binary_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -730,8 +734,8 @@ typedef sycl::event (*binary_strided_impl_fn_ptr_t)( typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( sycl::queue &, std::vector &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, ssize_t, const char *, @@ -743,8 +747,8 @@ typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( sycl::queue &, std::vector &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, ssize_t, const char *, @@ -773,7 +777,7 @@ template sycl::event binary_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -782,10 +786,11 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - const size_t n_work_items_needed = nelems / (n_vecs * vec_sz); - const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed); + const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz); + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); - const size_t n_groups = + const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -839,7 +844,7 @@ template sycl::event binary_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -884,8 +889,8 @@ template &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row @@ -902,10 +907,10 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( const auto &dev = exec_q.get_device(); const auto &sg_sizes = dev.get_info(); // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = + std::size_t max_sgSize = *(std::max_element(std::begin(sg_sizes), 
std::end(sg_sizes))); - size_t n1_padded = n1 + max_sgSize; + std::size_t n1_padded = n1 + max_sgSize; argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); if (padded_vec == nullptr) { @@ -926,14 +931,14 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( // We read sub_group_load(&padded_vec[(base / n0)]). // The vector is padded to ensure that reads are accessible - const size_t lws = 128; + const std::size_t lws = 128; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; auto gwsRange = sycl::range<1>(n_groups * lws); cgh.parallel_for>( @@ -964,8 +969,8 @@ template &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -982,10 +987,10 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl( const auto &dev = exec_q.get_device(); const auto &sg_sizes = dev.get_info(); // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = + std::size_t max_sgSize = *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - size_t n1_padded = n1 + max_sgSize; + std::size_t n1_padded = n1 + max_sgSize; argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); if (padded_vec == nullptr) { @@ -1007,14 +1012,14 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl( // We read sub_group_load(&padded_vec[(base / n0)]). The vector is // padded to ensure that reads are accessible - const size_t lws = 128; + const std::size_t lws = 128; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; auto gwsRange = sycl::range<1>(n_groups * lws); cgh.parallel_for>( diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index e3bf906484..552b3abd8a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -63,12 +63,12 @@ struct BinaryInplaceContigFunctor private: const argT *rhs = nullptr; resT *lhs = nullptr; - const size_t nelems_; + const std::size_t nelems_; public: BinaryInplaceContigFunctor(const argT *rhs_tp, resT *lhs_tp, - const size_t n_elems) + const std::size_t n_elems) : rhs(rhs_tp), lhs(lhs_tp), nelems_(n_elems) { } @@ -88,7 +88,7 @@ struct BinaryInplaceContigFunctor auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_max_local_range()[0]; - size_t base = + std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); @@ -96,7 +96,7 @@ struct BinaryInplaceContigFunctor #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto rhs_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&rhs[offset]); @@ -114,8 +114,8 @@ struct 
BinaryInplaceContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { op(lhs[k], rhs[k]); } } @@ -126,14 +126,14 @@ struct BinaryInplaceContigFunctor auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_max_local_range()[0]; - size_t base = + std::size_t base = elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); if (base + elems_per_wi * sgSize < nelems_) { #pragma unroll for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { - const size_t offset = base + it * sgSize; + const std::size_t offset = base + it * sgSize; auto rhs_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&rhs[offset]); @@ -153,20 +153,22 @@ struct BinaryInplaceContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems_; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { op(lhs[k], rhs[k]); } } } else { - const size_t sgSize = ndit.get_sub_group().get_local_range()[0]; - const size_t gid = ndit.get_global_linear_id(); - const size_t elems_per_sg = elems_per_wi * sgSize; - - const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid; - const size_t end = std::min(nelems_, start + elems_per_sg); - for (size_t offset = start; offset < end; offset += sgSize) { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = elems_per_wi * sgSize; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { op(lhs[offset], rhs[offset]); } } @@ -211,14 +213,14 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor private: const argT *padded_vec; resT *mat; - size_t n_elems; - size_t n1; + std::size_t n_elems; + std::size_t n1; public: BinaryInplaceRowMatrixBroadcastingFunctor(const argT *row_tp, resT *mat_tp, - size_t n_elems_in_mat, - size_t n_elems_in_row) + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) : padded_vec(row_tp), mat(mat_tp), n_elems(n_elems_in_mat), n1(n_elems_in_row) { @@ -231,10 +233,10 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor static_assert(BinaryOperatorT::supports_sg_loadstore::value); auto sg = ndit.get_sub_group(); - const size_t gid = ndit.get_global_linear_id(); + const std::size_t gid = ndit.get_global_linear_id(); std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = gid - sg.get_local_id()[0]; + std::size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { auto in_multi_ptr = sycl::address_space_cast< @@ -253,8 +255,8 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor sub_group_store(sg, mat_el, out_multi_ptr); } else { - const size_t start = base + sg.get_local_id()[0]; - for (size_t k = start; k < n_elems; k += sgSize) { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < n_elems; k += sgSize) { op(mat[k], padded_vec[k % n1]); } } @@ -265,7 +267,7 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const 
char *, ssize_t, char *, @@ -274,7 +276,7 @@ typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)( typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -287,8 +289,8 @@ typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)( typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)( sycl::queue &, std::vector &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, ssize_t, char *, @@ -309,7 +311,7 @@ template sycl::event binary_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *rhs_p, ssize_t rhs_offset, char *lhs_p, @@ -319,8 +321,8 @@ binary_inplace_contig_impl(sycl::queue &exec_q, sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - const size_t lws = 128; - const size_t n_groups = + const std::size_t lws = 128; + const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -364,7 +366,7 @@ template sycl::event binary_inplace_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *rhs_p, @@ -402,8 +404,8 @@ template &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -416,10 +418,10 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( const auto &dev = exec_q.get_device(); const auto &sg_sizes = dev.get_info(); // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = + std::size_t max_sgSize = *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - size_t n1_padded = n1 + max_sgSize; + std::size_t n1_padded = n1 + max_sgSize; argT *padded_vec = sycl::malloc_device(n1_padded, exec_q); if (padded_vec == nullptr) { @@ -440,14 +442,14 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( // We read sub_group_load(&padded_vec[(base / n0)]). 
The vector is // padded to ensure that reads are accessible - const size_t lws = 128; + const std::size_t lws = 128; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; auto gwsRange = sycl::range<1>(n_groups * lws); cgh.parallel_for>( diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index 486174435c..c91a26148a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -145,7 +145,7 @@ class conj_contig_kernel; template sycl::event conj_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -188,7 +188,7 @@ template class conj_strided_kernel; template sycl::event conj_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index 9ad6a6ad65..5a1d70e704 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -148,7 +148,7 @@ class copysign_contig_kernel; template sycl::event copysign_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -200,7 +200,7 @@ class copysign_strided_kernel; template sycl::event copysign_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index 52fbebe545..a4f70fccc4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -218,7 +218,7 @@ class cos_contig_kernel; template sycl::event cos_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -260,7 +260,7 @@ template class cos_strided_kernel; template sycl::event cos_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index b1752e5929..cbdc1e323c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -207,7 +207,7 @@ class cosh_contig_kernel; template sycl::event cosh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -250,7 +250,7 @@ template class cosh_strided_kernel; template sycl::event cosh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char 
*arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index 6a455509c5..19c983da2b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -219,7 +219,7 @@ class equal_contig_kernel; template sycl::event equal_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -271,7 +271,7 @@ class equal_strided_kernel; template sycl::event equal_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index 21edeaeb31..a721700cc5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -176,7 +176,7 @@ class exp_contig_kernel; template sycl::event exp_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -218,7 +218,7 @@ template class exp_strided_kernel; template sycl::event exp_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index df9a472329..880d5502be 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -178,7 +178,7 @@ class exp2_contig_kernel; template sycl::event exp2_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -221,7 +221,7 @@ template class exp2_strided_kernel; template sycl::event exp2_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index a8bebd7a15..d4eeacef0b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -191,7 +191,7 @@ class expm1_contig_kernel; template sycl::event expm1_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -234,7 +234,7 @@ template class expm1_strided_kernel; template sycl::event expm1_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index 2381327766..c8dcf7035a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -138,7 +138,7 @@ class floor_contig_kernel; template sycl::event floor_contig_impl(sycl::queue &exec_q, - size_t nelems, + 
std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -181,7 +181,7 @@ template class floor_strided_kernel; template sycl::event floor_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 98bc9820ba..2fdaad656d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -233,7 +233,7 @@ class floor_divide_contig_kernel; template sycl::event floor_divide_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -287,7 +287,7 @@ class floor_divide_strided_kernel; template sycl::event floor_divide_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -461,7 +461,7 @@ struct FloorDivideInplaceTypeMapFactory template sycl::event floor_divide_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -501,7 +501,7 @@ class floor_divide_inplace_strided_kernel; template sycl::event floor_divide_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 588ebc780d..84230e50f6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -221,7 +221,7 @@ class greater_contig_kernel; template sycl::event greater_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -273,7 +273,7 @@ class greater_strided_kernel; template sycl::event greater_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index 614fb202e1..30dea84f92 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -223,7 +223,7 @@ class greater_equal_contig_kernel; template sycl::event greater_equal_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -277,7 +277,7 @@ class greater_equal_strided_kernel; template sycl::event greater_equal_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index f65951f36b..24a85dc65a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -149,7 
+149,7 @@ class hypot_contig_kernel; template sycl::event hypot_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -201,7 +201,7 @@ class hypot_strided_kernel; template sycl::event hypot_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index 1d33f83d27..67026131ec 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -141,7 +141,7 @@ class imag_contig_kernel; template sycl::event imag_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -184,7 +184,7 @@ template class imag_strided_kernel; template sycl::event imag_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index 067e3e36ee..5fbee2c197 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -143,7 +143,7 @@ class isfinite_contig_kernel; template sycl::event isfinite_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -183,7 +183,7 @@ template class isfinite_strided_kernel; template sycl::event isfinite_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 70069bdaa2..c3b94862d7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -143,7 +143,7 @@ class isinf_contig_kernel; template sycl::event isinf_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -180,7 +180,7 @@ template class isinf_strided_kernel; template sycl::event isinf_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 0d8a15d0b8..28b4eaf2e9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -141,7 +141,7 @@ class isnan_contig_kernel; template sycl::event isnan_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -178,7 +178,7 @@ template class isnan_strided_kernel; template sycl::event isnan_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git 
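// Illustrative sketch (for exposition only): the signatures above spell the
// element count as std::size_t, the name that <cstddef> is guaranteed to
// declare; the unqualified ::size_t is only guaranteed by the C-compatibility
// header <stddef.h> and otherwise depends on what gets included transitively.
// `ceil_div` is a hypothetical helper name; the rounding it performs is the
// same one used for n_groups in the contig entry points.
#include <cstddef>
#include <type_traits>

static_assert(std::is_same_v<decltype(sizeof(int)), std::size_t>,
              "sizeof yields std::size_t");

inline std::size_t ceil_div(std::size_t n, std::size_t d)
{
    return (n + d - 1) / d;
}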
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 43f11725b7..61668e54a4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -218,7 +218,7 @@ class less_contig_kernel; template sycl::event less_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -269,7 +269,7 @@ class less_strided_kernel; template sycl::event less_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index 81cc375c16..c0dfc9ed40 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -220,7 +220,7 @@ class less_equal_contig_kernel; template sycl::event less_equal_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -272,7 +272,7 @@ class less_equal_strided_kernel; template sycl::event less_equal_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 13eb64afca..10369ec769 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -133,7 +133,7 @@ class log_contig_kernel; template sycl::event log_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -175,7 +175,7 @@ template class log_strided_kernel; template sycl::event log_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index ea486239e7..a1ac08479c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -152,7 +152,7 @@ class log10_contig_kernel; template sycl::event log10_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -195,7 +195,7 @@ template class log10_strided_kernel; template sycl::event log10_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index 3df38d05f0..9872a3b0be 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -157,7 +157,7 @@ class log1p_contig_kernel; template sycl::event log1p_contig_impl(sycl::queue &exec_q, - size_t 
nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -200,7 +200,7 @@ template class log1p_strided_kernel; template sycl::event log1p_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 2da4c55de0..d592da7038 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -153,7 +153,7 @@ class log2_contig_kernel; template sycl::event log2_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -196,7 +196,7 @@ template class log2_strided_kernel; template sycl::event log2_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 6d2375c20d..9d204aaf8f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -165,7 +165,7 @@ class logaddexp_contig_kernel; template sycl::event logaddexp_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -217,7 +217,7 @@ class logaddexp_strided_kernel; template sycl::event logaddexp_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 768ace7754..c6fd495793 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -191,7 +191,7 @@ class logical_and_contig_kernel; template sycl::event logical_and_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -245,7 +245,7 @@ class logical_and_strided_kernel; template sycl::event logical_and_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index 53c5404caa..14f85ec0d4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -117,7 +117,7 @@ class logical_not_contig_kernel; template sycl::event logical_not_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -158,7 +158,7 @@ class logical_not_strided_kernel; template sycl::event logical_not_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git 
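// Illustrative sketch (for exposition only): the *_contig_impl templates above
// all share one calling convention, so instances of them can be stored in a
// type-indexed table of function pointers and selected at runtime. The
// signature here is deliberately simplified (no sycl::queue / sycl::event) to
// keep the example self-contained; `demo_contig_impl` and `num_types` are
// hypothetical names, not part of the library.
#include <cstddef>
#include <cstdio>

using contig_impl_fn_ptr_t = void (*)(std::size_t nelems,
                                      const char *arg_p,
                                      char *res_p);

template <typename T>
void demo_contig_impl(std::size_t nelems, const char *arg_p, char *res_p)
{
    const T *in = reinterpret_cast<const T *>(arg_p);
    T *out = reinterpret_cast<T *>(res_p);
    for (std::size_t i = 0; i < nelems; ++i) {
        out[i] = in[i]; // stand-in for the element-wise operation
    }
}

int main()
{
    constexpr std::size_t num_types = 2; // hypothetical type-id count
    contig_impl_fn_ptr_t table[num_types] = {demo_contig_impl<float>,
                                             demo_contig_impl<double>};

    float src[3] = {1.f, 2.f, 3.f};
    float dst[3] = {};
    table[0](3, reinterpret_cast<const char *>(src),
             reinterpret_cast<char *>(dst));
    std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);
    return 0;
}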
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 93c5f3b9a6..7ada2e6027 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -189,7 +189,7 @@ class logical_or_contig_kernel; template sycl::event logical_or_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -242,7 +242,7 @@ class logical_or_strided_kernel; template sycl::event logical_or_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index 9ff54b6f16..657e86f2a4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -192,7 +192,7 @@ class logical_xor_contig_kernel; template sycl::event logical_xor_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -246,7 +246,7 @@ class logical_xor_strided_kernel; template sycl::event logical_xor_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index ed44b8ade7..d22fc98b79 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -223,7 +223,7 @@ class maximum_contig_kernel; template sycl::event maximum_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -275,7 +275,7 @@ class maximum_strided_kernel; template sycl::event maximum_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index 551daf0498..c8ba2d89f4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -223,7 +223,7 @@ class minimum_contig_kernel; template sycl::event minimum_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -275,7 +275,7 @@ class minimum_strided_kernel; template sycl::event minimum_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index 37b3803c27..8130ebf30a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -215,7 +215,7 @@ class 
multiply_contig_kernel; template sycl::event multiply_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -267,7 +267,7 @@ class multiply_strided_kernel; template sycl::event multiply_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -316,8 +316,8 @@ template sycl::event multiply_contig_matrix_contig_row_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row @@ -366,8 +366,8 @@ template sycl::event multiply_contig_row_contig_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -503,7 +503,7 @@ struct MultiplyInplaceTypeMapFactory template sycl::event multiply_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -543,7 +543,7 @@ class multiply_inplace_strided_kernel; template sycl::event multiply_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -590,8 +590,8 @@ template sycl::event multiply_inplace_row_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index a036158ccd..6acb31f581 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -123,7 +123,7 @@ class negative_contig_kernel; template sycl::event negative_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -173,7 +173,7 @@ template class negative_strided_kernel; template sycl::event negative_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp index b58b1b98ef..f1d3a38542 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -148,7 +148,7 @@ class nextafter_contig_kernel; template sycl::event nextafter_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -200,7 +200,7 @@ class nextafter_strided_kernel; template sycl::event nextafter_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git 
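// Illustrative sketch (for exposition only): the
// *_contig_matrix_contig_row / *_contig_row_contig_matrix broadcast entry
// points above combine an (n0, n1) C-contiguous matrix with an (n1,) row.
// Reduced to plain C++, the indexing is just `k % n1`; on the device the row
// is first copied into a buffer padded by the maximal sub-group size so that
// block loads near the end stay in bounds. Shapes and values below are made up.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t n0 = 3, n1 = 4;  // hypothetical matrix shape
    const std::size_t max_sg_size = 8; // stand-in for the device sub-group size

    std::vector<double> mat(n0 * n1, 1.0);
    std::vector<double> padded_row(n1 + max_sg_size, 0.0); // like n1_padded
    for (std::size_t j = 0; j < n1; ++j) {
        padded_row[j] = static_cast<double>(j);
    }

    std::vector<double> res(n0 * n1);
    for (std::size_t k = 0; k < n0 * n1; ++k) {
        res[k] = mat[k] * padded_row[k % n1]; // same k % n1 indexing as above
    }

    std::cout << res[0] << " " << res[5] << "\n"; // prints 0 and 1
    return 0;
}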
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index be1231648c..a6653696aa 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -204,7 +204,7 @@ class not_equal_contig_kernel; template sycl::event not_equal_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -256,7 +256,7 @@ class not_equal_strided_kernel; template sycl::event not_equal_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index 3ccca611d8..659c197f15 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -138,7 +138,7 @@ class positive_contig_kernel; template sycl::event positive_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -188,7 +188,7 @@ template class positive_strided_kernel; template sycl::event positive_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index 353e516d28..a5e7cc9dfe 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -266,7 +266,7 @@ class pow_contig_kernel; template sycl::event pow_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -316,7 +316,7 @@ class pow_strided_kernel; template sycl::event pow_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -514,7 +514,7 @@ struct PowInplaceTypeMapFactory template sycl::event pow_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -553,7 +553,7 @@ class pow_inplace_strided_kernel; template sycl::event pow_inplace_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index 1297dab283..f4f9fd3fc7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -142,7 +142,7 @@ class proj_contig_kernel; template sycl::event proj_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -191,7 +191,7 @@ template class proj_strided_kernel; template sycl::event proj_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index 270b613346..a3d0e5e8be 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -141,7 +141,7 @@ class real_contig_kernel; template sycl::event real_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -184,7 +184,7 @@ template class real_strided_kernel; template sycl::event real_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index 90909ea772..28f9f06a05 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -138,7 +138,7 @@ class reciprocal_contig_kernel; template sycl::event reciprocal_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -185,7 +185,7 @@ class reciprocal_strided_kernel; template sycl::event reciprocal_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 57467d56b3..15d6bd115d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -250,7 +250,7 @@ class remainder_contig_kernel; template sycl::event remainder_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -302,7 +302,7 @@ class remainder_strided_kernel; template sycl::event remainder_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -485,7 +485,7 @@ struct RemainderInplaceTypeMapFactory template sycl::event remainder_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -525,7 +525,7 @@ class remainder_inplace_strided_kernel; template sycl::event remainder_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 60ea58f7c3..37e52a6e3b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -149,7 +149,7 @@ class round_contig_kernel; template sycl::event round_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -192,7 +192,7 @@ template class round_strided_kernel; template sycl::event round_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t 
*shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index f92dac50b1..6b19e9fba4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -120,7 +120,7 @@ class rsqrt_contig_kernel; template sycl::event rsqrt_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -163,7 +163,7 @@ template class rsqrt_strided_kernel; template sycl::event rsqrt_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index ffb4183474..615e80efda 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -161,7 +161,7 @@ class sign_contig_kernel; template sycl::event sign_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -208,7 +208,7 @@ template class sign_strided_kernel; template sycl::event sign_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 7ba04fcd17..88edeacf44 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -127,7 +127,7 @@ class signbit_contig_kernel; template sycl::event signbit_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -172,7 +172,7 @@ template class signbit_strided_kernel; template sycl::event signbit_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index 596c1de9e4..d37900777b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -240,7 +240,7 @@ class sin_contig_kernel; template sycl::event sin_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -282,7 +282,7 @@ template class sin_strided_kernel; template sycl::event sin_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index 6d418872b8..c918323e66 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -209,7 +209,7 @@ class sinh_contig_kernel; template sycl::event 
sinh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -252,7 +252,7 @@ template class sinh_strided_kernel; template sycl::event sinh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 6dcb2ca742..88a3db84ac 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -135,7 +135,7 @@ class sqrt_contig_kernel; template sycl::event sqrt_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -178,7 +178,7 @@ template class sqrt_strided_kernel; template sycl::event sqrt_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index dbf665b79c..aaabd9761c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -160,7 +160,7 @@ class square_contig_kernel; template sycl::event square_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -205,7 +205,7 @@ template class square_strided_kernel; template sycl::event square_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 47ca000c3f..0ebf102805 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -201,7 +201,7 @@ class subtract_contig_kernel; template sycl::event subtract_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -253,7 +253,7 @@ class subtract_strided_kernel; template sycl::event subtract_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -313,8 +313,8 @@ template sycl::event subtract_contig_matrix_contig_row_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row @@ -363,8 +363,8 @@ template sycl::event subtract_contig_row_contig_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -501,7 +501,7 @@ struct SubtractInplaceTypeMapFactory template sycl::event subtract_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + 
std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -541,7 +541,7 @@ class subtract_inplace_strided_kernel; template sycl::event subtract_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -588,8 +588,8 @@ template sycl::event subtract_inplace_row_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index a7da718a4b..7dbc80a66a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -184,7 +184,7 @@ class tan_contig_kernel; template sycl::event tan_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -226,7 +226,7 @@ template class tan_strided_kernel; template sycl::event tan_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 626420d48b..cdb38c00a7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -178,7 +178,7 @@ class tanh_contig_kernel; template sycl::event tanh_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -221,7 +221,7 @@ template class tanh_strided_kernel; template sycl::event tanh_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 27de2069ff..454745e7d3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -209,7 +209,7 @@ class true_divide_contig_kernel; template sycl::event true_divide_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg1_p, ssize_t arg1_offset, const char *arg2_p, @@ -262,7 +262,7 @@ class true_divide_strided_kernel; template sycl::event true_divide_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg1_p, @@ -323,8 +323,8 @@ template sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row @@ -373,8 +373,8 @@ template sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char 
*vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix @@ -530,7 +530,7 @@ class true_divide_inplace_contig_kernel; template sycl::event true_divide_inplace_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, ssize_t arg_offset, char *res_p, @@ -570,7 +570,7 @@ class true_divide_inplace_strided_kernel; template sycl::event true_divide_inplace_strided_impl( sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -617,8 +617,8 @@ template sycl::event true_divide_inplace_row_matrix_broadcast_impl( sycl::queue &exec_q, std::vector &host_tasks, - size_t n0, - size_t n1, + std::size_t n0, + std::size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 462323f001..1ca651a06f 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -56,7 +57,7 @@ class TakeFunctor char *dst_ = nullptr; char **ind_ = nullptr; int k_ = 0; - size_t ind_nelems_ = 0; + std::size_t ind_nelems_ = 0; const ssize_t *axes_shape_and_strides_ = nullptr; const OrthogIndexer orthog_strider; const IndicesIndexer ind_strider; @@ -67,7 +68,7 @@ class TakeFunctor char *dst_cp, char **ind_cp, int k, - size_t ind_nelems, + std::size_t ind_nelems, const ssize_t *axes_shape_and_strides, const OrthogIndexer &orthog_strider_, const IndicesIndexer &ind_strider_, @@ -120,8 +121,8 @@ template sycl::event take_impl(sycl::queue &q, - size_t orthog_nelems, - size_t ind_nelems, + std::size_t orthog_nelems, + std::size_t ind_nelems, int nd, int ind_nd, int k, @@ -176,7 +177,7 @@ sycl::event take_impl(sycl::queue &q, take_kernel; - const size_t gws = orthog_nelems * ind_nelems; + const std::size_t gws = orthog_nelems * ind_nelems; cgh.parallel_for( sycl::range<1>(gws), @@ -202,7 +203,7 @@ class PutFunctor const char *val_ = nullptr; char **ind_ = nullptr; int k_ = 0; - size_t ind_nelems_ = 0; + std::size_t ind_nelems_ = 0; const ssize_t *axes_shape_and_strides_ = nullptr; const OrthogIndexer orthog_strider; const IndicesIndexer ind_strider; @@ -213,7 +214,7 @@ class PutFunctor const char *val_cp, char **ind_cp, int k, - size_t ind_nelems, + std::size_t ind_nelems, const ssize_t *axes_shape_and_strides, const OrthogIndexer &orthog_strider_, const IndicesIndexer &ind_strider_, @@ -267,8 +268,8 @@ template sycl::event put_impl(sycl::queue &q, - size_t orthog_nelems, - size_t ind_nelems, + std::size_t orthog_nelems, + std::size_t ind_nelems, int nd, int ind_nd, int k, @@ -323,7 +324,7 @@ sycl::event put_impl(sycl::queue &q, put_kernel; - const size_t gws = orthog_nelems * ind_nelems; + const std::size_t gws = orthog_nelems * ind_nelems; cgh.parallel_for( sycl::range<1>(gws), diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 3eeec88f16..e8f5c53f5b 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ 
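
In the `integer_advanced_indexing.hpp` hunks the flattened global work size is `gws = orthog_nelems * ind_nelems`. The hypothetical helper below (the names `decode_take_id`, `orthog_id`, `ind_id` are illustrative, not from the patch) shows how such a flat id splits back into the pair of indices the Take/Put functors operate on.

```cpp
// Host-side model of decomposing a flat global id of size
// orthog_nelems * ind_nelems into (outer element, index entry).
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t>
decode_take_id(std::size_t gid, std::size_t ind_nelems)
{
    const std::size_t orthog_id = gid / ind_nelems; // which "outer" element
    const std::size_t ind_id = gid % ind_nelems;    // which index entry
    return {orthog_id, ind_id};
}
```
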
b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -62,7 +62,7 @@ struct SequentialDotProduct outT *out_ = nullptr; BatchIndexerT batch_indexer_; RedIndexerT reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; + std::size_t reduction_max_gid_ = 0; public: SequentialDotProduct(const lhsT *lhs, @@ -70,7 +70,7 @@ struct SequentialDotProduct outT *out, BatchIndexerT batch_indexer, RedIndexerT reduced_dims_indexer, - size_t reduction_size) + std::size_t reduction_size) : lhs_(lhs), rhs_(rhs), out_(out), batch_indexer_(batch_indexer), reduced_dims_indexer_(reduced_dims_indexer), reduction_max_gid_(reduction_size) @@ -86,7 +86,7 @@ struct SequentialDotProduct const ssize_t &out_batch_offset = batch_offsets.get_third_offset(); outT red_val(0); - for (size_t m = 0; m < reduction_max_gid_; ++m) { + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { auto reduction_offsets = reduced_dims_indexer_(m); auto lhs_reduction_offset = reduction_offsets.get_first_offset(); auto rhs_reduction_offset = reduction_offsets.get_second_offset(); @@ -117,9 +117,9 @@ struct DotProductFunctor const ReductionOpT reduction_op_; const BatchIndexerT batch_indexer_; const RedIndexerT reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t batches_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; public: DotProductFunctor(const lhsT *lhs, @@ -128,9 +128,9 @@ struct DotProductFunctor const ReductionOpT &reduction_op, const BatchIndexerT &batch_indexer, const RedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), batch_indexer_(batch_indexer), reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -141,11 +141,12 @@ struct DotProductFunctor void operator()(sycl::nd_item<1> it) const { - const size_t batch_id = it.get_group(0) % batches_; - const size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -159,12 +160,12 @@ struct DotProductFunctor const auto &out_batch_offset = batch_offsets_.get_third_offset(); outT local_red_val(0); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); @@ -213,9 +214,9 @@ struct DotProductCustomFunctor const BatchIndexerT batch_indexer_; const RedIndexerT reduced_dims_indexer_; SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t batches_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; 
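
The `DotProductFunctor` hunks change the index arithmetic to `std::size_t` without altering the work distribution: each group handles one `(batch, reduction-block)` pair and each work-item strides through its slice of the reduced axis with step `wg`. The sketch below reproduces that loop on the host under assumed types (a plain `double` accumulator stands in for the functor's `outT`).

```cpp
// Serial model of the per-work-item slice of the dot-product reduction.
#include <algorithm>
#include <cstddef>

double serial_model(const double *lhs, const double *rhs,
                    std::size_t reduction_max_gid,
                    std::size_t wg,                // work-group size
                    std::size_t reductions_per_wi,
                    std::size_t reduction_batch_id,
                    std::size_t reduction_lid)
{
    const std::size_t gid0 =
        reduction_lid + reduction_batch_id * wg * reductions_per_wi;
    const std::size_t gid_max =
        std::min(reduction_max_gid, gid0 + reductions_per_wi * wg);

    double local_red_val = 0.0;
    for (std::size_t gid = gid0; gid < gid_max; gid += wg) {
        local_red_val += lhs[gid] * rhs[gid];
    }
    return local_red_val; // subsequently reduced across the work-group
}
```
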
+ std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; public: DotProductCustomFunctor(const lhsT *lhs, @@ -225,9 +226,9 @@ struct DotProductCustomFunctor const BatchIndexerT &batch_indexer, const RedIndexerT &arg_reduced_dims_indexer, SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), batch_indexer_(batch_indexer), reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -238,11 +239,12 @@ struct DotProductCustomFunctor void operator()(sycl::nd_item<1> it) const { - const size_t batch_id = it.get_group(0) % batches_; - const size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -256,12 +258,12 @@ struct DotProductCustomFunctor const auto &out_batch_offset = batch_offsets_.get_third_offset(); outT local_red_val(0); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); @@ -305,8 +307,8 @@ sycl::event sequential_dot_product(sycl::queue &exec_q, const lhsTy *lhs, const rhsTy *rhs, resTy *res, - size_t batches, - size_t reduction_nelems, + std::size_t batches, + std::size_t reduction_nelems, const BatchIndexerT &batch_indexer, const RedIndexerT &reduction_indexer, const std::vector &depends) @@ -343,11 +345,11 @@ sycl::event submit_atomic_dot_product(sycl::queue &exec_q, const lhsTy *lhs, const rhsTy *rhs, resTy *res, - size_t wg, - size_t batches, - size_t reduction_nelems, - size_t reductions_per_wi, - size_t reduction_groups, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, const BatchIndexerT &batch_indexer, const RedIndexerT &reduction_indexer, const std::vector &depends) @@ -405,8 +407,8 @@ class dot_product_krn; typedef sycl::event (*dot_product_impl_fn_ptr_t)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, const char *, char *, @@ -423,8 +425,8 @@ typedef sycl::event (*dot_product_impl_fn_ptr_t)( template sycl::event dot_product_impl(sycl::queue &exec_q, - size_t batches, - size_t reduction_nelems, + std::size_t batches, + std::size_t reduction_nelems, const char *lhs_cp, const char *rhs_cp, char *res_cp, @@ -445,7 +447,7 @@ sycl::event dot_product_impl(sycl::queue &exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using 
InputOutputBatchIndexerT = @@ -504,14 +506,14 @@ sycl::event dot_product_impl(sycl::queue &exec_q, reduction_rhs_offset, reduction_shape_stride}; - constexpr size_t preferred_reductions_per_wi = + constexpr std::size_t preferred_reductions_per_wi = 4; // determined experimentally - size_t reductions_per_wi = + std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) : preferred_reductions_per_wi; - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); @@ -529,8 +531,8 @@ sycl::event dot_product_impl(sycl::queue &exec_q, typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, const char *, char *, @@ -544,8 +546,8 @@ typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)( template sycl::event dot_product_contig_impl(sycl::queue &exec_q, - size_t batches, - size_t reduction_nelems, + std::size_t batches, + std::size_t reduction_nelems, const char *lhs_cp, const char *rhs_cp, char *res_cp, @@ -564,7 +566,7 @@ dot_product_contig_impl(sycl::queue &exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputBatchIndexerT = @@ -619,14 +621,14 @@ dot_product_contig_impl(sycl::queue &exec_q, constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - constexpr size_t preferred_reductions_per_wi = + constexpr std::size_t preferred_reductions_per_wi = 4; // determined experimentally - size_t reductions_per_wi = + std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) + ? 
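
A recurring pattern in these reduction hunks is sizing the work with ceil division and clamping with `std::max`; when the other operand is `std::size_t`, the `int` literal `1` needs an explicit `std::max<std::size_t>` instantiation to compile. A small standalone helper (hypothetical, not from the patch) showing the same computation used for `reductions_per_wi` and `reduction_groups`:

```cpp
#include <algorithm>
#include <cstddef>

// ceil(a / b) for unsigned operands
constexpr std::size_t ceil_div(std::size_t a, std::size_t b)
{
    return (a + b - 1) / b;
}

std::size_t pick_reductions_per_wi(std::size_t reduction_nelems,
                                   std::size_t wg,
                                   std::size_t preferred)
{
    // if less work remains than the preferred amount per group, each
    // work-item takes ceil(nelems / wg) elements instead
    return (reduction_nelems < preferred * wg)
               ? std::max<std::size_t>(1, ceil_div(reduction_nelems, wg))
               : preferred;
}
```
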
std::max(1, (reduction_nelems + wg - 1) / wg) : preferred_reductions_per_wi; - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); @@ -657,9 +659,9 @@ struct DotProductNoAtomicFunctor const ReductionOpT reduction_op_; const BatchIndexerT batch_indexer_; const RedIndexerT reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t batches_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; public: DotProductNoAtomicFunctor(const lhsT *lhs, @@ -668,9 +670,9 @@ struct DotProductNoAtomicFunctor const ReductionOpT &reduction_op, const BatchIndexerT &batch_indexer, const RedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), batch_indexer_(batch_indexer), reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -681,12 +683,13 @@ struct DotProductNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg - const size_t batch_id = it.get_group(0) % batches_; - const size_t reduction_batch_id = it.get_group(0) / batches_; - const size_t n_reduction_groups = it.get_group_range(0) / batches_; + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -700,12 +703,12 @@ struct DotProductNoAtomicFunctor const auto &out_batch_offset = batch_offsets_.get_third_offset(); outT local_red_val(0); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); @@ -756,9 +759,9 @@ struct DotProductNoAtomicCustomFunctor const BatchIndexerT batch_indexer_; const RedIndexerT reduced_dims_indexer_; SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t batches_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; public: DotProductNoAtomicCustomFunctor(const lhsT *lhs, @@ -768,9 +771,9 @@ struct DotProductNoAtomicCustomFunctor const BatchIndexerT &batch_indexer, const RedIndexerT &arg_reduced_dims_indexer, SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), batch_indexer_(batch_indexer), reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -781,12 +784,13 @@ struct 
DotProductNoAtomicCustomFunctor void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg - const size_t batch_id = it.get_group(0) % batches_; - const size_t reduction_batch_id = it.get_group(0) / batches_; - const size_t n_reduction_groups = it.get_group_range(0) / batches_; + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -800,12 +804,12 @@ struct DotProductNoAtomicCustomFunctor const auto &out_batch_offset = batch_offsets_.get_third_offset(); outT local_red_val(0); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); @@ -854,11 +858,11 @@ submit_no_atomic_dot_product(sycl::queue &exec_q, const lhsTy *lhs, const rhsTy *rhs, resTy *res, - size_t wg, - size_t batches, - size_t reduction_nelems, - size_t reductions_per_wi, - size_t reduction_groups, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, const BatchIndexerT &batch_indexer, const RedIndexerT &reduction_indexer, const std::vector &depends) @@ -916,8 +920,8 @@ class dot_product_tree_reduction_krn; template sycl::event dot_product_tree_impl(sycl::queue &exec_q, - size_t batches, - size_t reduction_nelems, + std::size_t batches, + std::size_t reduction_nelems, const char *lhs_cp, const char *rhs_cp, char *res_cp, @@ -938,7 +942,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputOutputBatchIndexerT = @@ -963,15 +967,15 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, return dot_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); using ReductionOpT = typename std::conditional, sycl::logical_or, sycl::plus>::type; - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { using BatchIndexerT = dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; @@ -990,9 +994,9 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + 
std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -1012,12 +1016,12 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, sycl::known_identity::value; // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -1067,7 +1071,7 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, in_out_iter_indexer, reduction_indexer, depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -1076,9 +1080,10 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -1130,8 +1135,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -1164,8 +1169,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, template sycl::event dot_product_contig_tree_impl(sycl::queue &exec_q, - size_t batches, - size_t reduction_nelems, + std::size_t batches, + std::size_t reduction_nelems, const char *lhs_cp, const char *rhs_cp, char *res_cp, @@ -1184,7 +1189,7 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputBatchIndexerT = @@ -1214,15 +1219,15 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, return dot_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); using ReductionOpT = typename std::conditional, sycl::logical_or, sycl::plus>::type; - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { using InputBatchIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -1246,9 +1251,9 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups 
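
`dot_product_tree_impl` and its contig variant reduce in stages when the reduced axis is too long for a single pass: partial sums go to a temporary, and the loop keeps shrinking the problem, ping-ponging between two temporaries, until one work-group can finish it. A host-side sketch of that control flow (counting passes only; buffer management and kernel submission omitted):

```cpp
#include <cstddef>

std::size_t count_tree_passes(std::size_t reduction_groups,
                              std::size_t preferred_reductions_per_wi,
                              std::size_t wg,
                              std::size_t max_wg)
{
    std::size_t remaining = reduction_groups;
    std::size_t passes = 0;
    while (remaining > preferred_reductions_per_wi * max_wg) {
        // each pass shrinks the problem by a factor of
        // preferred_reductions_per_wi * wg (rounded up)
        remaining = (remaining + preferred_reductions_per_wi * wg - 1) /
                    (preferred_reductions_per_wi * wg);
        ++passes;
    }
    return passes + 1; // plus the final single-group pass
}
```
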
= (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -1267,12 +1272,12 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, sycl::known_identity::value; // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -1317,7 +1322,7 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, inp_out_batch_indexer, reduction_indexer, depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -1326,9 +1331,10 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -1377,8 +1383,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index ef2769275b..63f9bcf37b 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -52,14 +52,14 @@ using dpctl::tensor::ssize_t; namespace gemm_detail { -template -void scale_gemm_k_parameters(const size_t &local_mem_size, - const size_t &reserved_slm_size, - const size_t delta_k, - size_t &n_wi, - size_t &delta_n) +template +void scale_gemm_k_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t delta_k, + std::size_t &n_wi, + std::size_t &delta_n) { - constexpr size_t slm_elem_size = sizeof(T) * m_groups; + constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >= local_mem_size) @@ -72,15 +72,15 @@ void scale_gemm_k_parameters(const size_t &local_mem_size, } template -void scale_gemm_nm_parameters(const size_t &local_mem_size, - const size_t &reserved_slm_size, - const size_t &wi_delta_n, - size_t &wi_delta_k, - size_t &wg_delta_n, - size_t &wg_delta_m) +void scale_gemm_nm_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t &wi_delta_n, + std::size_t &wi_delta_k, + std::size_t &wg_delta_n, + std::size_t &wg_delta_m) { - constexpr size_t slm_A_elem_size = sizeof(T); - constexpr size_t slm_B_elem_size = sizeof(T) * wi_delta_m; + constexpr std::size_t slm_A_elem_size = sizeof(T); + constexpr std::size_t slm_B_elem_size = 
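
`scale_gemm_k_parameters` shrinks the blocking factors until the local accessors fit in device local memory after subtracting a reserved amount. The loop body is not visible in this rendering, so the shrink policy in the sketch below (decrement `n_wi` first, then halve `delta_n`) is an assumption made only to keep it self-contained; the fitting condition itself is copied from the hunk.

```cpp
#include <cstddef>

void scale_k_parameters(std::size_t local_mem_size,
                        std::size_t reserved_slm_size,
                        std::size_t slm_elem_size, // sizeof(T) * m_groups
                        std::size_t delta_k,
                        std::size_t &n_wi,    // modified by reference
                        std::size_t &delta_n) // modified by reference
{
    // condition from the hunk; the shrink steps below are illustrative
    while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >=
           local_mem_size)
    {
        if (n_wi > 1) {
            --n_wi;
        }
        else if (delta_n > 1) {
            delta_n /= 2;
        }
        else {
            break; // cannot shrink any further
        }
    }
}
```
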
sizeof(T) * wi_delta_m; while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) + (wi_delta_k * wg_delta_m * slm_B_elem_size) + @@ -109,13 +109,13 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, T *tmp_tp, T *res_tp, T identity_val, - size_t iter_nelems, - size_t reduction_nelems, - size_t reduction_groups, - size_t wg, - size_t max_wg, - size_t preferred_reductions_per_wi, - size_t reductions_per_wi, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, int res_nd, ssize_t res_offset, const ssize_t *res_shapes_strides, @@ -170,9 +170,9 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -193,13 +193,13 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, T *tmp_tp, T *res_tp, T identity_val, - size_t iter_nelems, - size_t reduction_nelems, - size_t reduction_groups, - size_t wg, - size_t max_wg, - size_t preferred_reductions_per_wi, - size_t reductions_per_wi, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, const std::vector &depends) { sycl::event red_ev; @@ -251,9 +251,9 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -274,13 +274,13 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, T *partially_reduced_tmp2, T *res_tp, T identity_val, - size_t iter_nelems, - size_t reduction_nelems, - size_t reduction_groups, - size_t wg, - size_t max_wg, - size_t preferred_reductions_per_wi, - size_t reductions_per_wi, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, int res_nd, ssize_t res_offset, const ssize_t *res_shape_strides, @@ -309,16 +309,16 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, reduction_groups, in_out_iter_indexer, reduction_indexer, depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; T *temp_arg = partially_reduced_tmp2; T *temp2_arg = partially_reduced_tmp; sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -373,7 +373,7 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, wg = max_wg; reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + 
std::max(1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -401,13 +401,13 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, T *partially_reduced_tmp2, T *res_tp, T identity_val, - size_t iter_nelems, - size_t reduction_nelems, - size_t reduction_groups, - size_t wg, - size_t max_wg, - size_t preferred_reductions_per_wi, - size_t reductions_per_wi, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, const std::vector &depends) { using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -429,16 +429,16 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, wg, iter_nelems, reduction_nelems, reductions_per_wi, reduction_groups, in_out_iter_indexer, reduction_indexer, depends); - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; T *temp_arg = partially_reduced_tmp2; T *temp2_arg = partially_reduced_tmp; sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -494,10 +494,10 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -520,7 +520,7 @@ template + std::size_t m_groups> class GemmBatchFunctorThreadK { private: @@ -529,15 +529,15 @@ class GemmBatchFunctorThreadK resT *res = nullptr; LocAccT workspace; LocAccT local_B_block; - size_t n = 0; - size_t n_blocks = 0; - size_t delta_n = 0; - size_t k = 0; - size_t k_blocks = 0; - size_t delta_k = 0; - size_t n_wi = 0; - size_t m = 0; - size_t batch_nelems = 0; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; const BatchDimsIndexerT batch_indexer; const OuterInnerDimsIndexerT lhs_indexer; const OuterInnerDimsIndexerT rhs_indexer; @@ -549,15 +549,15 @@ class GemmBatchFunctorThreadK resT *res_, LocAccT workspace_, LocAccT local_B_block_, - size_t n_, - size_t n_blocks_, - size_t delta_n_, - size_t k_, - size_t k_blocks_, - size_t delta_k_, - size_t n_wi_, - size_t m_, - size_t batch_nelems_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, const BatchDimsIndexerT &batch_indexer_, const OuterInnerDimsIndexerT &lhs_indexer_, const OuterInnerDimsIndexerT &rhs_indexer_, @@ -578,11 +578,12 @@ class GemmBatchFunctorThreadK // batch_nelems) for lhs, offset = m_id * (n * k) for rhs, offset = // m_id // * (k * m) for res, offset 
= m_id * (n * m) - const size_t n_groups_per_batch = it.get_group_range(0) / batch_nelems; - const size_t m_id = it.get_group_linear_id() / n_groups_per_batch; - const size_t gr_id = + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = it.get_group_linear_id() - m_id * n_groups_per_batch; - const size_t lid = it.get_local_linear_id(); + const std::size_t lid = it.get_local_linear_id(); const auto &three_offsets_ = batch_indexer(static_cast(m_id)); @@ -593,32 +594,32 @@ class GemmBatchFunctorThreadK // lift gr_id -> (block_i, block_j, block_s) // block_i moves fastest, then block_s, then block_j - const size_t r_size = (n_blocks * k_blocks); + const std::size_t r_size = (n_blocks * k_blocks); // 0 <= block_j < m_blocks, - const size_t block_j = gr_id / r_size; + const std::size_t block_j = gr_id / r_size; // 0 <= block_r < n_blocks * k_blocks - const size_t block_r = gr_id - block_j * r_size; + const std::size_t block_r = gr_id - block_j * r_size; // 0 <= block_s < k_blocks - const size_t block_s = block_r / n_blocks; + const std::size_t block_s = block_r / n_blocks; // 0 <= block_i < n_blocks - const size_t block_i = block_r - block_s * n_blocks; + const std::size_t block_i = block_r - block_s * n_blocks; // 0 <= local_i < delta_n - const size_t local_i = lid / (delta_k); + const std::size_t local_i = lid / (delta_k); // 0 <= local_s < delta_k - const size_t local_s = lid - local_i * (delta_k); + const std::size_t local_s = lid - local_i * (delta_k); - size_t i = block_i * delta_n + local_i; - size_t j = m_groups * block_j; - size_t s = block_s * delta_k * n_wi + local_s; + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; using accV_t = typename LocAccT::value_type; constexpr resT identity_ = resT(0); if (local_i == 0) { - for (size_t q = 0; q < n_wi * delta_k; q += delta_k) { - const size_t sq = s + q; - const size_t sqmj = sq * m + j; + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + const std::size_t sq = s + q; + const std::size_t sqmj = sq * m + j; if constexpr (m_groups == 1 && std::is_same_v) { local_B_block[local_s + q] = @@ -630,7 +631,8 @@ class GemmBatchFunctorThreadK else { accV_t local_B_vec; #pragma unroll - for (size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx) { + for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx) + { local_B_vec[vec_idx] = (sq < k && j + vec_idx < m) ? static_cast( @@ -645,12 +647,12 @@ class GemmBatchFunctorThreadK it.barrier(sycl::access::fence_space::local_space); - size_t t_shift = block_s * delta_k * n_wi; - size_t global_s_offset = i * k + t_shift; + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); constexpr accV_t vec_identity_(identity_); - for (size_t t = local_s; t < local_B_block.size(); t += delta_k) { + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) ? 
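
`GemmBatchFunctorThreadK::operator()` lifts the per-batch group id into three block coordinates with `block_i` moving fastest, then `block_s`, then `block_j`. The same arithmetic as a standalone function:

```cpp
#include <array>
#include <cstddef>

// returns {block_i, block_j, block_s} for a linear group id within a batch
std::array<std::size_t, 3>
lift_group_id(std::size_t gr_id, std::size_t n_blocks, std::size_t k_blocks)
{
    const std::size_t r_size = n_blocks * k_blocks;
    const std::size_t block_j = gr_id / r_size;               // slowest
    const std::size_t block_r = gr_id - block_j * r_size;
    const std::size_t block_s = block_r / n_blocks;
    const std::size_t block_i = block_r - block_s * n_blocks; // fastest
    return {block_i, block_j, block_s};
}
```
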
(static_cast( @@ -659,14 +661,14 @@ class GemmBatchFunctorThreadK : vec_identity_; } - size_t workspace_i_shift = local_i * delta_k; + std::size_t workspace_i_shift = local_i * delta_k; workspace[workspace_i_shift + local_s] = private_sum; it.barrier(sycl::access::fence_space::local_space); if (local_s == 0 && i < n) { accV_t local_sum(workspace[workspace_i_shift]); - for (size_t t = 1; t < delta_k; ++t) { + for (std::size_t t = 1; t < delta_k; ++t) { local_sum += workspace[workspace_i_shift + t]; } @@ -682,7 +684,7 @@ class GemmBatchFunctorThreadK aout0 += local_sum[0]; #pragma unroll - for (size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { if (j + vec_id < m) { sycl::atomic_ref< resT, sycl::memory_order::relaxed, @@ -701,10 +703,10 @@ class GemmBatchFunctorThreadK template class gemm_init_krn; -template +template class gemm_k_krn; -template +template class gemm_nm_krn; template + std::size_t> class gemm_batch_k_krn; template + std::size_t> class gemm_batch_nm_krn; namespace gemm_detail @@ -737,28 +739,28 @@ sycl::event _gemm_k_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - const size_t batch_nelems, - const size_t n, - const size_t k, - const size_t m, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, const BatchIndexerT &batch_indexer, const LhsIndexerT &lhs_indexer, const RhsIndexerT &rhs_indexer, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr size_t m_groups = 4; - const size_t delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + constexpr std::size_t m_groups = 4; + const std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); static_assert(std::is_same_v); static_assert(std::is_same_v); const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -766,11 +768,11 @@ sycl::event _gemm_k_impl(sycl::queue &exec_q, delta_n // modified by reference ); - size_t n_blocks = (n + delta_n - 1) / delta_n; - size_t m_blocks = (m + m_groups - 1) / m_groups; - size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); - size_t lws = delta_n * delta_k; + std::size_t lws = delta_n * delta_k; auto gRange = sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); @@ -811,28 +813,28 @@ sycl::event _gemm_small_m_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - const size_t batch_nelems, - const size_t n, - const size_t k, - const size_t m, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, const BatchIndexerT &batch_indexer, const LhsIndexerT &lhs_indexer, const RhsIndexerT &rhs_indexer, const ResIndexerT &res_indexer, const std::vector &depends) { - constexpr size_t m_groups = 1; - const size_t delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + constexpr std::size_t m_groups = 1; + const std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); static_assert(std::is_same_v); static_assert(std::is_same_v); const sycl::device &dev = exec_q.get_device(); - const size_t 
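
`_gemm_k_impl` and `_gemm_small_m_impl` launch one work-group of `delta_n * delta_k` work-items per `(batch, n-block, m-block, k-block)` tile. The helper below (a hypothetical name, reproducing the quantities from the hunk) shows how the global range is sized:

```cpp
#include <cstddef>

std::size_t gemm_k_global_size(std::size_t batch_nelems,
                               std::size_t n, std::size_t k, std::size_t m,
                               std::size_t delta_n, std::size_t delta_k,
                               std::size_t n_wi, std::size_t m_groups)
{
    const std::size_t n_blocks = (n + delta_n - 1) / delta_n;
    const std::size_t m_blocks = (m + m_groups - 1) / m_groups;
    const std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k);
    const std::size_t lws = delta_n * delta_k; // local work-group size
    return batch_nelems * n_blocks * m_blocks * k_blocks * lws;
}
```
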
local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -840,11 +842,11 @@ sycl::event _gemm_small_m_impl(sycl::queue &exec_q, delta_n // modified by reference ); - size_t n_blocks = (n + delta_n - 1) / delta_n; - size_t m_blocks = (m + m_groups - 1) / m_groups; - size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); - size_t lws = delta_n * delta_k; + std::size_t lws = delta_n * delta_k; auto gRange = sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); @@ -897,11 +899,11 @@ class GemmBatchFunctorThreadNM_vecm resT *res = nullptr; LocAccT1 local_lhs_block; LocAccT2 local_rhs_block; - size_t batch_nelems; - size_t n = 0; - size_t k = 0; - size_t m = 0; - size_t n_groups = 0; + std::size_t batch_nelems; + std::size_t n = 0; + std::size_t k = 0; + std::size_t m = 0; + std::size_t n_groups = 0; std::uint32_t wg_delta_n = 0; std::uint32_t wg_delta_m = 0; std::uint32_t wi_delta_k = 0; @@ -917,14 +919,14 @@ class GemmBatchFunctorThreadNM_vecm resT *res_, LocAccT1 local_lhs_block_, LocAccT2 local_rhs_block_, - size_t batch_nelems_, - size_t n_, - size_t k_, - size_t m_, - size_t n_groups_, - size_t wg_delta_n_, - size_t wg_delta_m_, - size_t wi_delta_k_, + std::size_t batch_nelems_, + std::size_t n_, + std::size_t k_, + std::size_t m_, + std::size_t n_groups_, + std::size_t wg_delta_n_, + std::size_t wg_delta_m_, + std::size_t wi_delta_k_, const BatchDimsIndexerT &batch_indexer_, const LhsIndexerT &lhs_indexer_, const RhsIndexerT &rhs_indexer_, @@ -943,9 +945,9 @@ class GemmBatchFunctorThreadNM_vecm constexpr resT zero_(0); constexpr std::uint32_t wi_total_delta_m = wi_delta_m_vecs * m_vec_size; - const size_t gws_per_batch = it.get_group_range(0) / batch_nelems; - const size_t batch_id = it.get_group_linear_id() / gws_per_batch; - const size_t gr_id = + const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems; + const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch; + const std::size_t gr_id = it.get_group_linear_id() - batch_id * gws_per_batch; const auto &three_offsets_ = @@ -956,9 +958,9 @@ class GemmBatchFunctorThreadNM_vecm const auto &res_offset = three_offsets_.get_third_offset(); // 0 <= block_j < m_groups - const size_t block_j = gr_id / n_groups; + const std::size_t block_j = gr_id / n_groups; // 0 <= block_i < n_groups - const size_t block_i = gr_id - block_j * n_groups; + const std::size_t block_i = gr_id - block_j * n_groups; // Assumption: lws == wg_delta_n * wg_delta_m const std::uint32_t lid = it.get_local_linear_id(); @@ -968,27 +970,27 @@ class GemmBatchFunctorThreadNM_vecm const std::uint32_t local_i = lid - local_j * wg_delta_n; // Coordinates of the block of C the work-group works on - size_t i = block_i * wg_delta_n * wi_delta_n; - size_t j = block_j * wg_delta_m * wi_total_delta_m; + std::size_t i = block_i * wg_delta_n * wi_delta_n; + std::size_t j = block_j * wg_delta_m * wi_total_delta_m; using slmA_t = typename LocAccT1::value_type; using slmB_t = typename LocAccT2::value_type; - const size_t a_st0 = k; - const size_t a_st1 = 1; + const std::size_t a_st0 = k; + const std::size_t a_st1 = 1; - const size_t b_st0 = m; - const size_t b_st1 = 1; + const 
std::size_t b_st0 = m; + const std::size_t b_st1 = 1; - const size_t c_st0 = m; - const size_t c_st1 = 1; + const std::size_t c_st0 = m; + const std::size_t c_st1 = 1; // allocate/initialize private matrix C // size ( wi_total_delta_n, wi_total_delta_m ) constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; std::array private_C{slmB_t{zero_}}; - for (size_t s = 0; s < k; s += wi_delta_k) { + for (std::size_t s = 0; s < k; s += wi_delta_k) { // populate local_lhs_block ( wg_delta_n * wi_delta_n, // wi_delta_k) for (std::uint32_t vid = lid; vid < local_lhs_block.size(); @@ -999,8 +1001,8 @@ class GemmBatchFunctorThreadNM_vecm // 0 <= v_s < wi_delta_k const std::uint32_t v_s = vid - v_i * wi_delta_k; - const size_t g_i = i + v_i; - const size_t g_s = s + v_s; + const std::size_t g_i = i + v_i; + const std::size_t g_s = s + v_s; const std::uint32_t mapped_vid = wg_delta_n * wi_delta_n * v_s + v_i; @@ -1022,8 +1024,8 @@ class GemmBatchFunctorThreadNM_vecm // 0 <= v_s < wi_delta_k const std::uint32_t v_s = vid - v_j * wi_delta_k; - const size_t g_j = j + v_j * m_vec_size; - const size_t g_s = s + v_s; + const std::size_t g_j = j + v_j * m_vec_size; + const std::size_t g_s = s + v_s; const std::uint32_t mapped_vid = wg_delta_m * wi_delta_m_vecs * v_s + v_j; @@ -1041,7 +1043,7 @@ class GemmBatchFunctorThreadNM_vecm for (std::uint32_t lane_id = 0; lane_id < m_vec_size; ++lane_id) { - const size_t g_j1 = g_j + lane_id; + const std::size_t g_j1 = g_j + lane_id; vec[lane_id] = (g_j1 < m && g_s < k) ? static_cast( rhs[rhs_offset + @@ -1092,14 +1094,14 @@ class GemmBatchFunctorThreadNM_vecm if constexpr (m_vec_size == 1) { #pragma unroll for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { - size_t out_i = i + local_i + pr_i * wg_delta_n; + std::size_t out_i = i + local_i + pr_i * wg_delta_n; if (out_i < n) { #pragma unroll for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) { - const size_t out_j = + const std::size_t out_j = j + (local_j + pr_j * wg_delta_m) * m_vec_size; - const size_t out_flat_id = + const std::size_t out_flat_id = out_i * c_st0 + out_j * c_st1; if (out_j < m) { res[res_offset + res_indexer(out_flat_id)] = @@ -1112,18 +1114,18 @@ class GemmBatchFunctorThreadNM_vecm else { #pragma unroll for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { - size_t out_i = i + local_i + pr_i * wg_delta_n; + std::size_t out_i = i + local_i + pr_i * wg_delta_n; if (out_i < n) { // could be unrolled for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) { - size_t out_j = + std::size_t out_j = j + (local_j + pr_j * wg_delta_m) * m_vec_size; #pragma unroll for (std::uint32_t lane_id = 0; lane_id < m_vec_size; ++lane_id) { - const size_t out_flat_id = + const std::size_t out_flat_id = out_i * c_st0 + (out_j + lane_id) * c_st1; if (out_j + lane_id < m) { res[res_offset + res_indexer(out_flat_id)] = @@ -1219,13 +1221,13 @@ namespace gemm_detail template std::tuple -get_wg_delta_m_and_wi_delta_k(const size_t slm_byte_size, +get_wg_delta_m_and_wi_delta_k(const std::size_t slm_byte_size, const std::uint32_t wg_delta_n, const std::uint32_t suggested_wg_delta_m) { std::uint32_t wg_delta_m = suggested_wg_delta_m; - const size_t slm_max_rows = + const std::size_t slm_max_rows = slm_byte_size / ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); @@ -1237,7 +1239,7 @@ get_wg_delta_m_and_wi_delta_k(const size_t slm_byte_size, for (std::uint32_t it = 0; !wi_delta_k && (it < 4); ++it) { wg_delta_m /= 2; - const size_t slm_max_rows = + const std::size_t slm_max_rows = 
slm_byte_size / ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); @@ -1269,10 +1271,10 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - const size_t batch_nelems, - const size_t n, - const size_t k, - const size_t m, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, const BatchIndexerT &batch_indexer, const LhsIndexerT &lhs_indexer, const RhsIndexerT &rhs_indexer, @@ -1307,18 +1309,18 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, const std::uint32_t max_sg_size = krn.template get_info< sycl::info::kernel_device_specific::max_sub_group_size>(dev); - const size_t k_wg_sz = krn.template get_info< + const std::size_t k_wg_sz = krn.template get_info< sycl::info::kernel_device_specific::work_group_size>(dev); // Limit work-group size - constexpr size_t wg_sz_limit(2048); - const size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); + constexpr std::size_t wg_sz_limit(2048); + const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); const std::uint32_t max_subgroups_per_wg = static_cast(max_wg_sz / max_sg_size); - const size_t reserved_slm_byte_size = 512; - const size_t slm_byte_size = + const std::size_t reserved_slm_byte_size = 512; + const std::size_t slm_byte_size = dev.get_info(); const std::uint32_t wg_delta_n = max_sg_size; @@ -1332,12 +1334,12 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, const std::uint32_t lws = wg_delta_n * wg_delta_m; - const size_t n_groups = + const std::size_t n_groups = (n + wg_delta_n * wi_delta_n - 1) / (wg_delta_n * wi_delta_n); - const size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) / - (wg_delta_m * wi_total_delta_m); + const std::size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) / + (wg_delta_m * wi_total_delta_m); - const size_t gws = lws * batch_nelems * n_groups * m_groups; + const std::size_t gws = lws * batch_nelems * n_groups * m_groups; sycl::range<1> lRange(lws); sycl::range<1> gRange(gws); @@ -1379,9 +1381,9 @@ typedef sycl::event (*gemm_impl_fn_ptr_t)( const char *, // lhs const char *, // rhs char *, // res - size_t, // lhs_outer_nelems (n) - size_t, // inner_nelems (k) - size_t, // rhs_outer_nelems (m) + std::size_t, // lhs_outer_nelems (n) + std::size_t, // inner_nelems (k) + std::size_t, // rhs_outer_nelems (m) int, // inner nd int, // lhs outer nd const ssize_t *, // lhs shape and strides @@ -1396,9 +1398,9 @@ sycl::event gemm_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, int inner_nd, int lhs_outer_nd, const ssize_t *lhs_shape_strides, @@ -1422,10 +1424,10 @@ sycl::event gemm_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_detail::_gemm_batch_nm_impl< @@ -1481,9 +1483,9 @@ typedef sycl::event (*gemm_contig_impl_fn_ptr_t)( const char *, // lhs const char *, // rhs char *, // res - size_t, // n - size_t, // k - size_t, // m + std::size_t, // n + std::size_t, // k + std::size_t, // m std::vector const &); template @@ -1491,9 +1493,9 
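
`_gemm_batch_nm_impl` caps the work-group size by clamping the kernel's device limit to 2048 work-items and converting the result into a sub-group count. As host-side arithmetic only (the device and kernel queries are replaced by plain parameters in this sketch):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

std::uint32_t max_subgroups_per_wg(std::size_t kernel_wg_size,
                                   std::uint32_t max_sg_size)
{
    constexpr std::size_t wg_sz_limit = 2048; // limit from the hunk
    const std::size_t max_wg_sz = std::min(wg_sz_limit, kernel_wg_size);
    return static_cast<std::uint32_t>(max_wg_sz / max_sg_size);
}
```
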
@@ sycl::event gemm_contig_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -1508,10 +1510,10 @@ sycl::event gemm_contig_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT, @@ -1560,10 +1562,10 @@ typedef sycl::event (*gemm_batch_impl_fn_ptr_t)( const char *, // lhs const char *, // rhs char *, // res - size_t, // batch nelems - size_t, // lhs outer nelems (n) - size_t, // inner nelems (k) - size_t, // rhs outer nelems (m) + std::size_t, // batch nelems + std::size_t, // lhs outer nelems (n) + std::size_t, // inner nelems (k) + std::size_t, // rhs outer nelems (m) int, // batching nd const ssize_t *, // batch shape strides ssize_t, // lhs batch offset @@ -1584,10 +1586,10 @@ sycl::event gemm_batch_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, int batch_nd, const ssize_t *batch_shape_strides, ssize_t lhs_batch_offset, @@ -1620,8 +1622,8 @@ sycl::event gemm_batch_impl(sycl::queue &exec_q, rhs_batch_offset, res_batch_offset, batch_shape_strides); - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_detail::_gemm_batch_nm_impl< @@ -1680,10 +1682,10 @@ typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)( const char *, // lhs const char *, // rhs char *, // res - size_t, // batch nelems - size_t, // n - size_t, // k - size_t, // m + std::size_t, // batch nelems + std::size_t, // n + std::size_t, // k + std::size_t, // m ssize_t, // lhs batch offset ssize_t, // rhs batch offset ssize_t, // res batch offset @@ -1694,10 +1696,10 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, ssize_t lhs_batch_offset, ssize_t rhs_batch_offset, ssize_t res_batch_offset, @@ -1728,8 +1730,8 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, Strided1DIndexer{/* size */ batch_nelems, /* step */ n * m}); - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_detail::_gemm_batch_nm_impl< @@ -1792,15 +1794,15 @@ class GemmBatchNoAtomicFunctorThreadNM resT *res = nullptr; LocAccT1 local_A_block; LocAccT2 local_B_block; - size_t n = 0; - size_t wg_delta_n = 0; - size_t k = 0; - size_t k_blocks = 0; - size_t wi_delta_k = 0; - size_t m = 0; - size_t m_blocks = 0; - size_t wg_delta_m = 0; - 
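
`gemm_impl`, `gemm_contig_impl` and `gemm_batch_impl` all use the same dispatch test between the tiled NM kernel and the thread-K kernel. Reproduced as a standalone predicate, it selects the NM path when the output matrix has roughly 64 * 1024 elements or more, written as a division so `n * m` is never formed and cannot overflow:

```cpp
#include <algorithm>
#include <cstddef>

bool prefer_nm_kernel(std::size_t n, std::size_t m)
{
    const std::size_t min_nm = std::min(n, m);
    const std::size_t max_nm = std::max(n, m);
    return min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm));
}
```
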
size_t batch_nelems; + std::size_t n = 0; + std::size_t wg_delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t wi_delta_k = 0; + std::size_t m = 0; + std::size_t m_blocks = 0; + std::size_t wg_delta_m = 0; + std::size_t batch_nelems; const BatchDimsIndexerT batch_indexer; const OuterInnerDimsIndexerT lhs_indexer; const OuterInnerDimsIndexerT rhs_indexer; @@ -1812,15 +1814,15 @@ class GemmBatchNoAtomicFunctorThreadNM resT *res_, LocAccT1 local_A_block_, LocAccT2 local_B_block_, - size_t n_, - size_t wg_delta_n_, - size_t k_, - size_t k_blocks_, - size_t wi_delta_k_, - size_t m_, - size_t m_blocks_, - size_t wg_delta_m_, - size_t batch_nelems_, + std::size_t n_, + std::size_t wg_delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t wi_delta_k_, + std::size_t m_, + std::size_t m_blocks_, + std::size_t wg_delta_m_, + std::size_t batch_nelems_, const BatchDimsIndexerT batch_indexer_, const OuterInnerDimsIndexerT lhs_indexer_, const OuterInnerDimsIndexerT rhs_indexer_, @@ -1837,9 +1839,10 @@ class GemmBatchNoAtomicFunctorThreadNM void operator()(sycl::nd_item<1> it) const { - const size_t n_groups_per_batch = it.get_group_range(0) / batch_nelems; - const size_t m_id = it.get_group_linear_id() / n_groups_per_batch; - const size_t gr_id = + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = it.get_group_linear_id() - m_id * n_groups_per_batch; const auto &three_offsets_ = batch_indexer(static_cast(m_id)); @@ -1852,20 +1855,21 @@ class GemmBatchNoAtomicFunctorThreadNM const auto &rhs_offset = three_offsets_.get_second_offset(); const auto &res_offset = three_offsets_.get_third_offset(); - size_t block_i = gr_id / (m_blocks * k_blocks); - size_t block_r = gr_id - block_i * (m_blocks * k_blocks); - size_t block_j = block_r / k_blocks; - size_t block_s = block_r - block_j * k_blocks; + std::size_t block_i = gr_id / (m_blocks * k_blocks); + std::size_t block_r = gr_id - block_i * (m_blocks * k_blocks); + std::size_t block_j = block_r / k_blocks; + std::size_t block_s = block_r - block_j * k_blocks; - size_t lid = it.get_local_linear_id(); - size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n - size_t local_j = lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m + std::size_t lid = it.get_local_linear_id(); + std::size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n + std::size_t local_j = + lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m // load A block and B blocks into SLM - size_t i = block_i * wi_delta_n * wg_delta_n; - size_t j = block_j * wi_delta_m * wg_delta_m; - size_t s = block_s * wi_delta_k; + std::size_t i = block_i * wi_delta_n * wg_delta_n; + std::size_t j = block_j * wi_delta_m * wg_delta_m; + std::size_t s = block_s * wi_delta_k; const std::int64_t a_st0 = k; const std::int64_t a_st1 = 1; @@ -1876,14 +1880,15 @@ class GemmBatchNoAtomicFunctorThreadNM const std::int64_t c_st0 = m; const std::int64_t c_st1 = 1; - size_t lws = it.get_local_range(0); + std::size_t lws = it.get_local_range(0); - for (size_t vid = lid; vid < local_A_block.size(); vid += lws) { - size_t v_i = vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n - size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < wi_delta_k + for (std::size_t vid = lid; vid < local_A_block.size(); vid += lws) { + std::size_t v_i = + vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n + std::size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < 
wi_delta_k - size_t g_i = i + v_i; - size_t g_s = s + v_s; + std::size_t g_i = i + v_i; + std::size_t g_s = s + v_s; local_A_block[vid] = (g_i < n && g_s < k) @@ -1895,12 +1900,12 @@ class GemmBatchNoAtomicFunctorThreadNM using slmB_t = typename LocAccT2::value_type; - for (size_t vid = lid; vid < local_B_block.size(); vid += lws) { - size_t v_j = vid / wi_delta_k; // 0<= v_i < wg_delta_m - size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k + for (std::size_t vid = lid; vid < local_B_block.size(); vid += lws) { + std::size_t v_j = vid / wi_delta_k; // 0<= v_i < wg_delta_m + std::size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k - size_t g_j = j + v_j * wi_delta_m; - size_t g_s = s + v_s; + std::size_t g_j = j + v_j * wi_delta_m; + std::size_t g_s = s + v_s; if constexpr (wi_delta_m == 1 && std::is_same_v) { local_B_block[vid] = @@ -1915,7 +1920,7 @@ class GemmBatchNoAtomicFunctorThreadNM #pragma unroll for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id) { - size_t g_j1 = g_j + lane_id; + std::size_t g_j1 = g_j + lane_id; vec[lane_id] = (g_j1 < m && g_s < k) ? static_cast( @@ -1933,25 +1938,26 @@ class GemmBatchNoAtomicFunctorThreadNM i += local_i * wi_delta_n; j += local_j * wi_delta_m; - const size_t a_offset = local_i * wi_delta_k * wi_delta_n; - const size_t b_offset = local_j * wi_delta_k; + const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n; + const std::size_t b_offset = local_j * wi_delta_k; constexpr resT identity_(0); for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) { - const size_t a_pr_offset = private_i * wi_delta_k; + const std::size_t a_pr_offset = private_i * wi_delta_k; slmB_t local_sum(identity_); - for (size_t private_s = 0; private_s < wi_delta_k; ++private_s) { + for (std::size_t private_s = 0; private_s < wi_delta_k; ++private_s) + { local_sum = local_sum + (local_A_block[a_offset + a_pr_offset + private_s] * local_B_block[b_offset + private_s]); } - const size_t gl_i = i + private_i; + const std::size_t gl_i = i + private_i; if constexpr (wi_delta_m == 1 && std::is_same_v) { - const size_t gl_j = j; + const std::size_t gl_j = j; if (gl_i < n && gl_j < m) { res[res_offset + res_indexer(gl_i * c_st0 + gl_j * c_st1) + (block_s * n * m * batch_nelems)] = local_sum; @@ -1961,7 +1967,7 @@ class GemmBatchNoAtomicFunctorThreadNM #pragma unroll for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id) { - const size_t gl_j = j + lane_id; + const std::size_t gl_j = j + lane_id; if (gl_i < n && gl_j < m) { res[res_offset + @@ -1982,7 +1988,7 @@ template + std::size_t m_groups> class GemmBatchNoAtomicFunctorThreadK { private: @@ -1991,15 +1997,15 @@ class GemmBatchNoAtomicFunctorThreadK resT *res = nullptr; LocAccT workspace; LocAccT local_B_block; - size_t n = 0; - size_t n_blocks = 0; - size_t delta_n = 0; - size_t k = 0; - size_t k_blocks = 0; - size_t delta_k = 0; - size_t n_wi = 0; - size_t m = 0; - size_t batch_nelems = 0; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; const BatchDimsIndexerT batch_indexer; const OuterInnerDimsIndexerT lhs_indexer; const OuterInnerDimsIndexerT rhs_indexer; @@ -2011,15 +2017,15 @@ class GemmBatchNoAtomicFunctorThreadK resT *res_, LocAccT workspace_, LocAccT local_B_block_, - size_t n_, - size_t n_blocks_, - size_t delta_n_, - size_t k_, - size_t k_blocks_, - size_t delta_k_, - size_t n_wi_, - 
size_t m_, - size_t batch_nelems_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, const BatchDimsIndexerT &batch_indexer_, const OuterInnerDimsIndexerT &lhs_indexer_, const OuterInnerDimsIndexerT &rhs_indexer_, @@ -2035,11 +2041,12 @@ class GemmBatchNoAtomicFunctorThreadK void operator()(sycl::nd_item<1> it) const { - const size_t n_groups_per_batch = it.get_group_range(0) / batch_nelems; - const size_t m_id = it.get_group_linear_id() / n_groups_per_batch; - const size_t gr_id = + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = it.get_group_linear_id() - m_id * n_groups_per_batch; - size_t lid = it.get_local_linear_id(); + std::size_t lid = it.get_local_linear_id(); const auto &three_offsets_ = batch_indexer(static_cast(m_id)); const auto &lhs_offset = three_offsets_.get_first_offset(); @@ -2049,30 +2056,31 @@ class GemmBatchNoAtomicFunctorThreadK // lift gr_id -> (block_i, block_j, block_s) // block_i moves fastest, then block_s, then block_j - const size_t r_size = (n_blocks * k_blocks); + const std::size_t r_size = (n_blocks * k_blocks); // 0 <= block_j < m_blocks - size_t block_j = gr_id / r_size; + std::size_t block_j = gr_id / r_size; // 0 <= block_r < n_blocks * k_blocks - size_t block_r = gr_id - block_j * r_size; + std::size_t block_r = gr_id - block_j * r_size; // 0 <= block_s < k_blocks - size_t block_s = block_r / n_blocks; + std::size_t block_s = block_r / n_blocks; // 0 <= block_i < n_blocks - size_t block_i = block_r - block_s * n_blocks; + std::size_t block_i = block_r - block_s * n_blocks; - size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n - size_t local_s = lid - local_i * (delta_k); // 0 <= local_s < delta_k + std::size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n + std::size_t local_s = + lid - local_i * (delta_k); // 0 <= local_s < delta_k - size_t i = block_i * delta_n + local_i; - size_t j = m_groups * block_j; - size_t s = block_s * delta_k * n_wi + local_s; + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; using accV_t = typename LocAccT::value_type; constexpr resT identity_ = resT(0); if (local_i == 0) { - for (size_t q = 0; q < n_wi * delta_k; q += delta_k) { - size_t sq = s + q; - size_t sqmj = sq * m + j; + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + std::size_t sq = s + q; + std::size_t sqmj = sq * m + j; if constexpr (m_groups == 1 && std::is_same_v) { local_B_block[local_s + q] = @@ -2084,7 +2092,8 @@ class GemmBatchNoAtomicFunctorThreadK else { accV_t local_B_vec; #pragma unroll - for (size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx) { + for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx) + { local_B_vec[vec_idx] = (sq < k && j + vec_idx < m) ? 
static_cast( @@ -2099,12 +2108,12 @@ class GemmBatchNoAtomicFunctorThreadK it.barrier(sycl::access::fence_space::local_space); - size_t t_shift = block_s * delta_k * n_wi; - size_t global_s_offset = i * k + t_shift; + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; accV_t private_sum(identity_); constexpr accV_t vec_identity_(identity_); - for (size_t t = local_s; t < local_B_block.size(); t += delta_k) { + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { private_sum += ((i < n) && (t + t_shift < k)) ? (static_cast( @@ -2113,18 +2122,18 @@ class GemmBatchNoAtomicFunctorThreadK : vec_identity_; } - size_t workspace_i_shift = local_i * delta_k; + std::size_t workspace_i_shift = local_i * delta_k; workspace[workspace_i_shift + local_s] = private_sum; it.barrier(sycl::access::fence_space::local_space); if (local_s == 0 && i < n) { accV_t local_sum(workspace[workspace_i_shift]); - for (size_t t = 1; t < delta_k; ++t) { + for (std::size_t t = 1; t < delta_k; ++t) { local_sum += workspace[workspace_i_shift + t]; } - const size_t total_offset = + const std::size_t total_offset = res_offset + (block_s * n * m * batch_nelems); if constexpr (m_groups == 1 && std::is_same_v) { @@ -2134,7 +2143,7 @@ class GemmBatchNoAtomicFunctorThreadK res[total_offset + res_indexer(i * m + j)] = local_sum[0]; #pragma unroll - for (size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { if (j + vec_id < m) { res[total_offset + res_indexer(i * m + j + vec_id)] = local_sum[vec_id]; @@ -2151,7 +2160,7 @@ template + std::size_t> class gemm_batch_tree_k_krn; template + std::size_t> class gemm_batch_tree_nm_krn; namespace gemm_detail @@ -2178,13 +2187,13 @@ sycl::event _gemm_tree_k_step(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - const size_t batch_nelems, - const size_t n, - const size_t k, - const size_t m, - const size_t delta_n, - const size_t n_wi, - const size_t delta_k, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const std::size_t delta_n, + const std::size_t n_wi, + const std::size_t delta_k, const BatchIndexerT &batch_indexer, const LhsIndexerT &lhs_indexer, const RhsIndexerT &rhs_indexer, @@ -2196,12 +2205,14 @@ sycl::event _gemm_tree_k_step(sycl::queue &exec_q, sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - const size_t n_blocks = (n + delta_n - 1) / delta_n; - const size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); - const size_t m_blocks = (m + m_groups - 1) / m_groups; + const std::size_t n_blocks = (n + delta_n - 1) / delta_n; + const std::size_t k_blocks = + (k + n_wi * delta_k - 1) / (n_wi * delta_k); + const std::size_t m_blocks = (m + m_groups - 1) / m_groups; - const size_t lws = delta_n * delta_k; - const size_t gws = batch_nelems * n_blocks * m_blocks * k_blocks * lws; + const std::size_t lws = delta_n * delta_k; + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; auto gRange = sycl::range<1>(gws); auto lRange = sycl::range<1>(lws); @@ -2243,10 +2254,10 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, int batch_nd, const ssize_t *batch_shape_strides, ssize_t lhs_batch_offset, @@ -2262,14 +2273,14 @@ 
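// Illustrative sketch (not part of this patch): how _gemm_tree_k_step above
// sizes its nd-range. Block counts along n, k and m are ceiling divisions by
// the per-group tile sizes, the local range is delta_n * delta_k work-items,
// and the global range launches one work-group per
// (batch, n-block, m-block, k-block) combination. Variable names mirror the
// patch; the standalone helper and struct are hypothetical.
#include <cstddef>

struct TreeKLaunch
{
    std::size_t lws; // work-items per work-group
    std::size_t gws; // total work-items
};

inline std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

inline TreeKLaunch tree_k_launch_sizes(std::size_t batch_nelems,
                                       std::size_t n, std::size_t k, std::size_t m,
                                       std::size_t delta_n, std::size_t n_wi,
                                       std::size_t delta_k, std::size_t m_groups)
{
    const std::size_t n_blocks = ceil_div(n, delta_n);
    const std::size_t k_blocks = ceil_div(k, n_wi * delta_k);
    const std::size_t m_blocks = ceil_div(m, m_groups);

    const std::size_t lws = delta_n * delta_k;
    const std::size_t gws = batch_nelems * n_blocks * m_blocks * k_blocks * lws;
    return TreeKLaunch{lws, gws};
}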
gemm_batch_tree_k_impl(sycl::queue &exec_q, const ssize_t *res_shape_strides, std::vector const &depends) { - size_t delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -2307,26 +2318,27 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = batch_nelems * n * m; - size_t reduction_nelems = (k + delta_k * n_wi - 1) / (delta_k * n_wi); + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); // more than one work-group is needed, requires a // temporary delta_k * n_wi elements processed along k, // so if more to process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 4; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); // max_max_wg prevents running out of resources on CPU - constexpr size_t max_max_wg = 2048; - size_t max_wg = std::min( + constexpr std::size_t max_max_wg = 2048; + std::size_t max_wg = std::min( max_max_wg, dev.get_info() / 2); @@ -2474,10 +2486,10 @@ sycl::event _gemm_tree_nm_step(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - const size_t batch_nelems, - const size_t n, - const size_t k, - const size_t m, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, const std::uint32_t wg_delta_n, const std::uint32_t wg_delta_m, const std::uint32_t wi_delta_k, @@ -2492,15 +2504,16 @@ sycl::event _gemm_tree_nm_step(sycl::queue &exec_q, sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - const size_t lws = wg_delta_n * wg_delta_m; + const std::size_t lws = wg_delta_n * wg_delta_m; - const size_t n_blocks = + const std::size_t n_blocks = ((n + wi_delta_n * wg_delta_n - 1) / (wi_delta_n * wg_delta_n)); - const size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k); - const size_t m_blocks = + const std::size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k); + const std::size_t m_blocks = ((m + wi_delta_m * wg_delta_m - 1) / (wi_delta_m * wg_delta_m)); - const size_t gws = batch_nelems * n_blocks * m_blocks * k_blocks * lws; + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; auto gwsRange = sycl::range<1>(gws); auto lwsRange = sycl::range<1>(lws); @@ -2543,10 +2556,10 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, int batch_nd, const ssize_t *batch_shape_strides, ssize_t lhs_batch_offset, @@ -2563,14 +2576,14 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, 
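// Illustrative sketch (not part of this patch): the follow-up reduction sizing
// used in gemm_batch_tree_k_impl above. Each k-block contributes one partial
// product per output element, so ceil(k / (delta_k * n_wi)) partials must be
// summed, and they are folded by work-groups of size wg with a preferred
// number of reductions per work-item. The free function is hypothetical.
#include <cstddef>

inline std::size_t partial_reduction_groups(std::size_t k,
                                            std::size_t delta_k, std::size_t n_wi,
                                            std::size_t wg, std::size_t reductions_per_wi)
{
    // partial products per output element produced by the k-blocks
    const std::size_t reduction_nelems = (k + delta_k * n_wi - 1) / (delta_k * n_wi);
    // work-groups needed to fold those partials
    return (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg);
}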
std::vector const &depends) { constexpr int wi_delta_n = 2; - size_t wg_delta_n(16); // rows of A processed in WG - size_t wg_delta_m(16); // rows of B processed in WG - size_t wi_delta_k(64); // Elements in K dimension processed by WI + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_nm_parameters( local_mem_size, reserved_slm_size, wi_delta_n, @@ -2611,24 +2624,24 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, sycl::plus>::type; constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = batch_nelems * n * m; - size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; // more than one work-group is needed, requires a temporary // delta_k * n_wi elements processed along k, so if more to // process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 4; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -2768,10 +2781,10 @@ sycl::event gemm_batch_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, int batch_nd, const ssize_t *batch_shape_strides, ssize_t lhs_batch_offset, @@ -2819,10 +2832,10 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, int batch_nd, const ssize_t *batch_shape_strides, ssize_t lhs_batch_offset, @@ -2842,8 +2855,8 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); resTy *res_tp = reinterpret_cast(res_cp); - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_batch_nm_impl( @@ -2936,26 +2949,26 @@ sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, } } -template +template sycl::event gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends) { - size_t 
delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -2999,24 +3012,25 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = batch_nelems * n * m; - size_t reduction_nelems = (k + delta_k * n_wi - 1) / (delta_k * n_wi); + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); // more than one work-group is needed, requires a // temporary delta_k * n_wi elements processed along k, // so if more to process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 4; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -3140,21 +3154,21 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends) { constexpr int wi_delta_n = 2; - size_t wg_delta_n(16); // rows of A processed in WG - size_t wg_delta_m(16); // rows of B processed in WG - size_t wi_delta_k(64); // Elements in K dimension processed by WI + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_nm_parameters( local_mem_size, reserved_slm_size, wi_delta_n, @@ -3200,24 +3214,24 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, sycl::plus>::type; constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = batch_nelems * n * m; - size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; // more than one work-group is needed, requires a temporary // delta_k * n_wi elements processed along k, so if more to // process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 4; - size_t 
reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -3343,9 +3357,9 @@ sycl::event gemm_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, int inner_nd, int lhs_outer_nd, const ssize_t *lhs_shape_strides, @@ -3367,7 +3381,7 @@ sycl::event gemm_nm_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchDimsIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3384,10 +3398,10 @@ gemm_batch_nm_contig_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends = {}) { using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3395,7 +3409,7 @@ gemm_batch_nm_contig_impl(sycl::queue &exec_q, constexpr OuterInnerDimsIndexerT rhs_indexer{}; constexpr OuterInnerDimsIndexerT res_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; if (batch_nelems == single_batch_nelems) { using BatchDimsIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; @@ -3442,10 +3456,10 @@ gemm_batch_contig_tree_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, ssize_t lhs_batch_offset, ssize_t rhs_batch_offset, ssize_t res_batch_offset, @@ -3457,8 +3471,8 @@ gemm_batch_contig_tree_impl(sycl::queue &exec_q, reinterpret_cast(rhs_cp) + rhs_batch_offset; resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_batch_nm_contig_impl( @@ -3513,7 +3527,7 @@ template + std::size_t> class gemm_tree_nm_krn; template + std::size_t> class gemm_tree_k_krn; -template +template sycl::event gemm_tree_k_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, int inner_nd, int lhs_outer_nd, const ssize_t *lhs_outer_inner_shapes_strides, @@ -3541,14 +3555,14 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, const ssize_t *res_shapes_strides, const std::vector &depends) { - size_t delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = 
dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -3559,7 +3573,7 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3586,24 +3600,25 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = n * m; - size_t reduction_nelems = (k + delta_k * n_wi - 1) / (delta_k * n_wi); + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); // more than one work-groups is needed, requires a temporary // delta_k * n_wi elements processed along k, so if more to // process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -3694,9 +3709,9 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, int inner_nd, int lhs_outer_nd, const ssize_t *lhs_outer_inner_shapes_strides, @@ -3707,14 +3722,14 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, const std::vector &depends) { constexpr int wi_delta_n = 2; - size_t wg_delta_n(16); // rows of A processed in WG - size_t wg_delta_m(16); // rows of B processed in WG - size_t wi_delta_k(64); // Elements in K dimension processed by WI + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_nm_parameters( local_mem_size, reserved_slm_size, wi_delta_n, @@ -3726,7 +3741,7 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, @@ -3754,24 +3769,24 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, constexpr resTy identity_val = 
sycl::known_identity::value; - size_t iter_nelems = n * m; - size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; // more than one work-groups is needed, requires a temporary // wi_delta_k elements processed along k, so if more to // process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -3864,9 +3879,9 @@ sycl::event gemm_tree_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, int inner_nd, int lhs_outer_nd, const ssize_t *lhs_outer_inner_shapes_strides, @@ -3880,8 +3895,8 @@ sycl::event gemm_tree_impl(sycl::queue &exec_q, const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); resTy *res_tp = reinterpret_cast(res_cp); - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { return gemm_nm_impl( @@ -3954,24 +3969,24 @@ sycl::event gemm_tree_impl(sycl::queue &exec_q, } } -template +template sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends) { - size_t delta_k(4); - size_t n_wi(64); - size_t delta_n(32); + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_k_parameters( local_mem_size, reserved_slm_size, delta_k, @@ -3987,7 +4002,7 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; sycl::event gemm_ev; if (k <= (delta_k * n_wi)) { @@ -4006,24 +4021,25 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = n * m; - size_t reduction_nelems = (k + delta_k * n_wi - 1) / (delta_k * n_wi); + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); // more than one work-groups is needed, requires a // temporary delta_k * n_wi elements processed along k, // so if more to process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = 
choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -4110,20 +4126,20 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, const lhsTy *lhs_tp, const rhsTy *rhs_tp, resTy *res_tp, - size_t n, - size_t k, - size_t m, + std::size_t n, + std::size_t k, + std::size_t m, std::vector const &depends) { constexpr int wi_delta_n = 2; - size_t wg_delta_n(16); // rows of A processed in WG - size_t wg_delta_m(16); // rows of B processed in WG - size_t wi_delta_k(64); // Elements in K dimension processed by WI + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI const sycl::device &dev = exec_q.get_device(); - const size_t local_mem_size = + const std::size_t local_mem_size = dev.get_info(); - const size_t reserved_slm_size = 512; + const std::size_t reserved_slm_size = 512; gemm_detail::scale_gemm_nm_parameters( local_mem_size, reserved_slm_size, wi_delta_n, @@ -4140,7 +4156,7 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; constexpr BatchIndexerT batch_indexer{}; - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; // each group processes delta_k items in a column, // so no need to allocate temp memory if one group needed @@ -4161,24 +4177,24 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, constexpr resTy identity_val = sycl::known_identity::value; - size_t iter_nelems = n * m; - size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; // more than one work-groups is needed, requires a temporary // wi_delta_k elements processed along k, so if more to // process use multiple const auto &sg_sizes = dev.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi(preferred_reductions_per_wi); + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - size_t max_wg = reduction_detail::get_work_group_size(dev); + std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { resTy *tmp = sycl::malloc_device( @@ -4265,20 +4281,20 @@ sycl::event gemm_contig_tree_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - size_t n, - size_t k, - size_t m, + std::size_t n, + 
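// Illustrative sketch (not part of this patch): the choice the *_tree_* impls
// above make between a single reduction pass and a multi-pass reduction
// through temporary buffers. When all partials fit into
// preferred_reductions_per_wi * max_wg, one work-group per output element is
// enough and each work-item takes ceil(reduction_nelems / wg) values;
// otherwise partial results go to a temporary and are reduced again. The
// work-group-size adjustment done in the patch is omitted; the enum and helper
// are hypothetical.
#include <algorithm>
#include <cstddef>

enum class ReductionPlan { single_pass, multi_pass_with_temporaries };

inline ReductionPlan plan_reduction(std::size_t reduction_nelems,
                                    std::size_t preferred_reductions_per_wi,
                                    std::size_t wg,
                                    std::size_t max_wg,
                                    std::size_t &reductions_per_wi)
{
    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
        reductions_per_wi =
            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
        return ReductionPlan::single_pass;
    }
    reductions_per_wi = preferred_reductions_per_wi;
    return ReductionPlan::multi_pass_with_temporaries;
}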
std::size_t k, + std::size_t m, std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); resTy *res_tp = reinterpret_cast(res_cp); - const size_t min_nm = std::min(n, m); - const size_t max_nm = std::max(n, m); + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { - constexpr size_t single_batch_nelems = 1; + constexpr std::size_t single_batch_nelems = 1; return gemm_batch_nm_contig_impl( exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, depends); diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 905b22795e..a262a3a149 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -53,7 +53,7 @@ namespace su_ns = dpctl::tensor::sycl_utils; namespace reduction_detail { -inline size_t get_work_group_size(const sycl::device &d) +inline std::size_t get_work_group_size(const sycl::device &d) { // prevents running out of resources on CPU return std::min( @@ -93,7 +93,7 @@ struct SequentialReduction outT identity_; const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; + std::size_t reduction_max_gid_ = 0; public: SequentialReduction(const argT *inp, @@ -102,7 +102,7 @@ struct SequentialReduction const outT &identity_val, const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size) + std::size_t reduction_size) : inp_(inp), out_(res), reduction_op_(reduction_op), identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), inp_reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -120,7 +120,7 @@ struct SequentialReduction inp_out_iter_offsets_.get_second_offset(); outT red_val(identity_); - for (size_t m = 0; m < reduction_max_gid_; ++m) { + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; @@ -162,9 +162,9 @@ struct ReductionOverGroupWithAtomicFunctor const outT identity_; const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: ReductionOverGroupWithAtomicFunctor( @@ -174,9 +174,9 @@ struct ReductionOverGroupWithAtomicFunctor const outT &identity_val, const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), out_(res), reduction_op_(reduction_op), identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), inp_reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -187,11 +187,12 @@ struct ReductionOverGroupWithAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / 
iter_gws_; - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -203,12 +204,12 @@ struct ReductionOverGroupWithAtomicFunctor const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT local_red_val(identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto inp_reduction_offset = @@ -295,9 +296,9 @@ struct CustomReductionOverGroupWithAtomicFunctor const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: CustomReductionOverGroupWithAtomicFunctor( @@ -308,9 +309,9 @@ struct CustomReductionOverGroupWithAtomicFunctor const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), out_(res), reduction_op_(reduction_op), identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), inp_reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -321,11 +322,12 @@ struct CustomReductionOverGroupWithAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -337,12 +339,12 @@ struct CustomReductionOverGroupWithAtomicFunctor const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT local_red_val(identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( + std::size_t arg_reduce_gid_max = std::min( reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - for (size_t arg_reduce_gid = arg_reduce_gid0; + for (std::size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { auto inp_reduction_offset = @@ -418,9 +420,9 @@ struct ReductionOverGroupNoAtomicFunctor const outT identity_; const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; - size_t 
reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: ReductionOverGroupNoAtomicFunctor( @@ -430,9 +432,9 @@ struct ReductionOverGroupNoAtomicFunctor const outT &identity_val, const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), out_(res), reduction_op_(reduction_op), identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), inp_reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -443,12 +445,14 @@ struct ReductionOverGroupNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -460,10 +464,10 @@ struct ReductionOverGroupNoAtomicFunctor const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT local_red_val(identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; if (arg_reduce_gid < reduction_max_gid_) { auto inp_reduction_offset = @@ -526,9 +530,9 @@ struct CustomReductionOverGroupNoAtomicFunctor const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: CustomReductionOverGroupNoAtomicFunctor( @@ -539,9 +543,9 @@ struct CustomReductionOverGroupNoAtomicFunctor const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), out_(res), reduction_op_(reduction_op), identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), inp_reduced_dims_indexer_(arg_reduced_dims_indexer), @@ -552,12 +556,14 @@ struct CustomReductionOverGroupNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= 
reduction_lid < wg - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -569,10 +575,10 @@ struct CustomReductionOverGroupNoAtomicFunctor const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT local_red_val(identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; if (arg_reduce_gid < reduction_max_gid_) { auto inp_reduction_offset = @@ -623,8 +629,8 @@ sequential_reduction(sycl::queue &exec_q, const argTy *arg, resTy *res, resTy identity_val, - size_t iter_nelems, - size_t reduction_nelems, + std::size_t iter_nelems, + std::size_t reduction_nelems, const InputOutputIterIndexerT &in_out_iter_indexer, const ReductionIndexerT &reduction_indexer, const std::vector &depends) @@ -662,11 +668,11 @@ submit_atomic_reduction(sycl::queue &exec_q, const argTy *arg, resTy *res, resTy identity_val, - size_t wg, - size_t iter_nelems, - size_t reduction_nelems, - size_t reductions_per_wi, - size_t reduction_groups, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, const InputOutputIterIndexerT &in_out_iter_indexer, const ReductionIndexerT &reduction_indexer, const std::vector &depends) @@ -725,8 +731,8 @@ class reduction_over_group_with_atomics_krn; typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, int, @@ -743,10 +749,10 @@ using dpctl::tensor::sycl_utils::choose_workgroup_size; template sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
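// Illustrative sketch (not part of this patch): the index mapping used by the
// ReductionOverGroup* functors above. The 1D launch packs iter_gws_
// independent reductions times several "reduction batches" per reduction; a
// work-item then strides over its slice of the reduced axis with step wg.
// Only the arithmetic is modeled; the sycl::nd_item is replaced by explicit
// ids and the function itself is hypothetical.
#include <algorithm>
#include <cstddef>

inline void reduction_work_item_range(std::size_t group_id,
                                      std::size_t local_id,
                                      std::size_t wg,                // work-group size
                                      std::size_t iter_gws_,         // number of independent reductions
                                      std::size_t reductions_per_wi,
                                      std::size_t reduction_max_gid_, // length of the reduced axis
                                      std::size_t &iter_gid,
                                      std::size_t &first_gid,
                                      std::size_t &last_gid)
{
    iter_gid = group_id % iter_gws_;                              // which output element
    const std::size_t reduction_batch_id = group_id / iter_gws_;  // which chunk of the reduced axis

    first_gid = local_id + reduction_batch_id * wg * reductions_per_wi;
    last_gid = std::min(reduction_max_gid_, first_gid + reductions_per_wi * wg);
    // the functor then loops: for (gid = first_gid; gid < last_gid; gid += wg)
}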
number of columns) const char *arg_cp, char *res_cp, int iter_nd, @@ -765,7 +771,7 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputOutputIterIndexerT = @@ -818,13 +824,13 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi = + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) : preferred_reductions_per_wi; - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); @@ -844,8 +850,8 @@ sycl::event reduction_over_group_with_atomics_strided_impl( typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, ssize_t, @@ -857,10 +863,10 @@ typedef sycl::event (*reduction_contig_impl_fn_ptr)( template sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -876,7 +882,7 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -920,13 +926,13 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( result_indexer}; constexpr ReductionIndexerT reduction_indexer{}; - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi = + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) : preferred_reductions_per_wi; - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); @@ -946,10 +952,10 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( template sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of cols in a matrix - // when reducing over cols) - size_t reduction_nelems, // size of each reduction (length of cols, i.e. - // number of rows) + std::size_t iter_nelems, // number of reductions (num. 
of cols in a + // matrix when reducing over cols) + std::size_t reduction_nelems, // size of each reduction (length of cols, + // i.e. number of rows) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -965,7 +971,7 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -1007,13 +1013,13 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, /* step */ iter_nelems}; - constexpr size_t preferred_reductions_per_wi = 8; - size_t reductions_per_wi = + constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = (reduction_nelems < preferred_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) : preferred_reductions_per_wi; - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); @@ -1044,11 +1050,11 @@ submit_no_atomic_reduction(sycl::queue &exec_q, const argTy *arg, resTy *res, resTy identity_val, - size_t wg, - size_t iter_nelems, - size_t reduction_nelems, - size_t reductions_per_wi, - size_t reduction_groups, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, const InputOutputIterIndexerT &in_out_iter_indexer, const ReductionIndexerT &reduction_indexer, const std::vector &depends) @@ -1100,8 +1106,8 @@ class reduction_over_group_temps_krn; typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, int, @@ -1119,10 +1125,10 @@ class reduction_over_group_temps_empty_krn; template sycl::event reduction_over_group_temps_strided_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) const char *arg_cp, char *res_cp, int iter_nd, @@ -1166,7 +1172,7 @@ sycl::event reduction_over_group_temps_strided_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputOutputIterIndexerT = @@ -1189,11 +1195,11 @@ sycl::event reduction_over_group_temps_strided_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -1212,9 +1218,9 @@ sycl::event reduction_over_group_temps_strided_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -1230,12 +1236,12 @@ sycl::event reduction_over_group_temps_strided_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -1284,7 +1290,7 @@ sycl::event reduction_over_group_temps_strided_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -1293,9 +1299,10 @@ sycl::event reduction_over_group_temps_strided_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1352,8 +1359,8 @@ sycl::event reduction_over_group_temps_strided_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -1388,10 +1395,10 @@ sycl::event reduction_over_group_temps_strided_impl( template sycl::event reduction_axis1_over_group_temps_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. 
of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -1414,7 +1421,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -1441,11 +1448,11 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -1468,9 +1475,9 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -1486,12 +1493,12 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -1533,7 +1540,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -1542,9 +1549,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1593,8 +1601,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -1629,10 +1637,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( template sycl::event reduction_axis0_over_group_temps_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions 
(num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -1655,7 +1663,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -1680,11 +1688,11 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -1708,9 +1716,9 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -1726,12 +1734,12 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -1774,7 +1782,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -1783,9 +1791,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1834,8 +1843,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -1888,7 
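For the *_over_group_temps_* variants, once more than one work-group is needed the partial results are reduced again, pass after pass, until a single pass fits. A host-side sketch of that sizing loop, with no SYCL and purely hypothetical helper/parameter names, is:

    #include <cstddef>

    // Sketch of the multi-pass sizing logic: each pass shrinks the problem by
    // a factor of preferred_reductions_per_wi * wg until one final pass
    // (a single work-group per reduction) remains.
    std::size_t count_reduction_passes(std::size_t reduction_nelems,
                                       std::size_t wg,
                                       std::size_t max_wg,
                                       std::size_t preferred_reductions_per_wi)
    {
        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
            return 1; // single pass writes directly to the result
        }
        // first pass produces one partial result per work-group
        std::size_t remaining =
            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
            (preferred_reductions_per_wi * wg);
        std::size_t passes = 1;
        while (remaining > preferred_reductions_per_wi * max_wg) {
            remaining = (remaining + preferred_reductions_per_wi * wg - 1) /
                        (preferred_reductions_per_wi * wg);
            ++passes;
        }
        return passes + 1; // final pass folds the last partials into the result
    }
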
+1897,7 @@ struct SequentialSearchReduction outT idx_identity_; const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; + std::size_t reduction_max_gid_ = 0; public: SequentialSearchReduction( @@ -1900,7 +1909,7 @@ struct SequentialSearchReduction const outT &idx_identity_val, const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size) + std::size_t reduction_size) : inp_(inp), out_(res), reduction_op_(reduction_op), identity_(identity_val), idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), @@ -1921,7 +1930,7 @@ struct SequentialSearchReduction argT red_val(identity_); outT idx_val(idx_identity_); - for (size_t m = 0; m < reduction_max_gid_; ++m) { + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; @@ -2014,9 +2023,9 @@ struct SearchReduction const outT idx_identity_; const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: SearchReduction(const argT *data, @@ -2029,9 +2038,9 @@ struct SearchReduction const outT &idx_identity_val, const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), vals_(vals), inds_(inds), out_(res), reduction_op_(reduction_op), identity_(identity_val), idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), @@ -2044,12 +2053,14 @@ struct SearchReduction void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -2062,10 +2073,10 @@ struct SearchReduction argT local_red_val(identity_); outT local_idx(idx_identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; if (arg_reduce_gid < reduction_max_gid_) { auto inp_reduction_offset = @@ -2170,9 +2181,9 @@ struct CustomSearchReduction const InputOutputIterIndexerT inp_out_iter_indexer_; const InputRedIndexerT inp_reduced_dims_indexer_; SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t 
iter_gws_ = 1; - size_t reductions_per_wi = 16; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; public: CustomSearchReduction(const argT *data, @@ -2186,9 +2197,9 @@ struct CustomSearchReduction const InputOutputIterIndexerT &arg_res_iter_indexer, const InputRedIndexerT &arg_reduced_dims_indexer, SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) : inp_(data), vals_(vals), inds_(inds), out_(res), reduction_op_(reduction_op), identity_(identity_val), idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), @@ -2201,12 +2212,14 @@ struct CustomSearchReduction void operator()(sycl::nd_item<1> it) const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg @@ -2219,10 +2232,10 @@ struct CustomSearchReduction argT local_red_val(identity_); outT local_idx(idx_identity_); - size_t arg_reduce_gid0 = + std::size_t arg_reduce_gid0 = reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; if (arg_reduce_gid < reduction_max_gid_) { auto inp_reduction_offset = @@ -2384,8 +2397,8 @@ struct CustomSearchReduction typedef sycl::event (*search_strided_impl_fn_ptr)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, int, @@ -2452,11 +2465,11 @@ submit_search_reduction(sycl::queue &exec_q, resTy *res, argTy identity_val, resTy idx_identity_val, - size_t wg, - size_t iter_nelems, - size_t reduction_nelems, - size_t reductions_per_wi, - size_t reduction_groups, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, const InputOutputIterIndexerT &in_out_iter_indexer, const ReductionIndexerT &reduction_indexer, const std::vector &depends) @@ -2508,10 +2521,10 @@ template sycl::event search_over_group_temps_strided_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
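The SearchReduction / CustomSearchReduction kernels decode the flat group id into an iteration index and a reduction-batch index, and each work-item then strides over its batch with step wg. A host-side emulation of that indexing with small made-up sizes (plain loops, no SYCL) is:

    #include <cstddef>
    #include <iostream>

    // Host-side emulation of the index decomposition used in the search
    // reduction kernels: group -> (iter_gid, reduction_batch_id), then each
    // work-item visits arg_reduce_gid0 + m * wg for m < reductions_per_wi.
    int main()
    {
        const std::size_t wg = 4, iter_gws = 2, reductions_per_wi = 3;
        const std::size_t reduction_nelems = 20;
        const std::size_t n_reduction_groups =
            (reduction_nelems + reductions_per_wi * wg - 1) /
            (reductions_per_wi * wg); // 2 groups per iteration here
        const std::size_t n_groups = iter_gws * n_reduction_groups;

        for (std::size_t group = 0; group < n_groups; ++group) {
            const std::size_t iter_gid = group % iter_gws;
            const std::size_t reduction_batch_id = group / iter_gws;
            for (std::size_t lid = 0; lid < wg; ++lid) {
                const std::size_t arg_reduce_gid0 =
                    lid + reduction_batch_id * wg * reductions_per_wi;
                for (std::size_t m = 0; m < reductions_per_wi; ++m) {
                    const std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
                    if (arg_reduce_gid < reduction_nelems) {
                        std::cout << iter_gid << ": " << arg_reduce_gid << '\n';
                    }
                }
            }
        }
        return 0;
    }
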
number of columns) const char *arg_cp, char *res_cp, int iter_nd, @@ -2555,7 +2568,7 @@ sycl::event search_over_group_temps_strided_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputOutputIterIndexerT = @@ -2585,11 +2598,11 @@ sycl::event search_over_group_temps_strided_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 4; + constexpr std::size_t preferred_reductions_per_wi = 4; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -2608,9 +2621,9 @@ sycl::event search_over_group_temps_strided_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -2628,12 +2641,12 @@ sycl::event search_over_group_temps_strided_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -2696,7 +2709,7 @@ sycl::event search_over_group_temps_strided_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -2709,9 +2722,10 @@ sycl::event search_over_group_temps_strided_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -2766,8 +2780,8 @@ sycl::event search_over_group_temps_strided_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -2805,8 +2819,8 @@ sycl::event search_over_group_temps_strided_impl( typedef sycl::event (*search_contig_impl_fn_ptr)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, ssize_t, @@ -2820,10 +2834,10 @@ template sycl::event search_axis1_over_group_temps_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. 
of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -2847,7 +2861,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -2881,11 +2895,11 @@ sycl::event search_axis1_over_group_temps_contig_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -2907,9 +2921,9 @@ sycl::event search_axis1_over_group_temps_contig_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -2927,12 +2941,12 @@ sycl::event search_axis1_over_group_temps_contig_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -2989,7 +3003,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -3002,9 +3016,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -3056,8 +3071,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / @@ -3099,10 +3114,10 @@ 
template sycl::event search_axis0_over_group_temps_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -3126,7 +3141,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3163,11 +3178,11 @@ sycl::event search_axis0_over_group_temps_contig_impl( return comp_ev; } - constexpr size_t preferred_reductions_per_wi = 8; + constexpr std::size_t preferred_reductions_per_wi = 8; // prevents running out of resources on CPU - size_t max_wg = reduction_detail::get_work_group_size(d); + std::size_t max_wg = reduction_detail::get_work_group_size(d); - size_t reductions_per_wi(preferred_reductions_per_wi); + std::size_t reductions_per_wi(preferred_reductions_per_wi); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { // Perform reduction using one 1 work-group per iteration, // can output directly to res @@ -3190,9 +3205,9 @@ sycl::event search_axis0_over_group_temps_contig_impl( wg = max_wg; } reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + std::max(1, (reduction_nelems + wg - 1) / wg); - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); assert(reduction_groups == 1); @@ -3210,12 +3225,12 @@ sycl::event search_axis0_over_group_temps_contig_impl( } else { // more than one work-groups is needed, requires a temporary - size_t reduction_groups = + std::size_t reduction_groups = (reduction_nelems + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); - size_t second_iter_reduction_groups_ = + std::size_t second_iter_reduction_groups_ = (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); @@ -3273,7 +3288,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( depends); } - size_t remaining_reduction_nelems = reduction_groups; + std::size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; @@ -3286,9 +3301,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { - size_t reduction_groups_ = (remaining_reduction_nelems + - preferred_reductions_per_wi * wg - 1) / - (preferred_reductions_per_wi * wg); + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -3340,8 +3356,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( constexpr ReductionIndexerT reduction_indexer{}; wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems 
+ wg - 1) / wg); reduction_groups = (remaining_reduction_nelems + reductions_per_wi * wg - 1) / diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index bfc019f55b..5550d58446 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -65,7 +66,7 @@ class RepeatSequenceFunctor T *dst = nullptr; const repT *reps = nullptr; const repT *cumsum = nullptr; - size_t src_axis_nelems = 1; + std::size_t src_axis_nelems = 1; const OrthogIndexer orthog_strider; const SrcAxisIndexer src_axis_strider; const DstAxisIndexer dst_axis_strider; @@ -76,7 +77,7 @@ class RepeatSequenceFunctor T *dst_, const repT *reps_, const repT *cumsum_, - size_t src_axis_nelems_, + std::size_t src_axis_nelems_, const OrthogIndexer &orthog_strider_, const SrcAxisIndexer &src_axis_strider_, const DstAxisIndexer &dst_axis_strider_, @@ -90,7 +91,7 @@ class RepeatSequenceFunctor void operator()(sycl::id<1> idx) const { - size_t id = idx[0]; + std::size_t id = idx[0]; auto i_orthog = id / src_axis_nelems; auto i_along = id - (i_orthog * src_axis_nelems); @@ -109,8 +110,8 @@ class RepeatSequenceFunctor typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, const char *, @@ -130,8 +131,8 @@ typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( template sycl::event repeat_by_sequence_impl(sycl::queue &q, - size_t orthog_nelems, - size_t src_axis_nelems, + std::size_t orthog_nelems, + std::size_t src_axis_nelems, const char *src_cp, char *dst_cp, const char *reps_cp, @@ -169,7 +170,7 @@ repeat_by_sequence_impl(sycl::queue &q, const Strided1DIndexer reps_indexer{/* size */ reps_shape, /* step */ reps_stride}; - const size_t gws = orthog_nelems * src_axis_nelems; + const std::size_t gws = orthog_nelems * src_axis_nelems; cgh.parallel_for struct RepeatSequenceFactory typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, char *, const char *, @@ -211,7 +212,7 @@ typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( template sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, - size_t src_nelems, + std::size_t src_nelems, const char *src_cp, char *dst_cp, const char *reps_cp, @@ -242,7 +243,7 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, const Strided1DIndexer reps_indexer{/* size */ reps_shape, /* step */ reps_stride}; - const size_t gws = src_nelems; + const std::size_t gws = src_nelems; cgh.parallel_for idx) const { - size_t id = idx[0]; + std::size_t id = idx[0]; auto i_orthog = id / dst_axis_nelems; auto i_along = id - (i_orthog * dst_axis_nelems); @@ -319,8 +320,8 @@ class RepeatScalarFunctor typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, const ssize_t, @@ -336,8 +337,8 @@ typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( template sycl::event repeat_by_scalar_impl(sycl::queue &q, - size_t orthog_nelems, - size_t dst_axis_nelems, + std::size_t orthog_nelems, + std::size_t dst_axis_nelems, const char *src_cp, char *dst_cp, const ssize_t reps, @@ -366,7 +367,7 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, /* step */ dst_axis_stride}; - const size_t gws = orthog_nelems * dst_axis_nelems; + const std::size_t gws = 
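The repeat kernels flatten the orthogonal and repeated axes into one global id and split it back with a division. A tiny sketch of that decomposition, with assumed shape values:

    #include <cstddef>
    #include <iostream>

    // Sketch of the flat-id decomposition used by RepeatSequenceFunctor and
    // RepeatScalarFunctor: id = i_orthog * axis_nelems + i_along.
    int main()
    {
        const std::size_t orthog_nelems = 3, axis_nelems = 5; // assumed shape
        const std::size_t gws = orthog_nelems * axis_nelems;
        for (std::size_t id = 0; id < gws; ++id) {
            const std::size_t i_orthog = id / axis_nelems;
            const std::size_t i_along = id - (i_orthog * axis_nelems);
            std::cout << id << " -> (" << i_orthog << ", " << i_along << ")\n";
        }
        return 0;
    }
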
orthog_nelems * dst_axis_nelems; cgh.parallel_for>( @@ -391,7 +392,7 @@ template struct RepeatScalarFactory typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, char *, const ssize_t, @@ -403,7 +404,7 @@ typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( template sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, - size_t dst_nelems, + std::size_t dst_nelems, const char *src_cp, char *dst_cp, const ssize_t reps, @@ -426,7 +427,7 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, const Strided1DIndexer dst_indexer{/* size */ dst_shape, /* step */ dst_stride}; - const size_t gws = dst_nelems; + const std::size_t gws = dst_nelems; cgh.parallel_for>( diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index dbf40c10fe..5b677bbed1 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -25,6 +25,7 @@ #pragma once #include +#include #include #include #include @@ -213,13 +214,13 @@ namespace { template void insertion_sort_impl(Iter first, - const size_t begin, - const size_t end, + const std::size_t begin, + const std::size_t end, Compare comp) { - for (size_t i = begin + 1; i < end; ++i) { + for (std::size_t i = begin + 1; i < end; ++i) { const auto val_i = first[i]; - size_t j = i - 1; + std::size_t j = i - 1; while ((j + 1 > begin) && (comp(val_i, first[j]))) { first[j + 1] = first[j]; --j; @@ -232,14 +233,14 @@ void insertion_sort_impl(Iter first, template void bubble_sort_impl(Iter first, - const size_t begin, - const size_t end, + const std::size_t begin, + const std::size_t end, Compare comp) { if (begin < end) { - for (size_t i = begin; i < end; ++i) { + for (std::size_t i = begin; i < end; ++i) { // Handle intermediate items - for (size_t idx = i + 1; idx < end; ++idx) { + for (std::size_t idx = i + 1; idx < end; ++idx) { if (comp(first[idx], first[i])) { std::swap(first[i], first[idx]); } @@ -250,8 +251,8 @@ void bubble_sort_impl(Iter first, template void leaf_sort_impl(Iter first, - const size_t begin, - const size_t end, + const std::size_t begin, + const std::size_t end, Compare comp) { return insertion_sort_impl( @@ -343,12 +344,12 @@ class sort_base_step_contig_krn; template sycl::event sort_base_step_contig_impl(sycl::queue &q, - const size_t iter_nelems, - const size_t sort_nelems, + const std::size_t iter_nelems, + const std::size_t sort_nelems, const InpAcc input, OutAcc output, const Comp &comp, - const size_t conseq_nelems_sorted, + const std::size_t conseq_nelems_sorted, const std::vector &depends = {}) { @@ -356,8 +357,8 @@ sort_base_step_contig_impl(sycl::queue &q, using outT = typename GetValueType::value_type; using KernelName = sort_base_step_contig_krn; - const size_t n_segments = - quotient_ceil(sort_nelems, conseq_nelems_sorted); + const std::size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); sycl::event base_sort = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -368,17 +369,17 @@ sort_base_step_contig_impl(sycl::queue &q, auto output_acc = GetWriteDiscardAccess{}(output, cgh); cgh.parallel_for(gRange, [=](sycl::id<1> id) { - const size_t iter_id = id[0] / n_segments; - const size_t segment_id = id[0] - iter_id * n_segments; + const std::size_t iter_id = id[0] / n_segments; + const std::size_t segment_id = id[0] - iter_id * n_segments; - const size_t iter_offset = iter_id * sort_nelems; - const 
size_t beg_id = + const std::size_t iter_offset = iter_id * sort_nelems; + const std::size_t beg_id = iter_offset + segment_id * conseq_nelems_sorted; - const size_t end_id = + const std::size_t end_id = iter_offset + - std::min((segment_id + 1) * conseq_nelems_sorted, - sort_nelems); - for (size_t i = beg_id; i < end_id; ++i) { + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (std::size_t i = beg_id; i < end_id; ++i) { output_acc[i] = input_acc[i]; } @@ -395,12 +396,12 @@ class sort_over_work_group_contig_krn; template sycl::event sort_over_work_group_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + std::size_t iter_nelems, + std::size_t sort_nelems, const InpAcc input, OutAcc output, const Comp &comp, - size_t &nelems_wg_sorts, + std::size_t &nelems_wg_sorts, const std::vector &depends = {}) { using inpT = typename GetValueType::value_type; @@ -430,7 +431,7 @@ sort_over_work_group_contig_impl(sycl::queue &q, constexpr std::uint32_t sub_groups_per_work_group = 4; const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; - const size_t lws = sub_groups_per_work_group * max_sg_size; + const std::size_t lws = sub_groups_per_work_group * max_sg_size; nelems_wg_sorts = elems_per_wi * lws; @@ -445,8 +446,8 @@ sort_over_work_group_contig_impl(sycl::queue &q, // This assumption permits doing away with using a loop assert(nelems_wg_sorts % lws == 0); - const size_t n_segments = - quotient_ceil(sort_nelems, nelems_wg_sorts); + const std::size_t n_segments = + quotient_ceil(sort_nelems, nelems_wg_sorts); sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -466,18 +467,19 @@ sort_over_work_group_contig_impl(sycl::queue &q, sycl::nd_range<1> ndRange(global_range, local_range); cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { - const size_t group_id = it.get_group_linear_id(); - const size_t iter_id = group_id / n_segments; - const size_t segment_id = group_id - iter_id * n_segments; - const size_t lid = it.get_local_linear_id(); + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); - const size_t segment_start_idx = segment_id * nelems_wg_sorts; - const size_t segment_end_idx = std::min( + const std::size_t segment_start_idx = segment_id * nelems_wg_sorts; + const std::size_t segment_end_idx = std::min( segment_start_idx + nelems_wg_sorts, sort_nelems); - const size_t wg_chunk_size = segment_end_idx - segment_start_idx; + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; // load input into SLM - for (size_t array_id = segment_start_idx + lid; + for (std::size_t array_id = segment_start_idx + lid; array_id < segment_end_idx; array_id += lws) { T v = (array_id < sort_nelems) @@ -487,10 +489,11 @@ sort_over_work_group_contig_impl(sycl::queue &q, } sycl::group_barrier(it.get_group()); - const size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + const std::size_t chunk = + quotient_ceil(nelems_wg_sorts, lws); - const size_t chunk_start_idx = lid * chunk; - const size_t chunk_end_idx = + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = sycl::min(chunk_start_idx + chunk, wg_chunk_size); leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); @@ -498,22 +501,24 @@ sort_over_work_group_contig_impl(sycl::queue &q, sycl::group_barrier(it.get_group()); bool data_in_temp 
= false; - size_t n_chunks_merged = 1; + std::size_t n_chunks_merged = 1; // merge chunk while n_chunks_merged * chunk < wg_chunk_size - const size_t max_chunks_merged = 1 + ((wg_chunk_size - 1) / chunk); + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); for (; n_chunks_merged < max_chunks_merged; data_in_temp = !data_in_temp, n_chunks_merged *= 2) { - const size_t nelems_sorted_so_far = n_chunks_merged * chunk; - const size_t q = (lid / n_chunks_merged); - const size_t start_1 = + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); - const size_t end_1 = + const std::size_t end_1 = sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size); - const size_t end_2 = + const std::size_t end_2 = sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); - const size_t offset = chunk * (lid - q * n_chunks_merged); + const std::size_t offset = chunk * (lid - q * n_chunks_merged); if (data_in_temp) { merge_impl(offset, scratch_space, work_space, start_1, @@ -527,7 +532,7 @@ sort_over_work_group_contig_impl(sycl::queue &q, } const auto &out_src = (data_in_temp) ? scratch_space : work_space; - for (size_t array_id = segment_start_idx + lid; + for (std::size_t array_id = segment_start_idx + lid; array_id < segment_end_idx; array_id += lws) { if (array_id < sort_nelems) { @@ -567,11 +572,11 @@ template class merge_adjacent_blocks_from_temp_krn; template sycl::event merge_sorted_block_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + std::size_t iter_nelems, + std::size_t sort_nelems, Acc output, const Comp comp, - size_t sorted_block_size, + std::size_t sorted_block_size, const std::vector &depends = {}) { @@ -581,9 +586,9 @@ merge_sorted_block_contig_impl(sycl::queue &q, // experimentally determined value // size of segments worked upon by each work-item during merging const sycl::device &dev = q.get_device(); - const size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; - const size_t chunk_size = + const std::size_t chunk_size = (sorted_block_size < segment_size) ? sorted_block_size : segment_size; assert(sorted_block_size % chunk_size == 0); @@ -597,7 +602,7 @@ merge_sorted_block_contig_impl(sycl::queue &q, bool used_depends = false; sycl::event dep_ev; - size_t chunks_merged = sorted_block_size / chunk_size; + std::size_t chunks_merged = sorted_block_size / chunk_size; assert(!(chunks_merged & (chunks_merged - 1))); @@ -617,8 +622,8 @@ merge_sorted_block_contig_impl(sycl::queue &q, used_depends = true; } - const size_t n_chunks = - quotient_ceil(sort_nelems, chunk_size); + const std::size_t n_chunks = + quotient_ceil(sort_nelems, chunk_size); if (needs_copy) { sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, @@ -707,10 +712,10 @@ merge_sorted_block_contig_impl(sycl::queue &q, template > sycl::event stable_sort_axis1_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a - // matrix when sorting over rows) - size_t sort_nelems, // size of each array to sort (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. 
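The work-group sorter above leaf-sorts fixed-size chunks and then merges adjacent sorted runs whose length doubles each step. A sequential sketch of that same schedule, where std::sort and std::inplace_merge stand in for the kernel's leaf_sort_impl and merge_impl:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sequential sketch of the chunked sort-then-merge structure: sort
    // fixed-size chunks, then repeatedly merge adjacent sorted runs of
    // doubling length until the whole array is one sorted run.
    void chunked_merge_sort(std::vector<int> &data, std::size_t chunk)
    {
        const std::size_t n = data.size();
        for (std::size_t beg = 0; beg < n; beg += chunk) {
            std::sort(data.begin() + beg,
                      data.begin() + std::min(beg + chunk, n));
        }
        for (std::size_t run = chunk; run < n; run *= 2) {
            for (std::size_t beg = 0; beg < n; beg += 2 * run) {
                const std::size_t mid = std::min(beg + run, n);
                const std::size_t end = std::min(beg + 2 * run, n);
                std::inplace_merge(data.begin() + beg, data.begin() + mid,
                                   data.begin() + end);
            }
        }
    }
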
number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -728,7 +733,7 @@ sycl::event stable_sort_axis1_contig_impl( // constant chosen experimentally to ensure monotonicity of // sorting performance, as measured on GPU Max, and Iris Xe - constexpr size_t sequential_sorting_threshold = 16; + constexpr std::size_t sequential_sorting_threshold = 16; if (sort_nelems < sequential_sorting_threshold) { // equal work-item sorts entire row @@ -741,7 +746,7 @@ sycl::event stable_sort_axis1_contig_impl( return sequential_sorting_ev; } else { - size_t sorted_block_size{}; + std::size_t sorted_block_size{}; // Sort segments of the array sycl::event base_sort_ev = @@ -788,10 +793,10 @@ template > sycl::event stable_argsort_axis1_contig_impl( sycl::queue &exec_q, - size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a - // matrix when sorting over rows) - size_t sort_nelems, // size of each array to sort (length of rows, i.e. - // number of columns) + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. number of columns) const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -807,10 +812,10 @@ sycl::event stable_argsort_axis1_contig_impl( const IndexComp index_comp{arg_tp, ValueComp{}}; - static constexpr size_t determine_automatically = 0; - size_t sorted_block_size = determine_automatically; + static constexpr std::size_t determine_automatically = 0; + std::size_t sorted_block_size = determine_automatically; - const size_t total_nelems = iter_nelems * sort_nelems; + const std::size_t total_nelems = iter_nelems * sort_nelems; using dpctl::tensor::kernels::sort_utils_detail::iota_impl; diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 15f22b334e..2e87ce39d1 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -28,6 +28,7 @@ #pragma once #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index 494d5d4f10..bee9cea592 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -26,6 +26,7 @@ #pragma once #include +#include #include #include #include @@ -54,7 +55,7 @@ struct SearchSortedFunctor const argTy *hay_tp; const argTy *needles_tp; indTy *positions_tp; - const size_t hay_nelems; + const std::size_t hay_nelems; const HayIndexerT hay_indexer; const NeedlesIndexerT needles_indexer; const PositionsIndexerT positions_indexer; @@ -63,7 +64,7 @@ struct SearchSortedFunctor SearchSortedFunctor(const argTy *hay_, const argTy *needles_, indTy *positions_, - const size_t hay_nelems_, + const std::size_t hay_nelems_, const HayIndexerT &hay_indexer_, const NeedlesIndexerT &needles_indexer_, const PositionsIndexerT &positions_indexer_) @@ -78,13 +79,13 @@ struct SearchSortedFunctor { const Compare comp{}; - const size_t i = id[0]; + const std::size_t i = id[0]; const argTy needle_v = needles_tp[needles_indexer(i)]; // position of the needle_v in the hay array indTy pos{}; - constexpr size_t zero(0); + constexpr std::size_t zero(0); if constexpr (left_side) { // search in hay in left-closed interval, give `pos` such 
that // hay[pos - 1] < needle_v <= hay[pos] @@ -110,8 +111,8 @@ struct SearchSortedFunctor typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)( sycl::queue &, - const size_t, - const size_t, + const std::size_t, + const std::size_t, const char *, const ssize_t, const char *, @@ -125,8 +126,8 @@ class searchsorted_contig_impl_krn; template sycl::event searchsorted_contig_impl(sycl::queue &exec_q, - const size_t hay_nelems, - const size_t needles_nelems, + const std::size_t hay_nelems, + const std::size_t needles_nelems, const char *hay_cp, const ssize_t hay_offset, const char *needles_cp, @@ -170,8 +171,8 @@ sycl::event searchsorted_contig_impl(sycl::queue &exec_q, typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)( sycl::queue &, - const size_t, - const size_t, + const std::size_t, + const std::size_t, const char *, const ssize_t, const ssize_t, @@ -189,8 +190,8 @@ class searchsorted_strided_impl_krn; template sycl::event searchsorted_strided_impl( sycl::queue &exec_q, - const size_t hay_nelems, - const size_t needles_nelems, + const std::size_t hay_nelems, + const std::size_t needles_nelems, const char *hay_cp, const ssize_t hay_offset, // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp index c9868093c5..333ec491ab 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp @@ -24,6 +24,7 @@ #pragma once +#include #include #include @@ -35,8 +36,8 @@ namespace kernels { typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, char *, ssize_t, diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index dbf3fdfedf..1b58edd984 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -67,14 +68,14 @@ template (cond_p[offset]); dst_p[offset] = check ? x1_p[offset] : x2_p[offset]; @@ -110,7 +111,7 @@ class WhereContigFunctor auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; - const size_t base = + const std::size_t base = nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); @@ -119,7 +120,7 @@ class WhereContigFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - const size_t idx = base + it * sgSize; + const std::size_t idx = base + it * sgSize; auto x1_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&x1_p[idx]); @@ -147,8 +148,8 @@ class WhereContigFunctor } } else { - const size_t lane_id = sg.get_local_id()[0]; - for (size_t k = base + lane_id; k < nelems; k += sgSize) { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { dst_p[k] = cond_p[k] ? 
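SearchSortedFunctor returns, for each needle, its insertion position in the sorted hay: with left_side the position satisfies hay[pos - 1] < needle <= hay[pos] (the std::lower_bound convention), otherwise hay[pos - 1] <= needle < hay[pos] (std::upper_bound). A host-side analogue with assumed data:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Host-side analogue of the searchsorted positions: the left-side variant
    // matches std::lower_bound, the right-side variant matches std::upper_bound.
    int main()
    {
        const std::vector<int> hay{1, 2, 2, 4, 7}; // assumed sorted input
        const std::vector<int> needles{0, 2, 5, 9};
        for (const int v : needles) {
            const std::size_t left =
                std::lower_bound(hay.begin(), hay.end(), v) - hay.begin();
            const std::size_t right =
                std::upper_bound(hay.begin(), hay.end(), v) - hay.begin();
            std::cout << v << ": left=" << left << " right=" << right << '\n';
        }
        return 0;
    }
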
x1_p[k] : x2_p[k]; } } @@ -158,7 +159,7 @@ class WhereContigFunctor typedef sycl::event (*where_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, const char *, const char *, @@ -167,7 +168,7 @@ typedef sycl::event (*where_contig_impl_fn_ptr_t)( template sycl::event where_contig_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, const char *cond_cp, const char *x1_cp, const char *x2_cp, @@ -182,10 +183,10 @@ sycl::event where_contig_impl(sycl::queue &q, sycl::event where_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - size_t lws = 64; + std::size_t lws = 64; constexpr std::uint8_t vec_sz = 4u; constexpr std::uint8_t n_vecs = 2u; - const size_t n_groups = + const std::size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); @@ -245,7 +246,7 @@ class WhereStridedFunctor void operator()(sycl::id<1> id) const { - size_t gid = id[0]; + std::size_t gid = id[0]; auto offsets = indexer(static_cast(gid)); using dpctl::tensor::type_utils::convert_impl; @@ -260,7 +261,7 @@ class WhereStridedFunctor typedef sycl::event (*where_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const char *, const char *, @@ -275,7 +276,7 @@ typedef sycl::event (*where_strided_impl_fn_ptr_t)( template sycl::event where_strided_impl(sycl::queue &q, - size_t nelems, + std::size_t nelems, int nd, const char *cond_cp, const char *x1_cp, diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp index c890cc1262..c115fdeeef 100644 --- a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp @@ -25,6 +25,7 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -119,7 +120,7 @@ template struct ClipIndex else { constexpr IndT lb(0); const IndT ub = static_cast(max_item - 1); - projected = static_cast(sycl::clamp(ind, lb, ub)); + projected = static_cast(sycl::clamp(ind, lb, ub)); } } else { diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index de8058aac3..c113f30c51 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -27,6 +27,7 @@ #pragma once #include +#include #include #include #include @@ -83,7 +84,7 @@ std::vector concat(std::vector lhs, Vs &&...vs) } // namespace detail template -std::tuple +std::tuple device_allocate_and_pack(sycl::queue &q, std::vector &host_task_events, Vs &&...vs) @@ -128,7 +129,7 @@ device_allocate_and_pack(sycl::queue &q, struct NoOpIndexer { constexpr NoOpIndexer() {} - constexpr size_t operator()(size_t gid) const { return gid; } + constexpr std::size_t operator()(std::size_t gid) const { return gid; } }; using dpctl::tensor::ssize_t; @@ -146,7 +147,7 @@ struct StridedIndexer ssize_t operator()(ssize_t gid) const { return compute_offset(gid); } - ssize_t operator()(size_t gid) const + ssize_t operator()(std::size_t gid) const { return compute_offset(static_cast(gid)); } @@ -184,7 +185,7 @@ struct UnpackedStridedIndexer ssize_t operator()(ssize_t gid) const { return compute_offset(gid); } - ssize_t operator()(size_t gid) const + ssize_t operator()(std::size_t gid) const { return compute_offset(static_cast(gid)); } @@ -212,63 +213,63 @@ struct 
UnpackedStridedIndexer struct Strided1DIndexer { - Strided1DIndexer(size_t _size) : offset{}, size(_size), step(1) {} + Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {} Strided1DIndexer(ssize_t _size) - : offset{}, size(static_cast(_size)), step(1) + : offset{}, size(static_cast(_size)), step(1) { } - Strided1DIndexer(size_t _size, ssize_t _step) + Strided1DIndexer(std::size_t _size, ssize_t _step) : offset{}, size(_size), step(_step) { } - Strided1DIndexer(size_t _size, size_t _step) + Strided1DIndexer(std::size_t _size, std::size_t _step) : offset{}, size(_size), step(static_cast(_step)) { } Strided1DIndexer(ssize_t _size, ssize_t _step) - : offset{}, size(static_cast(_size)), step(_step) + : offset{}, size(static_cast(_size)), step(_step) { } - Strided1DIndexer(ssize_t _offset, size_t _size, ssize_t _step) + Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step) : offset(_offset), size(_size), step(_step) { } - Strided1DIndexer(ssize_t _offset, size_t _size, size_t _step) + Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step) : offset(_offset), size(_size), step(static_cast(_step)) { } Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step) - : offset(_offset), size(static_cast(_size)), step(_step) + : offset(_offset), size(static_cast(_size)), step(_step) { } - ssize_t operator()(size_t gid) const + ssize_t operator()(std::size_t gid) const { // ensure 0 <= gid < size - return offset + std::min(gid, size - 1) * step; + return offset + std::min(gid, size - 1) * step; } private: ssize_t offset = 0; - size_t size = 1; + std::size_t size = 1; ssize_t step = 1; }; struct Strided1DCyclicIndexer { Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step) - : offset(_offset), size(static_cast(_size)), step(_step) + : offset(_offset), size(static_cast(_size)), step(_step) { } - ssize_t operator()(size_t gid) const + ssize_t operator()(std::size_t gid) const { return offset + (gid % size) * step; } private: ssize_t offset = 0; - size_t size = 1; + std::size_t size = 1; ssize_t step = 1; }; @@ -306,7 +307,7 @@ struct TwoOffsets_StridedIndexer return compute_offsets(gid); } - TwoOffsets operator()(size_t gid) const + TwoOffsets operator()(std::size_t gid) const { return compute_offsets(static_cast(gid)); } @@ -409,7 +410,7 @@ struct ThreeOffsets_StridedIndexer return compute_offsets(gid); } - ThreeOffsets operator()(size_t gid) const + ThreeOffsets operator()(std::size_t gid) const { return compute_offsets(static_cast(gid)); } @@ -533,7 +534,7 @@ struct FourOffsets_StridedIndexer return compute_offsets(gid); } - constexpr FourOffsets operator()(size_t gid) const + constexpr FourOffsets operator()(std::size_t gid) const { return compute_offsets(static_cast(gid)); } @@ -592,7 +593,7 @@ struct NthStrideOffset { } - size_t operator()(ssize_t gid, int n) const + std::size_t operator()(ssize_t gid, int n) const { ssize_t relative_offset(0); _ind.get_displacement( @@ -618,7 +619,7 @@ template struct FixedDimStridedIndexer : _ind(_shape), strides(_strides), starting_offset(_offset) { } - size_t operator()(size_t gid) const + std::size_t operator()(std::size_t gid) const { dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); @@ -653,7 +654,7 @@ template struct TwoOffsets_FixedDimStridedIndexer { } - TwoOffsets operator()(size_t gid) const + TwoOffsets operator()(std::size_t gid) const { dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); @@ -700,7 +701,7 @@ template struct 
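Strided1DIndexer maps a flat work-item id to an element offset as offset + min(gid, size - 1) * step, the clamp guarding against an out-of-range gid. A tiny standalone analogue with hypothetical values (std::ptrdiff_t stands in for the library's ssize_t):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Minimal analogue of Strided1DIndexer::operator(): the clamp keeps an
    // out-of-range gid from indexing past the last element.
    int main()
    {
        using ssize_t_ = std::ptrdiff_t;
        const ssize_t_ offset = 2;
        const std::size_t size = 5;
        const ssize_t_ step = 3;
        for (std::size_t gid = 0; gid < 7; ++gid) {
            const ssize_t_ out =
                offset + std::min<std::size_t>(gid, size - 1) * step;
            std::cout << gid << " -> " << out << '\n'; // 2, 5, 8, 11, 14, 14, 14
        }
        return 0;
    }
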
ThreeOffsets_FixedDimStridedIndexer { } - ThreeOffsets operator()(size_t gid) const + ThreeOffsets operator()(std::size_t gid) const { dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index 880278dd48..4c500e6dda 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -28,6 +28,7 @@ #include // sort #include +#include #include // std::iota #include #include @@ -536,7 +537,7 @@ int simplify_iteration_two_strides(const int nd, template > std::tuple contract_iter(const vecT &shape, const vecT &strides) { - const size_t dim = shape.size(); + const std::size_t dim = shape.size(); if (dim != strides.size()) { throw Error("Shape and strides must be of equal size."); } @@ -555,7 +556,7 @@ template > std::tuple contract_iter2(const vecT &shape, const vecT &strides1, const vecT &strides2) { - const size_t dim = shape.size(); + const std::size_t dim = shape.size(); if (dim != strides1.size() || dim != strides2.size()) { throw Error("Shape and strides must be of equal size."); } @@ -712,7 +713,7 @@ std::tuple contract_iter3(const vecT &shape, const vecT &strides2, const vecT &strides3) { - const size_t dim = shape.size(); + const std::size_t dim = shape.size(); if (dim != strides1.size() || dim != strides2.size() || dim != strides3.size()) { @@ -902,7 +903,7 @@ contract_iter4(const vecT &shape, const vecT &strides3, const vecT &strides4) { - const size_t dim = shape.size(); + const std::size_t dim = shape.size(); if (dim != strides1.size() || dim != strides2.size() || dim != strides3.size() || dim != strides4.size()) { diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index f67e1bba1f..654aef5b01 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -26,6 +26,7 @@ #pragma once +#include #include #include #include @@ -54,7 +55,7 @@ class usm_host_allocator : public sycl::usm_allocator typedef usm_host_allocator other; }; - void deallocate(T *ptr, size_t n) + void deallocate(T *ptr, std::size_t n) { try { baseT::deallocate(ptr, n); diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index a4ace720ce..9226b991c0 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -103,28 +103,28 @@ template struct IsSyclOp /*! 
@brief Find the smallest multiple of supported sub-group size larger than * nelems */ -template -size_t choose_workgroup_size(const size_t nelems, - const std::vector &sg_sizes) +template +std::size_t choose_workgroup_size(const std::size_t nelems, + const std::vector &sg_sizes) { - std::vector wg_choices; + std::vector wg_choices; wg_choices.reserve(f * sg_sizes.size()); for (const auto &sg_size : sg_sizes) { #pragma unroll - for (size_t i = 1; i <= f; ++i) { + for (std::size_t i = 1; i <= f; ++i) { wg_choices.push_back(sg_size * i); } } std::sort(std::begin(wg_choices), std::end(wg_choices)); - size_t wg = 1; - for (size_t i = 0; i < wg_choices.size(); ++i) { + std::size_t wg = 1; + for (std::size_t i = 0; i < wg_choices.size(); ++i) { if (wg_choices[i] == wg) { continue; } wg = wg_choices[i]; - size_t n_groups = ((nelems + wg - 1) / wg); + std::size_t n_groups = ((nelems + wg - 1) / wg); if (n_groups == 1) break; } diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp index 2ab5392ebd..405d8a6739 100644 --- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 76c746ff35..eadf574bbb 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// +#include #include #include #include @@ -101,10 +102,10 @@ void populate_mask_positions_dispatch_vectors(void) return; } -size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, - const dpctl::tensor::usm_ndarray &cumsum, - sycl::queue &exec_q, - const std::vector &depends) +std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) { dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); @@ -163,7 +164,7 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, ? 
mask_positions_contig_i32_dispatch_vector[mask_typeid] : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - size_t total_set; + std::size_t total_set; { py::gil_scoped_release release; @@ -204,7 +205,7 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, } sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { { py::gil_scoped_release release; @@ -223,7 +224,7 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set; + std::size_t total_set; { py::gil_scoped_release release; @@ -263,10 +264,10 @@ void populate_cumsum_1d_dispatch_vectors(void) return; } -size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &cumsum, - sycl::queue &exec_q, - std::vector const &depends) +std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends) { // cumsum is 1D if (cumsum.get_ndim() != 1) { @@ -324,8 +325,8 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - size_t total = fn(exec_q, src_size, src_data, cumsum_data, - host_task_events, depends); + std::size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); { py::gil_scoped_release release; sycl::event::wait(host_task_events); @@ -364,7 +365,7 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, } sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { { py::gil_scoped_release release; @@ -382,8 +383,9 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, - cumsum_data, host_task_events, dependent_events); + std::size_t total = + strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); { py::gil_scoped_release release; diff --git a/dpctl/tensor/libtensor/source/accumulators.hpp b/dpctl/tensor/libtensor/source/accumulators.hpp index 6b5834d835..d4816e0d3e 100644 --- a/dpctl/tensor/libtensor/source/accumulators.hpp +++ b/dpctl/tensor/libtensor/source/accumulators.hpp @@ -23,6 +23,7 @@ //===--------------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -39,17 +40,18 @@ namespace py_internal extern void populate_mask_positions_dispatch_vectors(void); -extern size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, - const dpctl::tensor::usm_ndarray &cumsum, - sycl::queue &exec_q, - const std::vector &depends = {}); +extern std::size_t +py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends = {}); extern void populate_cumsum_1d_dispatch_vectors(void); -extern size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &cumsum, - sycl::queue &exec_q, - std::vector const &depends = {}); +extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const 
dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends = {}); } // namespace py_internal } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp index 7b24c37da4..573ff72f81 100644 --- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -24,6 +24,7 @@ #pragma once +#include #include #include #include @@ -81,18 +82,18 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); bool same_shapes = true; - size_t iter_nelems(1); + std::size_t iter_nelems(1); for (int i = 0; same_shapes && (i < iter_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); + iter_nelems *= static_cast(src_shape_i); } - size_t acc_nelems(1); + std::size_t acc_nelems(1); for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { auto dst_shape_i = dst_shape_ptr[i]; same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i); - acc_nelems *= static_cast(dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); } if (!same_shapes) { @@ -267,18 +268,18 @@ std::pair py_accumulate_final_axis_include_initial( const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); bool same_shapes = true; - size_t iter_nelems(1); + std::size_t iter_nelems(1); for (int i = 0; same_shapes && (i < iter_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); + iter_nelems *= static_cast(src_shape_i); } - size_t acc_nelems(1); + std::size_t acc_nelems(1); for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { auto dst_shape_i = dst_shape_ptr[i]; same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i); - acc_nelems *= static_cast(dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); } if (!same_shapes) { diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index e6d91082d3..48e385b238 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -23,6 +23,7 @@ /// dpctl.tensor.extract, dpctl.tensor.nonzero //===----------------------------------------------------------------------===// +#include #include #include #include @@ -176,7 +177,7 @@ py_extract(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_ortho_dims(true); - size_t ortho_nelems(1); // number of orthogonal iterations + std::size_t ortho_nelems(1); // number of orthogonal iterations for (auto i = 0; i < axis_start; ++i) { auto src_sh_i = src_shape[i]; @@ -190,8 +191,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); } - size_t masked_src_nelems(1); - size_t masked_dst_nelems(dst_shape[axis_start]); + std::size_t masked_src_nelems(1); + std::size_t masked_dst_nelems(dst_shape[axis_start]); for (auto i = axis_start; i < axis_end; ++i) { masked_src_nelems *= src_shape[i]; } @@ -199,7 +200,7 @@ py_extract(const dpctl::tensor::usm_ndarray &src, // masked_dst_nelems is number of set elements in the mask, or last element // in cumsum if 
(!same_ortho_dims || - (masked_src_nelems != static_cast(cumsum_sz))) + (masked_src_nelems != static_cast(cumsum_sz))) { throw py::value_error("Inconsistent array dimensions"); } @@ -345,8 +346,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, masked_dst_shape, // 4 vectors modified ortho_dst_strides, masked_dst_strides); - assert(ortho_src_shape.size() == static_cast(ortho_nd)); - assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), ortho_dst_shape.begin())); @@ -519,7 +520,7 @@ py_place(const dpctl::tensor::usm_ndarray &dst, const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *rhs_shape = rhs.get_shape_raw(); bool same_ortho_dims(true); - size_t ortho_nelems(1); // number of orthogonal iterations + std::size_t ortho_nelems(1); // number of orthogonal iterations for (auto i = 0; i < axis_start; ++i) { auto dst_sh_i = dst_shape[i]; @@ -533,13 +534,13 @@ py_place(const dpctl::tensor::usm_ndarray &dst, same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); } - size_t masked_dst_nelems(1); + std::size_t masked_dst_nelems(1); for (auto i = axis_start; i < axis_end; ++i) { masked_dst_nelems *= dst_shape[i]; } if (!same_ortho_dims || - (masked_dst_nelems != static_cast(cumsum_sz))) + (masked_dst_nelems != static_cast(cumsum_sz))) { throw py::value_error("Inconsistent array dimensions"); } @@ -667,8 +668,8 @@ py_place(const dpctl::tensor::usm_ndarray &dst, masked_rhs_shape, // 4 vectors modified ortho_rhs_strides, masked_rhs_strides); - assert(ortho_dst_shape.size() == static_cast(ortho_nd)); - assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), ortho_rhs_shape.begin())); @@ -769,7 +770,7 @@ py_nonzero(const dpctl::tensor::usm_ndarray throw py::value_error("Index array must be a C-contiguous matrix"); } - size_t _ndim = mask_shape.size(); + std::size_t _ndim = mask_shape.size(); if (_ndim > std::numeric_limits::max()) { throw py::value_error("Shape is too large"); } diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp index 7688f5b61b..68e1276c5f 100644 --- a/dpctl/tensor/libtensor/source/clip.cpp +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -114,10 +115,10 @@ py_clip(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t nelems(1); + std::size_t nelems(1); for (int i = 0; i < nd; ++i) { const auto &sh_i = dst_shape[i]; - nelems *= static_cast(sh_i); + nelems *= static_cast(sh_i); shapes_equal = shapes_equal && (min_shape[i] == sh_i) && (max_shape[i] == sh_i) && (src_shape[i] == sh_i); } diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 42ac8f3cdb..3658d92a5c 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -91,10 +92,10 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray 
&src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; shapes_equal && (i < src_nd); ++i) { - src_nelems *= static_cast(src_shape[i]); + src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } if (!shapes_equal) { diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp index 1a186d6ce7..5d2b9291d2 100644 --- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp @@ -23,6 +23,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp index b624d02882..c1235b87e7 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// +#include #include #include #include @@ -149,7 +150,7 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, const bool both_f_contig = is_src_f_contig && is_dst_f_contig; // normalize shift parameter to be 0 <= offset < src_nelems - size_t offset = + std::size_t offset = (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); const char *src_data = src.get_data(); @@ -266,7 +267,7 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, "have the same number of dimensions."); } - if (static_cast(src_nd) != shifts.size()) { + if (static_cast(src_nd) != shifts.size()) { throw py::value_error( "copy_usm_ndarray_for_roll_nd requires shifts to " "contain an integral shift for each array dimension."); @@ -325,7 +326,7 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, for (int i = 0; i < src_nd; ++i) { // normalize shift parameter to be 0 <= offset < dim py::ssize_t dim = src_shape_ptr[i]; - size_t offset = + std::size_t offset = (shifts[i] >= 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); normalized_shifts.push_back(offset); diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index b3d04a252a..bd224e83af 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -23,6 +23,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -83,10 +84,10 @@ void copy_numpy_ndarray_into_usm_ndarray( const py::ssize_t *src_shape = npy_src.shape(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; shapes_equal && (i < src_ndim); ++i) { shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - src_nelems *= static_cast(src_shape[i]); + src_nelems *= static_cast(src_shape[i]); } if (!shapes_equal) { @@ -225,9 +226,9 @@ void copy_numpy_ndarray_into_usm_ndarray( simplified_shape, simplified_src_strides, simplified_dst_strides, src_offset, dst_offset); - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); // handle nd == 0 if (nd == 0) { diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 56d3c38004..e71e4156bf 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -107,10 +108,10 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); + src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } if (!shapes_equal) { @@ -355,10 +356,10 @@ std::pair py_binary_ufunc( const py::ssize_t *src2_shape = src2.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; i < dst_nd; ++i) { - src_nelems *= static_cast(src1_shape[i]); + src_nelems *= static_cast(src1_shape[i]); shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && src2_shape[i] == dst_shape[i]); } @@ -485,8 +486,8 @@ std::pair py_binary_ufunc( is_aligned( dst_data + dst_offset * dst_itemsize)) { - size_t n0 = simplified_shape[0]; - size_t n1 = simplified_shape[1]; + std::size_t n0 = simplified_shape[0]; + std::size_t n1 = simplified_shape[1]; sycl::event comp_ev = matrix_row_broadcast_fn( exec_q, host_tasks, n0, n1, src1_data, src1_offset, src2_data, src2_offset, dst_data, dst_offset, @@ -519,8 +520,8 @@ std::pair py_binary_ufunc( is_aligned( dst_data + dst_offset * dst_itemsize)) { - size_t n0 = simplified_shape[1]; - size_t 
n1 = simplified_shape[0]; + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; sycl::event comp_ev = row_matrix_broadcast_fn( exec_q, host_tasks, n0, n1, src1_data, src1_offset, src2_data, src2_offset, dst_data, dst_offset, @@ -672,10 +673,10 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, const py::ssize_t *rhs_shape = rhs.get_shape_raw(); const py::ssize_t *lhs_shape = lhs.get_shape_raw(); bool shapes_equal(true); - size_t rhs_nelems(1); + std::size_t rhs_nelems(1); for (int i = 0; i < lhs_nd; ++i) { - rhs_nelems *= static_cast(rhs_shape[i]); + rhs_nelems *= static_cast(rhs_shape[i]); shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]); } if (!shapes_equal) { @@ -776,8 +777,8 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, contig_row_matrix_broadcast_dispatch_table[rhs_typeid] [lhs_typeid]; if (row_matrix_broadcast_fn != nullptr) { - size_t n0 = simplified_shape[1]; - size_t n1 = simplified_shape[0]; + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; sycl::event comp_ev = row_matrix_broadcast_fn( exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, lhs_data, lhs_offset, depends); diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp index 8231995868..d82de33de0 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -178,7 +179,7 @@ template class divide_by_scalar_krn; typedef sycl::event (*divide_by_scalar_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, int, const ssize_t *, const char *, @@ -190,7 +191,7 @@ typedef sycl::event (*divide_by_scalar_fn_ptr_t)( template sycl::event divide_by_scalar(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, @@ -270,10 +271,10 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; i < dst_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); + src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } if (!shapes_equal) { diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp index c907531be6..609a76d517 100644 --- a/dpctl/tensor/libtensor/source/eye_ctor.cpp +++ b/dpctl/tensor/libtensor/source/eye_ctor.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -112,7 +113,7 @@ usm_ndarray_eye(py::ssize_t k, auto fn = eye_dispatch_vector[dst_typeid]; - eye_event = fn(exec_q, static_cast(nelem), start, end, step, + eye_event = fn(exec_q, static_cast(nelem), start, end, step, dst_data, depends); return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index 4542598391..4557a2b32b 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ 
-23,6 +23,7 @@ //===--------------------------------------------------------------------===// #include +#include #include #include #include @@ -52,7 +53,7 @@ namespace py_internal using dpctl::utils::keep_args_alive; typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, - size_t, + std::size_t, const py::object &, char *, const std::vector &); @@ -75,7 +76,7 @@ typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, */ template sycl::event full_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const py::object &py_value, char *dst_p, const std::vector &depends) @@ -156,7 +157,7 @@ template struct FullContigFactory typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, int, - size_t, + std::size_t, py::ssize_t *, const py::object &, char *, @@ -184,7 +185,7 @@ typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, template sycl::event full_strided_impl(sycl::queue &exec_q, int nd, - size_t nelems, + std::size_t nelems, py::ssize_t *shape_strides, const py::object &py_value, char *dst_p, @@ -243,7 +244,7 @@ usm_ndarray_full(const py::object &py_value, auto fn = full_contig_dispatch_vector[dst_typeid]; sycl::event full_contig_event = - fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, depends); return std::make_pair( diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index f38fab8a2d..6745948c5e 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -209,13 +210,13 @@ _populate_kernel_params(sycl::queue &exec_q, std::vector parse_py_ind(const sycl::queue &q, const py::object &py_ind) { - size_t ind_count = py::len(py_ind); + std::size_t ind_count = py::len(py_ind); std::vector res; res.reserve(ind_count); bool nd_is_known = false; int nd = -1; - for (size_t i = 0; i < ind_count; ++i) { + for (std::size_t i = 0; i < ind_count; ++i) { py::object el_i = py_ind[py::cast(i)]; dpctl::tensor::usm_ndarray arr_i = py::cast(el_i); @@ -295,12 +296,12 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool orthog_shapes_equal(true); - size_t orthog_nelems(1); + std::size_t orthog_nelems(1); for (int i = 0; i < (src_nd - k); ++i) { auto idx1 = (i < axis_start) ? i : i + k; auto idx2 = (i < axis_start) ? 
i : i + ind_nd; - orthog_nelems *= static_cast(src_shape[idx1]); + orthog_nelems *= static_cast(src_shape[idx1]); orthog_shapes_equal = orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); } @@ -346,9 +347,9 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, int ind_typenum = ind_rep.get_typenum(); int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); - size_t ind_nelems(1); + std::size_t ind_nelems(1); for (int i = 0; i < ind_nd; ++i) { - ind_nelems *= static_cast(ind_shape[i]); + ind_nelems *= static_cast(ind_shape[i]); if (!(ind_shape[i] == dst_shape[axis_start + i])) { throw py::value_error( @@ -606,18 +607,18 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, } } - size_t dst_nelems = dst.get_size(); + std::size_t dst_nelems = dst.get_size(); const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *val_shape = val.get_shape_raw(); bool orthog_shapes_equal(true); - size_t orthog_nelems(1); + std::size_t orthog_nelems(1); for (int i = 0; i < (dst_nd - k); ++i) { auto idx1 = (i < axis_start) ? i : i + k; auto idx2 = (i < axis_start) ? i : i + ind_nd; - orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_nelems *= static_cast(dst_shape[idx1]); orthog_shapes_equal = orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); } @@ -665,9 +666,9 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, int ind_typenum = ind_rep.get_typenum(); int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); - size_t ind_nelems(1); + std::size_t ind_nelems(1); for (int i = 0; i < ind_nd; ++i) { - ind_nelems *= static_cast(ind_shape[i]); + ind_nelems *= static_cast(ind_shape[i]); if (!(ind_shape[i] == val_shape[axis_start + i])) { throw py::value_error( diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp index 0ab760e764..6856a60ef8 100644 --- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp +++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -235,27 +236,27 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); bool same_shapes = true; - size_t batches(1); + std::size_t batches(1); for (int i = 0; same_shapes && (i < batch_dims); ++i) { same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]) && (x2_shape_ptr[i] == dst_shape_ptr[i]); batches *= x1_shape_ptr[i]; } - size_t x1_outer_nelems(1); + std::size_t x1_outer_nelems(1); for (int i = batch_dims; same_shapes && (i < (batch_dims + x1_outer_dims)); ++i) { same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]); x1_outer_nelems *= x1_shape_ptr[i]; } - size_t inner_nelems(1); + std::size_t inner_nelems(1); for (int i = batch_dims; i < (batch_dims + inner_dims); ++i) { auto x1_shape_idx = x1_outer_dims + i; same_shapes = same_shapes && (x1_shape_ptr[x1_shape_idx] == x2_shape_ptr[i]); inner_nelems *= x1_shape_ptr[x1_shape_idx]; } - size_t x2_outer_nelems(1); + std::size_t x2_outer_nelems(1); for (int i = 0; same_shapes && (i < x2_outer_dims); ++i) { auto x2_shape_idx = batch_dims + inner_dims + i; same_shapes = @@ -268,12 +269,12 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, "appropriate shapes"); } - size_t dst_nelems = batches * x1_outer_nelems * x2_outer_nelems; + std::size_t dst_nelems = batches * x1_outer_nelems * 
x2_outer_nelems; if (dst_nelems == 0) { return std::make_pair(sycl::event(), sycl::event()); } - if (static_cast(dst.get_size()) != dst_nelems) { + if (static_cast(dst.get_size()) != dst_nelems) { throw py::value_error("dst shape and size mismatch"); } @@ -429,9 +430,9 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, reduce_all_elems = (simplified_batch_shape[0] == 1); dot_product_c_contig = (simplified_batch_dst_strides[0] == 1) && - (static_cast(simplified_batch_x1_strides[0]) == + (static_cast(simplified_batch_x1_strides[0]) == inner_nelems) && - (static_cast(simplified_batch_x2_strides[0]) == + (static_cast(simplified_batch_x2_strides[0]) == inner_nelems); } @@ -689,22 +690,25 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, { bool gemm_batch_c_contig = false; - if ((static_cast(outer_inner_x1_strides[0]) == + if ((static_cast(outer_inner_x1_strides[0]) == inner_nelems && outer_inner_x1_strides[1] == 1) && - (static_cast(outer_inner_x2_strides[0]) == + (static_cast(outer_inner_x2_strides[0]) == inner_nelems && outer_inner_x2_strides[1] == 1) && - (static_cast(outer_inner_dst_strides[0]) == + (static_cast(outer_inner_dst_strides[0]) == x2_outer_nelems && outer_inner_dst_strides[1] == 1)) { gemm_batch_c_contig = - (static_cast(simplified_batch_x1_strides[0]) == + (static_cast( + simplified_batch_x1_strides[0]) == x1_outer_nelems * inner_nelems) && - (static_cast(simplified_batch_x2_strides[0]) == + (static_cast( + simplified_batch_x2_strides[0]) == x2_outer_nelems * inner_nelems) && - (static_cast(simplified_batch_dst_strides[0]) == + (static_cast( + simplified_batch_dst_strides[0]) == x1_outer_nelems * x2_outer_nelems); } diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp index 1a6b9811fe..a8c583d32a 100644 --- a/dpctl/tensor/libtensor/source/linear_sequences.cpp +++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp @@ -24,6 +24,7 @@ #include "dpctl4pybind11.hpp" #include +#include #include #include #include @@ -52,7 +53,7 @@ namespace py_internal typedef sycl::event (*lin_space_step_fn_ptr_t)( sycl::queue &, - size_t, // num_elements + std::size_t, // num_elements const py::object &start, const py::object &step, char *, // dst_data_ptr @@ -79,7 +80,7 @@ typedef sycl::event (*lin_space_step_fn_ptr_t)( */ template sycl::event lin_space_step_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const py::object &start, const py::object &step, char *array_data, @@ -98,7 +99,7 @@ sycl::event lin_space_step_impl(sycl::queue &exec_q, typedef sycl::event (*lin_space_affine_fn_ptr_t)( sycl::queue &, - size_t, // num_elements + std::size_t, // num_elements const py::object &start, const py::object &end, bool include_endpoint, @@ -127,7 +128,7 @@ typedef sycl::event (*lin_space_affine_fn_ptr_t)( */ template sycl::event lin_space_affine_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const py::object &start, const py::object &end, bool include_endpoint, @@ -195,7 +196,7 @@ usm_ndarray_linear_sequence_step(const py::object &start, auto fn = lin_space_step_dispatch_vector[dst_typeid]; linspace_step_event = - fn(exec_q, static_cast(len), start, dt, dst_data, depends); + fn(exec_q, static_cast(len), start, dt, dst_data, depends); return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), linspace_step_event); @@ -244,8 +245,8 @@ usm_ndarray_linear_sequence_affine(const py::object &start, auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - linspace_affine_event = fn(exec_q, 
static_cast(len), start, end, - include_endpoint, dst_data, depends); + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); return std::make_pair( keep_args_alive(exec_q, {dst}, {linspace_affine_event}), diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 55988c249b..eef69f1b3f 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -26,6 +26,7 @@ #pragma once #include +#include #include #include #include @@ -211,15 +212,15 @@ std::pair py_reduction_over_axis( dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t dst_nelems = dst.get_size(); + std::size_t dst_nelems = dst.get_size(); if (dst_nelems == 0) { return std::make_pair(sycl::event(), sycl::event()); } - size_t reduction_nelems(1); + std::size_t reduction_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); + reduction_nelems *= static_cast(src_shape_ptr[i]); } // check that dst and src do not overlap @@ -263,7 +264,7 @@ std::pair py_reduction_over_axis( fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; } if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; @@ -296,7 +297,7 @@ std::pair py_reduction_over_axis( fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; } if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; @@ -374,17 +375,17 @@ std::pair py_reduction_over_axis( bool mat_reduce_over_axis1 = false; bool mat_reduce_over_axis0 = false; bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; if (simplified_reduction_src_strides[0] == 1) { array_reduce_all_elems = (simplified_iteration_shape[0] == 1); mat_reduce_over_axis1 = (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) { mat_reduce_over_axis0 = (simplified_iteration_dst_strides[0] == 1) && @@ -554,15 +555,15 @@ std::pair py_tree_reduction_over_axis( dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t dst_nelems = dst.get_size(); + std::size_t dst_nelems = dst.get_size(); if (dst_nelems == 0) { return std::make_pair(sycl::event(), sycl::event()); } - size_t reduction_nelems(1); + std::size_t reduction_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); + reduction_nelems *= static_cast(src_shape_ptr[i]); } // check that dst and src do not overlap @@ -591,7 +592,7 @@ std::pair py_tree_reduction_over_axis( { auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; @@ -615,7 +616,7 @@ std::pair py_tree_reduction_over_axis( { auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 
0; @@ -693,17 +694,17 @@ std::pair py_tree_reduction_over_axis( bool mat_reduce_over_axis1 = false; bool mat_reduce_over_axis0 = false; bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; if (simplified_reduction_src_strides[0] == 1) { array_reduce_all_elems = (simplified_iteration_shape[0] == 1); mat_reduce_over_axis1 = (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) { mat_reduce_over_axis0 = (simplified_iteration_dst_strides[0] == 1) && @@ -842,15 +843,15 @@ std::pair py_search_over_axis( dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t dst_nelems = dst.get_size(); + std::size_t dst_nelems = dst.get_size(); if (dst_nelems == 0) { return std::make_pair(sycl::event(), sycl::event()); } - size_t reduction_nelems(1); + std::size_t reduction_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); + reduction_nelems *= static_cast(src_shape_ptr[i]); } // check that dst and src do not overlap @@ -877,7 +878,7 @@ std::pair py_search_over_axis( if (is_src_c_contig && is_dst_c_contig) { auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; @@ -899,7 +900,7 @@ std::pair py_search_over_axis( else if (is_src_f_contig && dst_nd == 1) { auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; @@ -974,15 +975,15 @@ std::pair py_search_over_axis( if ((reduction_nd == 1) && (iteration_nd == 1)) { bool mat_reduce_over_axis1 = false; bool mat_reduce_over_axis0 = false; - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; if (compact_reduction_src_strides[0] == 1) { mat_reduce_over_axis1 = (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); } - else if (static_cast(compact_reduction_src_strides[0]) == + else if (static_cast(compact_reduction_src_strides[0]) == iter_nelems) { mat_reduce_over_axis0 = @@ -1129,11 +1130,11 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t dst_nelems = dst.get_size(); + std::size_t dst_nelems = dst.get_size(); - size_t red_nelems(1); + std::size_t red_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { - red_nelems *= static_cast(src_shape_ptr[i]); + red_nelems *= static_cast(src_shape_ptr[i]); } auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); @@ -1256,16 +1257,16 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, bool mat_reduce_over_axis1 = false; bool mat_reduce_over_axis0 = false; bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; + std::size_t iter_nelems = dst_nelems; if (simplified_red_src_strides[0] == 1) { array_reduce_all_elems = (simplified_iter_shape[0] == 1); mat_reduce_over_axis1 = (simplified_iter_dst_strides[0] == 1) && - 
(static_cast(simplified_iter_src_strides[0]) == + (static_cast(simplified_iter_src_strides[0]) == red_nelems); } - else if (static_cast(simplified_red_src_strides[0]) == + else if (static_cast(simplified_red_src_strides[0]) == iter_nelems) { mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 64a7ca3068..003a4dca6b 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -136,13 +137,13 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t reps_sz = reps.get_size(); - size_t cumsum_sz = cumsum.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); - size_t orthog_nelems(1); // number of orthogonal iterations + std::size_t orthog_nelems(1); // number of orthogonal iterations for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -154,11 +155,11 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); } - size_t src_axis_nelems(1); + std::size_t src_axis_nelems(1); if (src_nd > 0) { src_axis_nelems = src_shape[axis]; } - size_t dst_axis_nelems(dst_shape[axis]); + std::size_t dst_axis_nelems(dst_shape[axis]); // shape at repeated axis must be equal to the sum of reps if (!same_orthog_dims || src_axis_nelems != reps_sz || @@ -296,8 +297,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, axis_dst_shape, orthog_dst_strides, axis_dst_stride); - assert(orthog_src_shape.size() == static_cast(orthog_nd)); - assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); @@ -400,9 +401,9 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t src_sz = src.get_size(); - size_t reps_sz = reps.get_size(); - size_t cumsum_sz = cumsum.get_size(); + std::size_t src_sz = src.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); // shape at repeated axis must be equal to the sum of reps if (src_sz != reps_sz || src_sz != cumsum_sz) { @@ -542,7 +543,7 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); - size_t orthog_nelems(1); // number of orthogonal iterations + std::size_t orthog_nelems(1); // number of orthogonal iterations for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -554,11 +555,11 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); } - size_t src_axis_nelems(1); + std::size_t 
src_axis_nelems(1); if (src_nd > 0) { src_axis_nelems = src_shape[axis]; } - size_t dst_axis_nelems(dst_shape[axis]); + std::size_t dst_axis_nelems(dst_shape[axis]); // shape at repeated axis must be equal to the shape of src at the axis * // reps @@ -672,8 +673,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, axis_dst_shape, orthog_dst_strides, axis_dst_stride); - assert(orthog_src_shape.size() == static_cast(orthog_nd)); - assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); @@ -759,8 +760,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - size_t src_sz = src.get_size(); - size_t dst_sz = dst.get_size(); + std::size_t src_sz = src.get_size(); + std::size_t dst_sz = dst.get_size(); // shape at repeated axis must be equal to the shape of src at the axis * // reps diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 52a086c573..7e54b1ee22 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -24,6 +24,7 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include #include @@ -55,8 +56,8 @@ void simplify_iteration_space_1(int &nd, simplified_strides.insert(std::end(simplified_strides), std::begin(strides), std::end(strides)); - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_strides.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); int contracted_nd = simplify_iteration_stride( nd, simplified_shape.data(), simplified_strides.data(), offset // modified by reference @@ -79,8 +80,8 @@ void simplify_iteration_space_1(int &nd, offset += (shape[0] - 1) * strides[0]; } - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_strides.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); } } @@ -102,19 +103,19 @@ void simplify_iteration_space(int &nd, simplified_shape.reserve(nd); simplified_shape.insert(std::begin(simplified_shape), shape, shape + nd); - assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); simplified_src_strides.reserve(nd); simplified_src_strides.insert(std::end(simplified_src_strides), std::begin(src_strides), std::end(src_strides)); - assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); simplified_dst_strides.reserve(nd); simplified_dst_strides.insert(std::end(simplified_dst_strides), std::begin(dst_strides), std::end(dst_strides)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); int contracted_nd = simplify_iteration_two_strides( nd, simplified_shape.data(), simplified_src_strides.data(), @@ -134,7 +135,7 @@ void simplify_iteration_space(int &nd, // Populate vectors simplified_shape.reserve(nd); simplified_shape.push_back(shape[0]); - assert(simplified_shape.size() == static_cast(nd)); + 
assert(simplified_shape.size() == static_cast(nd)); simplified_src_strides.reserve(nd); simplified_dst_strides.reserve(nd); @@ -152,8 +153,8 @@ void simplify_iteration_space(int &nd, simplified_dst_strides.push_back(dst_strides[0]); } - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); } } @@ -181,25 +182,25 @@ void simplify_iteration_space_3( // and improve access pattern simplified_shape.reserve(nd); simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); - assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); simplified_src1_strides.reserve(nd); simplified_src1_strides.insert(std::end(simplified_src1_strides), std::begin(src1_strides), std::end(src1_strides)); - assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); simplified_src2_strides.reserve(nd); simplified_src2_strides.insert(std::end(simplified_src2_strides), std::begin(src2_strides), std::end(src2_strides)); - assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); simplified_dst_strides.reserve(nd); simplified_dst_strides.insert(std::end(simplified_dst_strides), std::begin(dst_strides), std::end(dst_strides)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); int contracted_nd = simplify_iteration_three_strides( nd, simplified_shape.data(), simplified_src1_strides.data(), @@ -222,7 +223,7 @@ void simplify_iteration_space_3( // Populate vectors simplified_shape.reserve(nd); simplified_shape.push_back(shape[0]); - assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); simplified_src1_strides.reserve(nd); simplified_src2_strides.reserve(nd); @@ -246,9 +247,9 @@ void simplify_iteration_space_3( simplified_dst_strides.push_back(dst_strides[0]); } - assert(simplified_src1_strides.size() == static_cast(nd)); - assert(simplified_src2_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); } } @@ -280,31 +281,31 @@ void simplify_iteration_space_4( // and improve access pattern simplified_shape.reserve(nd); simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); - assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); simplified_src1_strides.reserve(nd); simplified_src1_strides.insert(std::end(simplified_src1_strides), std::begin(src1_strides), std::end(src1_strides)); - assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); simplified_src2_strides.reserve(nd); simplified_src2_strides.insert(std::end(simplified_src2_strides), std::begin(src2_strides), std::end(src2_strides)); - assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); simplified_src3_strides.reserve(nd); simplified_src3_strides.insert(std::end(simplified_src3_strides), std::begin(src3_strides), std::end(src3_strides)); - assert(simplified_src3_strides.size() == 
static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); simplified_dst_strides.reserve(nd); simplified_dst_strides.insert(std::end(simplified_dst_strides), std::begin(dst_strides), std::end(dst_strides)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); int contracted_nd = simplify_iteration_four_strides( nd, simplified_shape.data(), simplified_src1_strides.data(), @@ -331,7 +332,7 @@ void simplify_iteration_space_4( // Populate vectors simplified_shape.reserve(nd); simplified_shape.push_back(shape[0]); - assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_shape.size() == static_cast(nd)); simplified_src1_strides.reserve(nd); simplified_src2_strides.reserve(nd); @@ -359,10 +360,10 @@ void simplify_iteration_space_4( simplified_dst_strides.push_back(dst_strides[0]); } - assert(simplified_src1_strides.size() == static_cast(nd)); - assert(simplified_src2_strides.size() == static_cast(nd)); - assert(simplified_src3_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); } } @@ -379,12 +380,12 @@ void compact_iteration_space(int &nd, // and improve access pattern compact_shape.reserve(nd); compact_shape.insert(std::begin(compact_shape), shape, shape + nd); - assert(compact_shape.size() == static_cast(nd)); + assert(compact_shape.size() == static_cast(nd)); compact_strides.reserve(nd); compact_strides.insert(std::end(compact_strides), std::begin(strides), std::end(strides)); - assert(compact_strides.size() == static_cast(nd)); + assert(compact_strides.size() == static_cast(nd)); int contracted_nd = compact_iteration(nd, compact_shape.data(), compact_strides.data()); @@ -397,11 +398,11 @@ void compact_iteration_space(int &nd, // Populate vectors compact_shape.reserve(nd); compact_shape.push_back(shape[0]); - assert(compact_shape.size() == static_cast(nd)); + assert(compact_shape.size() == static_cast(nd)); compact_strides.reserve(nd); compact_strides.push_back(strides[0]); - assert(compact_strides.size() == static_cast(nd)); + assert(compact_strides.size() == static_cast(nd)); } } @@ -452,7 +453,7 @@ void split_iteration_space(const std::vector &shape_vec, py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { - size_t nd = shape.size(); + std::size_t nd = shape.size(); if (nd != mi.size()) { throw py::value_error( "Multi-index and shape vectors must have the same length."); @@ -460,7 +461,7 @@ py::ssize_t _ravel_multi_index_c(std::vector const &mi, py::ssize_t flat_index = 0; py::ssize_t s = 1; - for (size_t i = 0; i < nd; ++i) { + for (std::size_t i = 0; i < nd; ++i) { flat_index += mi.at(nd - 1 - i) * s; s *= shape.at(nd - 1 - i); } @@ -471,7 +472,7 @@ py::ssize_t _ravel_multi_index_c(std::vector const &mi, py::ssize_t _ravel_multi_index_f(std::vector const &mi, std::vector const &shape) { - size_t nd = shape.size(); + std::size_t nd = shape.size(); if (nd != mi.size()) { throw py::value_error( "Multi-index and shape vectors must have the same length."); @@ -479,7 +480,7 @@ py::ssize_t _ravel_multi_index_f(std::vector const &mi, py::ssize_t flat_index = 0; py::ssize_t s = 1; - for (size_t i = 0; i < nd; ++i) { + for (std::size_t i = 0; i < nd; ++i) { flat_index += mi.at(i) * 
s; s *= shape.at(i); } @@ -490,12 +491,12 @@ py::ssize_t _ravel_multi_index_f(std::vector const &mi, std::vector _unravel_index_c(py::ssize_t flat_index, std::vector const &shape) { - size_t nd = shape.size(); + std::size_t nd = shape.size(); std::vector mi; mi.resize(nd); py::ssize_t i_ = flat_index; - for (size_t dim = 0; dim + 1 < nd; ++dim) { + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { const py::ssize_t si = shape[nd - 1 - dim]; const py::ssize_t q = i_ / si; const py::ssize_t r = (i_ - q * si); @@ -511,12 +512,12 @@ std::vector _unravel_index_c(py::ssize_t flat_index, std::vector _unravel_index_f(py::ssize_t flat_index, std::vector const &shape) { - size_t nd = shape.size(); + std::size_t nd = shape.size(); std::vector mi; mi.resize(nd); py::ssize_t i_ = flat_index; - for (size_t dim = 0; dim + 1 < nd; ++dim) { + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { const py::ssize_t si = shape[dim]; const py::ssize_t q = i_ / si; const py::ssize_t r = (i_ - q * si); diff --git a/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp index cae18aed25..c2a18635a1 100644 --- a/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -23,6 +23,7 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" +#include #include #include #include @@ -66,19 +67,19 @@ py_argsort(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); bool same_shapes = true; - size_t iter_nelems(1); + std::size_t iter_nelems(1); for (int i = 0; same_shapes && (i < iteration_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); + iter_nelems *= static_cast(src_shape_i); } - size_t sort_nelems(1); + std::size_t sort_nelems(1); for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - sort_nelems *= static_cast(src_shape_i); + sort_nelems *= static_cast(src_shape_i); } if (!same_shapes) { diff --git a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp index d261adb352..ab78450bc6 100644 --- a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp @@ -24,6 +24,7 @@ #pragma once +#include #include #include "dpctl4pybind11.hpp" @@ -69,19 +70,19 @@ py_sort(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); bool same_shapes = true; - size_t iter_nelems(1); + std::size_t iter_nelems(1); for (int i = 0; same_shapes && (i < iteration_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); + iter_nelems *= static_cast(src_shape_i); } - size_t sort_nelems(1); + std::size_t sort_nelems(1); for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { auto src_shape_i = src_shape_ptr[i]; same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); - sort_nelems *= static_cast(src_shape_i); + sort_nelems *= static_cast(src_shape_i); } if (!same_shapes) { diff --git a/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp index aca4fe902f..2973ac9911 100644 --- 
a/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp @@ -22,6 +22,7 @@ /// extension. //===--------------------------------------------------------------------===// +#include #include #include #include @@ -70,8 +71,8 @@ namespace template sycl::event argsort_axis1_contig_caller(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + std::size_t iter_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, diff --git a/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp index 09eb75d1f1..62d5b76883 100644 --- a/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp @@ -22,6 +22,7 @@ /// extension. //===--------------------------------------------------------------------===// +#include #include #include #include @@ -67,8 +68,8 @@ namespace template sycl::event sort_axis1_contig_caller(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + std::size_t iter_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp index cf12601b1c..2e9f732dfd 100644 --- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp @@ -22,6 +22,7 @@ /// extension. //===--------------------------------------------------------------------===// +#include #include #include #include @@ -217,10 +218,10 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, } // check that needle and positions have the same shape - size_t needles_nelems(1); + std::size_t needles_nelems(1); bool same_shape(true); - const size_t hay_nelems = static_cast(hay.get_shape(0)); + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); const py::ssize_t *positions_shape_ptr = needles.get_shape_raw(); @@ -230,7 +231,7 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, const auto positions_sh_i = positions_shape_ptr[i]; same_shape = same_shape && (needles_sh_i == positions_sh_i); - needles_nelems *= static_cast(needles_sh_i); + needles_nelems *= static_cast(needles_sh_i); } if (!same_shape) { diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp index e3c4a1420d..1bb9e1799f 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -79,10 +80,10 @@ usm_ndarray_triul(sycl::queue &exec_q, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); - size_t src_nelems(1); + std::size_t src_nelems(1); for (int i = 0; shapes_equal && i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); + src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } if (!shapes_equal) { diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp index 2d1cf040b4..f55f767010 100644 --- a/dpctl/tensor/libtensor/source/where.cpp +++ b/dpctl/tensor/libtensor/source/where.cpp @@ -24,6 +24,7 @@ 
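A minimal standalone sketch, not taken from the patch, of the element-count idiom the surrounding hunks standardize on: shape extents arrive as signed py::ssize_t values and are accumulated into an unsigned std::size_t count through an explicit cast. The helper name product_of_shape and the ssize_vec alias are illustrative assumptions, not dpctl APIs.

#include <cstddef> // std::size_t, std::ptrdiff_t
#include <vector>

// Stand-in for a span of py::ssize_t extents; illustrative only.
using ssize_vec = std::vector<std::ptrdiff_t>;

// Hypothetical helper showing the recurring pattern
//     nelems *= static_cast<std::size_t>(sh_i);
inline std::size_t product_of_shape(const ssize_vec &shape)
{
    std::size_t nelems(1);
    for (const auto &sh_i : shape) {
        nelems *= static_cast<std::size_t>(sh_i);
    }
    return nelems;
}
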
//===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -100,10 +101,10 @@ py_where(const dpctl::tensor::usm_ndarray &condition, const py::ssize_t *cond_shape = condition.get_shape_raw(); bool shapes_equal(true); - size_t nelems(1); + std::size_t nelems(1); for (int i = 0; i < nd; ++i) { const auto &sh_i = dst_shape[i]; - nelems *= static_cast(sh_i); + nelems *= static_cast(sh_i); shapes_equal = shapes_equal && (x1_shape[i] == sh_i) && (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i); } diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp index 000f15d4e0..30ba3625c6 100644 --- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp @@ -23,6 +23,7 @@ //===--------------------------------------------------------------------===// #include +#include #include #include #include @@ -52,7 +53,7 @@ namespace py_internal using dpctl::utils::keep_args_alive; typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, - size_t, + std::size_t, char *, const std::vector &); @@ -72,7 +73,7 @@ typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, */ template sycl::event zeros_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, char *dst_p, const std::vector &depends) { @@ -128,7 +129,7 @@ usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, auto fn = zeros_contig_dispatch_vector[dst_typeid]; sycl::event zeros_contig_event = - fn(exec_q, static_cast(dst_nelems), dst_data, depends); + fn(exec_q, static_cast(dst_nelems), dst_data, depends); return std::make_pair( keep_args_alive(exec_q, {dst}, {zeros_contig_event}), diff --git a/dpctl/utils/src/order_keeper.cpp b/dpctl/utils/src/order_keeper.cpp index 7a1e881001..62fc7e7e65 100644 --- a/dpctl/utils/src/order_keeper.cpp +++ b/dpctl/utils/src/order_keeper.cpp @@ -1,4 +1,5 @@ #include "dpctl4pybind11.hpp" +#include #include #include diff --git a/dpctl/utils/src/sequential_order_keeper.hpp b/dpctl/utils/src/sequential_order_keeper.hpp index db58b99510..b29e27a2b9 100644 --- a/dpctl/utils/src/sequential_order_keeper.hpp +++ b/dpctl/utils/src/sequential_order_keeper.hpp @@ -2,6 +2,7 @@ #include #include +#include #include namespace @@ -37,7 +38,7 @@ class SequentialOrder public: SequentialOrder() : host_task_events{}, submitted_events{} {} - SequentialOrder(size_t n) : host_task_events{}, submitted_events{} + SequentialOrder(std::size_t n) : host_task_events{}, submitted_events{} { host_task_events.reserve(n); submitted_events.reserve(n); @@ -76,7 +77,10 @@ class SequentialOrder return *this; } - size_t get_num_submitted_events() const { return submitted_events.size(); } + std::size_t get_num_submitted_events() const + { + return submitted_events.size(); + } const std::vector &get_host_task_events() { @@ -90,7 +94,10 @@ class SequentialOrder } */ - size_t get_num_host_task_events() const { return host_task_events.size(); } + std::size_t get_num_host_task_events() const + { + return host_task_events.size(); + } const std::vector &get_submitted_events() { @@ -148,7 +155,7 @@ class SequentialOrder void add_list_to_host_task_events(const sycl::event (&ht_events)[num]) { prune_complete(); - for (size_t i = 0; i < num; ++i) { + for (std::size_t i = 0; i < num; ++i) { const auto &e = ht_events[i]; if (!is_event_complete(e)) host_task_events.push_back(e); @@ -159,7 +166,7 @@ class SequentialOrder void add_list_to_submitted_events(const sycl::event (&comp_events)[num]) { 
prune_complete(); - for (size_t i = 0; i < num; ++i) { + for (std::size_t i = 0; i < num; ++i) { const auto &e = comp_events[i]; if (!is_event_complete(e)) submitted_events.push_back(e); diff --git a/libsyclinterface/helper/source/dpctl_error_handlers.cpp b/libsyclinterface/helper/source/dpctl_error_handlers.cpp index 32967b46ec..25aeb7e701 100644 --- a/libsyclinterface/helper/source/dpctl_error_handlers.cpp +++ b/libsyclinterface/helper/source/dpctl_error_handlers.cpp @@ -29,6 +29,7 @@ #include #ifdef _WIN32 #include +#include #endif #ifdef ENABLE_GLOG #include diff --git a/libsyclinterface/source/dpctl_sycl_context_interface.cpp b/libsyclinterface/source/dpctl_sycl_context_interface.cpp index ca971be984..374e3bb401 100644 --- a/libsyclinterface/source/dpctl_sycl_context_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_context_interface.cpp @@ -28,6 +28,7 @@ #include "Config/dpctl_config.h" #include "dpctl_error_handlers.h" #include "dpctl_sycl_type_casters.hpp" +#include #include #include #include diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp index f9e6365c25..441a03f7dd 100644 --- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp @@ -34,6 +34,7 @@ #include "dpctl_utils_helper.h" #include #include +#include #include /* SYCL headers */ #include #include diff --git a/libsyclinterface/source/dpctl_sycl_device_manager.cpp b/libsyclinterface/source/dpctl_sycl_device_manager.cpp index 6b284be56b..cc9cf616b7 100644 --- a/libsyclinterface/source/dpctl_sycl_device_manager.cpp +++ b/libsyclinterface/source/dpctl_sycl_device_manager.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include /* SYCL headers */ #include #include diff --git a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp index 66aa215808..850ea56467 100644 --- a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp @@ -32,6 +32,7 @@ #include "dpctl_sycl_type_casters.hpp" #include /* OpenCL headers */ #include +#include #include #include /* Sycl headers */ #include diff --git a/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp index e27eb192eb..d571a74603 100644 --- a/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp @@ -30,6 +30,7 @@ #include "dpctl_string_utils.hpp" #include "dpctl_sycl_type_casters.hpp" #include +#include #include /* Sycl headers */ using namespace sycl; diff --git a/libsyclinterface/source/dpctl_sycl_platform_interface.cpp b/libsyclinterface/source/dpctl_sycl_platform_interface.cpp index b8adef2b81..6b2a433ea2 100644 --- a/libsyclinterface/source/dpctl_sycl_platform_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_platform_interface.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/libsyclinterface/source/dpctl_sycl_platform_manager.cpp b/libsyclinterface/source/dpctl_sycl_platform_manager.cpp index 130771d07d..b29cbb752d 100644 --- a/libsyclinterface/source/dpctl_sycl_platform_manager.cpp +++ b/libsyclinterface/source/dpctl_sycl_platform_manager.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include using namespace sycl; diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp 
b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index a819b6bd63..3db14451ea 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -33,6 +33,7 @@ #include "dpctl_sycl_type_casters.hpp" #include #include +#include #include #include /* SYCL headers */ #include diff --git a/libsyclinterface/source/dpctl_sycl_usm_interface.cpp b/libsyclinterface/source/dpctl_sycl_usm_interface.cpp index 8cf35d90c1..4aeb348372 100644 --- a/libsyclinterface/source/dpctl_sycl_usm_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_usm_interface.cpp @@ -29,6 +29,7 @@ #include "dpctl_error_handlers.h" #include "dpctl_sycl_device_interface.h" #include "dpctl_sycl_type_casters.hpp" +#include #include /* SYCL headers */ #include diff --git a/libsyclinterface/source/dpctl_utils.cpp b/libsyclinterface/source/dpctl_utils.cpp index 5b6d1e8356..dd76287bdf 100644 --- a/libsyclinterface/source/dpctl_utils.cpp +++ b/libsyclinterface/source/dpctl_utils.cpp @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #include "dpctl_utils.h" +#include void DPCTLCString_Delete(__dpctl_take const char *str) { delete[] str; } diff --git a/libsyclinterface/source/dpctl_vector_templ.cpp b/libsyclinterface/source/dpctl_vector_templ.cpp index 57ecdb2a42..872e5425ea 100644 --- a/libsyclinterface/source/dpctl_vector_templ.cpp +++ b/libsyclinterface/source/dpctl_vector_templ.cpp @@ -27,6 +27,7 @@ #include "dpctl_error_handlers.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_vector_macros.h" +#include #include #include diff --git a/libsyclinterface/tests/test_service.cpp b/libsyclinterface/tests/test_service.cpp index f0b693880f..93fc6b9fe1 100644 --- a/libsyclinterface/tests/test_service.cpp +++ b/libsyclinterface/tests/test_service.cpp @@ -39,7 +39,7 @@ TEST(TestServicesFns, ChkDPCPPVersion) ASSERT_TRUE(ver.length() > 0); std::string ver_from_cmplr(ASSTR(__VERSION__)); - std::size_t found = ver_from_cmplr.find(ver); + auto found = ver_from_cmplr.find(ver); // version returned by DPCTLService_GetDPCPPVersion // should occur as a substring in the version obtained diff --git a/libsyclinterface/tests/test_sycl_context_interface.cpp b/libsyclinterface/tests/test_sycl_context_interface.cpp index e2ca92bc7b..65995178db 100644 --- a/libsyclinterface/tests/test_sycl_context_interface.cpp +++ b/libsyclinterface/tests/test_sycl_context_interface.cpp @@ -30,6 +30,7 @@ #include "dpctl_sycl_device_selector_interface.h" #include "dpctl_sycl_types.h" #include +#include #include #include diff --git a/libsyclinterface/tests/test_sycl_device_aspects.cpp b/libsyclinterface/tests/test_sycl_device_aspects.cpp index c5b8154e3e..a535a66e69 100644 --- a/libsyclinterface/tests/test_sycl_device_aspects.cpp +++ b/libsyclinterface/tests/test_sycl_device_aspects.cpp @@ -31,6 +31,7 @@ #include "dpctl_sycl_type_casters.hpp" #include "dpctl_utils_helper.h" #include +#include #include #include @@ -79,7 +80,7 @@ constexpr auto build_param_pairs(const std::array &arr1, return paramPairs; } -template +template auto build_gtest_values_impl(const PArr &arr, std::index_sequence) { return ::testing::Values(arr[I]...); diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp index 544f1597fe..5698752afa 100644 --- a/libsyclinterface/tests/test_sycl_device_interface.cpp +++ b/libsyclinterface/tests/test_sycl_device_interface.cpp @@ -30,6 +30,7 @@ #include "dpctl_utils.h" 
#include "dpctl_utils_helper.h" #include +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_device_manager.cpp b/libsyclinterface/tests/test_sycl_device_manager.cpp index a45e7c185b..745ba5ed1c 100644 --- a/libsyclinterface/tests/test_sycl_device_manager.cpp +++ b/libsyclinterface/tests/test_sycl_device_manager.cpp @@ -31,6 +31,7 @@ #include "dpctl_utils.h" #include "dpctl_utils_helper.h" #include +#include #include using dpctl::syclinterface::dpctl_default_selector; diff --git a/libsyclinterface/tests/test_sycl_device_subdevices.cpp b/libsyclinterface/tests/test_sycl_device_subdevices.cpp index 8c1d9fad6f..edc832982c 100644 --- a/libsyclinterface/tests/test_sycl_device_subdevices.cpp +++ b/libsyclinterface/tests/test_sycl_device_subdevices.cpp @@ -33,6 +33,7 @@ #include "dpctl_utils.h" #include "dpctl_utils_helper.h" #include +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp index ad6d6289ad..de755a3b23 100644 --- a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp +++ b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_kernel_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_interface.cpp index 0a87790b51..c2c0d8bda0 100644 --- a/libsyclinterface/tests/test_sycl_kernel_interface.cpp +++ b/libsyclinterface/tests/test_sycl_kernel_interface.cpp @@ -34,6 +34,7 @@ #include "dpctl_utils.h" #include #include +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 4656e3793e..1de3b90891 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -34,6 +34,7 @@ #include "dpctl_sycl_type_casters.hpp" #include "dpctl_sycl_usm_interface.h" #include +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_queue_submit.cpp b/libsyclinterface/tests/test_sycl_queue_submit.cpp index d89ec3d3ce..21aa7653e3 100644 --- a/libsyclinterface/tests/test_sycl_queue_submit.cpp +++ b/libsyclinterface/tests/test_sycl_queue_submit.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp index 7f28fc0041..539fdbd4c5 100644 --- a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp +++ b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/libsyclinterface/tests/test_sycl_usm_interface.cpp b/libsyclinterface/tests/test_sycl_usm_interface.cpp index 38616bab00..fc54e3a411 100644 --- a/libsyclinterface/tests/test_sycl_usm_interface.cpp +++ b/libsyclinterface/tests/test_sycl_usm_interface.cpp @@ -33,6 +33,7 @@ #include "dpctl_sycl_usm_interface.h" #include #include +#include #include using namespace sycl; From 4e2b9a4c928303f5f5aae201e81939e2f1035d5d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 25 Dec 2024 21:51:21 -0800 Subject: [PATCH 2/9] Add using dpctl::tensor::ssize_t` everywhere `ssize_t` is used --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 1 + 
.../libtensor/include/kernels/boolean_advanced_indexing.hpp | 1 + dpctl/tensor/libtensor/include/kernels/clip.hpp | 1 + dpctl/tensor/libtensor/include/kernels/constructors.hpp | 2 ++ dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp | 1 + .../tensor/libtensor/include/kernels/copy_as_contiguous.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/abs.hpp | 1 + .../include/kernels/elementwise_functions/acos.hpp | 1 + .../include/kernels/elementwise_functions/acosh.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/add.hpp | 1 + .../include/kernels/elementwise_functions/angle.hpp | 1 + .../include/kernels/elementwise_functions/asin.hpp | 1 + .../include/kernels/elementwise_functions/asinh.hpp | 1 + .../include/kernels/elementwise_functions/atan.hpp | 1 + .../include/kernels/elementwise_functions/atan2.hpp | 1 + .../include/kernels/elementwise_functions/atanh.hpp | 1 + .../include/kernels/elementwise_functions/bitwise_and.hpp | 1 + .../kernels/elementwise_functions/bitwise_invert.hpp | 1 + .../kernels/elementwise_functions/bitwise_left_shift.hpp | 1 + .../include/kernels/elementwise_functions/bitwise_or.hpp | 1 + .../kernels/elementwise_functions/bitwise_right_shift.hpp | 1 + .../include/kernels/elementwise_functions/bitwise_xor.hpp | 1 + .../include/kernels/elementwise_functions/cbrt.hpp | 1 + .../include/kernels/elementwise_functions/ceil.hpp | 1 + .../include/kernels/elementwise_functions/common.hpp | 1 + .../kernels/elementwise_functions/common_inplace.hpp | 1 + .../include/kernels/elementwise_functions/conj.hpp | 1 + .../include/kernels/elementwise_functions/copysign.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/cos.hpp | 1 + .../include/kernels/elementwise_functions/cosh.hpp | 1 + .../include/kernels/elementwise_functions/equal.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/exp.hpp | 1 + .../include/kernels/elementwise_functions/exp2.hpp | 1 + .../include/kernels/elementwise_functions/expm1.hpp | 1 + .../include/kernels/elementwise_functions/floor.hpp | 1 + .../include/kernels/elementwise_functions/floor_divide.hpp | 1 + .../include/kernels/elementwise_functions/greater.hpp | 1 + .../include/kernels/elementwise_functions/greater_equal.hpp | 1 + .../include/kernels/elementwise_functions/hypot.hpp | 1 + .../include/kernels/elementwise_functions/imag.hpp | 1 + .../include/kernels/elementwise_functions/isfinite.hpp | 1 + .../include/kernels/elementwise_functions/isinf.hpp | 1 + .../include/kernels/elementwise_functions/isnan.hpp | 1 + .../include/kernels/elementwise_functions/less.hpp | 1 + .../include/kernels/elementwise_functions/less_equal.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/log.hpp | 1 + .../include/kernels/elementwise_functions/log10.hpp | 1 + .../include/kernels/elementwise_functions/log1p.hpp | 1 + .../include/kernels/elementwise_functions/log2.hpp | 1 + .../include/kernels/elementwise_functions/logaddexp.hpp | 1 + .../include/kernels/elementwise_functions/logical_and.hpp | 1 + .../include/kernels/elementwise_functions/logical_not.hpp | 1 + .../include/kernels/elementwise_functions/logical_or.hpp | 1 + .../include/kernels/elementwise_functions/logical_xor.hpp | 1 + .../include/kernels/elementwise_functions/maximum.hpp | 1 + .../include/kernels/elementwise_functions/minimum.hpp | 1 + .../include/kernels/elementwise_functions/multiply.hpp | 1 + .../include/kernels/elementwise_functions/negative.hpp | 1 + .../include/kernels/elementwise_functions/nextafter.hpp | 1 + 
.../include/kernels/elementwise_functions/not_equal.hpp | 1 + .../include/kernels/elementwise_functions/positive.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/pow.hpp | 1 + .../include/kernels/elementwise_functions/proj.hpp | 1 + .../include/kernels/elementwise_functions/real.hpp | 1 + .../include/kernels/elementwise_functions/reciprocal.hpp | 1 + .../include/kernels/elementwise_functions/remainder.hpp | 1 + .../include/kernels/elementwise_functions/round.hpp | 1 + .../include/kernels/elementwise_functions/rsqrt.hpp | 1 + .../include/kernels/elementwise_functions/sign.hpp | 1 + .../include/kernels/elementwise_functions/signbit.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/sin.hpp | 1 + .../include/kernels/elementwise_functions/sinh.hpp | 1 + .../include/kernels/elementwise_functions/sqrt.hpp | 1 + .../include/kernels/elementwise_functions/square.hpp | 1 + .../include/kernels/elementwise_functions/subtract.hpp | 1 + .../libtensor/include/kernels/elementwise_functions/tan.hpp | 1 + .../include/kernels/elementwise_functions/tanh.hpp | 1 + .../include/kernels/elementwise_functions/true_divide.hpp | 1 + .../include/kernels/elementwise_functions/trunc.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 2 ++ .../include/kernels/linalg_functions/dot_product.hpp | 1 + dpctl/tensor/libtensor/include/kernels/reductions.hpp | 1 + dpctl/tensor/libtensor/include/kernels/repeat.hpp | 1 + .../tensor/libtensor/include/kernels/sorting/merge_sort.hpp | 1 + .../tensor/libtensor/include/kernels/sorting/radix_sort.hpp | 2 ++ .../libtensor/include/kernels/sorting/searchsorted.hpp | 2 ++ .../include/kernels/sorting/sort_impl_fn_ptr_t.hpp | 6 +++++- dpctl/tensor/libtensor/include/kernels/where.hpp | 1 + dpctl/tensor/libtensor/include/utils/indexing_utils.hpp | 2 ++ 89 files changed, 98 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 86fb745eaf..68a0d1b5a1 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -48,6 +48,7 @@ namespace kernels namespace accumulators { +using dpctl::tensor::ssize_t; using namespace dpctl::tensor::offset_utils; template T ceiling_quotient(T n, T m) { return (n + m - 1) / m; } diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 4857142ae4..3aaadef28b 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -43,6 +43,7 @@ namespace kernels namespace indexing { +using dpctl::tensor::ssize_t; using namespace dpctl::tensor::offset_utils; template struct AbsFunctor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index a2ce2c12c6..70551a5a60 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -48,6 +48,7 @@ namespace kernels namespace acos { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index 
2ed61244ff..f5516cc450 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -48,6 +48,7 @@ namespace kernels namespace acosh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 7138a8afcc..f4173169f4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -49,6 +49,7 @@ namespace kernels namespace add { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index dc2f455fde..7029a4171a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -49,6 +49,7 @@ namespace kernels namespace angle { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index bf466f80d4..a3a9daabed 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -48,6 +48,7 @@ namespace kernels namespace asin { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index e1237f0252..908180e1a4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -48,6 +48,7 @@ namespace kernels namespace asinh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 103a22ddee..0d4e53c575 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -49,6 +49,7 @@ namespace kernels namespace atan { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 2160531d67..eb35efd5c3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -47,6 +47,7 @@ namespace kernels namespace atan2 { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 790afc8a00..9aa72448f2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -49,6 +49,7 @@ namespace kernels namespace atanh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index ad1d27a11a..a4c9e93910 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -47,6 +47,7 @@ namespace kernels namespace bitwise_and { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 3954dbaac6..c902fac0fa 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -48,6 +48,7 @@ namespace kernels namespace bitwise_invert { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 23cc878727..80c9296136 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -48,6 +48,7 @@ namespace kernels namespace bitwise_left_shift { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index 3415ea6255..26cebc851d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -47,6 +47,7 @@ namespace kernels namespace bitwise_or { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index e58361eca2..9b86b6a180 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -48,6 +48,7 @@ namespace kernels namespace bitwise_right_shift { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 
2167adf40c..c28305e4d2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -47,6 +47,7 @@ namespace kernels namespace bitwise_xor { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index 085367d136..1239d98992 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -48,6 +48,7 @@ namespace kernels namespace cbrt { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; template struct CbrtFunctor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index 39ed463d24..97c0bebe07 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -47,6 +47,7 @@ namespace kernels namespace ceil { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index f6ac74ce13..77efcfb398 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -44,6 +44,7 @@ namespace kernels namespace elementwise_common { +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::alignment_utils:: disabled_sg_loadstore_wrapper_krn; using dpctl::tensor::kernels::alignment_utils::is_aligned; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index 552b3abd8a..61b539d857 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -44,6 +44,7 @@ namespace kernels namespace elementwise_common { +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::alignment_utils:: disabled_sg_loadstore_wrapper_krn; using dpctl::tensor::kernels::alignment_utils::is_aligned; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index c91a26148a..a57a91fb17 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -50,6 +50,7 @@ namespace kernels namespace conj { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index 5a1d70e704..0300f68e4e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -47,6 +47,7 @@ namespace kernels namespace copysign { 
+using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index a4f70fccc4..b2a01783a1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -48,6 +48,7 @@ namespace kernels namespace cos { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index cbdc1e323c..62c1008075 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -48,6 +48,7 @@ namespace kernels namespace cosh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index 19c983da2b..7f74520f41 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -48,6 +48,7 @@ namespace kernels namespace equal { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index a721700cc5..f2876437ef 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -48,6 +48,7 @@ namespace kernels namespace exp { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index 880d5502be..260c66f3c0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -49,6 +49,7 @@ namespace kernels namespace exp2 { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index d4eeacef0b..210914a81a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -49,6 +49,7 @@ namespace kernels namespace expm1 { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index c8dcf7035a..ca5128e8c5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -47,6 +47,7 @@ namespace kernels namespace floor { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 2fdaad656d..3752ad1627 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -48,6 +48,7 @@ namespace kernels namespace floor_divide { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 84230e50f6..50b0e36f8d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -49,6 +49,7 @@ namespace kernels namespace greater { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index 30dea84f92..32fcab6694 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -49,6 +49,7 @@ namespace kernels namespace greater_equal { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index 24a85dc65a..26c854c7af 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -47,6 +47,7 @@ namespace kernels namespace hypot { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index 67026131ec..abfee0876c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -49,6 +49,7 @@ namespace kernels namespace imag { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index 5fbee2c197..77a369cc54 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -45,6 +45,7 @@ namespace kernels namespace isfinite { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index c3b94862d7..5c39a8b1c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -47,6 +47,7 @@ namespace kernels namespace isinf { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 28b4eaf2e9..6e2e0ab7d2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -46,6 +46,7 @@ namespace kernels namespace isnan { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 61668e54a4..3c0c59f2d5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -48,6 +48,7 @@ namespace kernels namespace less { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index c0dfc9ed40..1070e558d2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -48,6 +48,7 @@ namespace kernels namespace less_equal { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 10369ec769..eb05f90122 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -49,6 +49,7 @@ namespace kernels namespace log { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index a1ac08479c..e3f61994a0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -50,6 +50,7 @@ namespace kernels namespace log10 { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index 9872a3b0be..6902d61448 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -48,6 +48,7 @@ namespace kernels namespace log1p { 
+using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index d592da7038..a208833da5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -50,6 +50,7 @@ namespace kernels namespace log2 { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 9d204aaf8f..d2afa1e233 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -50,6 +50,7 @@ namespace kernels namespace logaddexp { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index c6fd495793..547e4b9a75 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -48,6 +48,7 @@ namespace kernels namespace logical_and { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index 14f85ec0d4..38e777cc27 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -47,6 +47,7 @@ namespace kernels namespace logical_not { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 7ada2e6027..a5a4ba6dbf 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -48,6 +48,7 @@ namespace kernels namespace logical_or { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index 657e86f2a4..918b31d4f1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -48,6 +48,7 @@ namespace kernels namespace logical_xor { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index d22fc98b79..3557657955 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -48,6 +48,7 @@ namespace kernels namespace maximum { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index c8ba2d89f4..2f3969282e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -48,6 +48,7 @@ namespace kernels namespace minimum { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index 8130ebf30a..6273954820 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -50,6 +50,7 @@ namespace kernels namespace multiply { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index 6acb31f581..329f3768d1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -48,6 +48,7 @@ namespace kernels namespace negative { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp index f1d3a38542..3d4c8c798d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -47,6 +47,7 @@ namespace kernels namespace nextafter { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index a6653696aa..bf191c0052 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -47,6 +47,7 @@ namespace kernels namespace not_equal { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index 659c197f15..2bdf679082 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -48,6 +48,7 @@ namespace kernels 
namespace positive { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index a5e7cc9dfe..3df668672c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -50,6 +50,7 @@ namespace kernels namespace pow { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index f4f9fd3fc7..bca2533839 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -50,6 +50,7 @@ namespace kernels namespace proj { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index a3d0e5e8be..5bff2ebd4f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -49,6 +49,7 @@ namespace kernels namespace real { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index 28f9f06a05..f6b1b1857c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -51,6 +51,7 @@ namespace kernels namespace reciprocal { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 15d6bd115d..e1ecf23b6a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -49,6 +49,7 @@ namespace kernels namespace remainder { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 37e52a6e3b..a1890f278c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -47,6 +47,7 @@ namespace kernels namespace round { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index 6b19e9fba4..0a45f52c0a 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -51,6 +51,7 @@ namespace kernels namespace rsqrt { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; template struct RsqrtFunctor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index 615e80efda..98e4b7bdd6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -49,6 +49,7 @@ namespace kernels namespace sign { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 88edeacf44..7ebdcaa1ae 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -47,6 +47,7 @@ namespace kernels namespace signbit { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index d37900777b..9a7f013f85 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -48,6 +48,7 @@ namespace kernels namespace sin { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index c918323e66..7361210384 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -48,6 +48,7 @@ namespace kernels namespace sinh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 88a3db84ac..cd46f97b1b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -51,6 +51,7 @@ namespace kernels namespace sqrt { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index aaabd9761c..1c86c5b501 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -49,6 +49,7 @@ namespace kernels namespace square { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 0ebf102805..62ca7df4a5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -48,6 +48,7 @@ namespace kernels namespace subtract { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 7dbc80a66a..d65e0e3081 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -49,6 +49,7 @@ namespace kernels namespace tan { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index cdb38c00a7..22d920047c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -50,6 +50,7 @@ namespace kernels namespace tanh { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 454745e7d3..0f4dfe4119 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -49,6 +49,7 @@ namespace kernels namespace true_divide { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index cf9d6fa14f..d3406b1c31 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -47,6 +47,7 @@ namespace kernels namespace trunc { +using dpctl::tensor::ssize_t; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1ca651a06f..0006a8b123 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -44,6 +44,8 @@ namespace kernels namespace indexing { +using dpctl::tensor::ssize_t; + template struct IndexedProj } // end of namespace radix_sort_details +using dpctl::tensor::ssize_t; + template sycl::event radix_sort_axis1_contig_impl(sycl::queue &exec_q, diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index bee9cea592..b235b7212a 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -42,6 +42,8 @@ namespace tensor namespace kernels { +using 
dpctl::tensor::ssize_t; + template #include +#include "kernels/dpctl_tensor_types.hpp" + namespace dpctl { namespace tensor @@ -35,6 +37,8 @@ namespace tensor namespace kernels { +using dpctl::tensor::ssize_t; + typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, std::size_t, std::size_t, @@ -46,6 +50,6 @@ typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, ssize_t, const std::vector &); -} +} // namespace kernels } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index 1b58edd984..7c10901ddd 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -45,6 +45,7 @@ namespace kernels namespace search { +using dpctl::tensor::ssize_t; using namespace dpctl::tensor::offset_utils; using dpctl::tensor::kernels::alignment_utils:: diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp index c115fdeeef..59f5c4042b 100644 --- a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp @@ -40,6 +40,8 @@ namespace tensor namespace indexing_utils { +using dpctl::tensor::ssize_t; + /* * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray * uses py::ssize_t for shapes and strides internally and Python uses From 64ec889df256c7284dddf0825b3340c12a62f6fd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 27 Dec 2024 08:59:58 -0800 Subject: [PATCH 3/9] Fixed missed uses of `size_t` --- .../libtensor/include/kernels/elementwise_functions/trunc.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index d3406b1c31..2091fe6f37 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -136,7 +136,7 @@ class trunc_contig_kernel; template sycl::event trunc_contig_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, const char *arg_p, char *res_p, const std::vector &depends = {}) @@ -179,7 +179,7 @@ template class trunc_strided_kernel; template sycl::event trunc_strided_impl(sycl::queue &exec_q, - size_t nelems, + std::size_t nelems, int nd, const ssize_t *shape_and_strides, const char *arg_p, From 693340ba52860cf9e673e0520e8352e2ca6b8096 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Sun, 5 Jan 2025 21:39:38 -0600 Subject: [PATCH 4/9] size_t->std::size_t in for loop, that was added during merge --- dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 654aef5b01..95df05f0b3 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -199,7 +199,7 @@ sycl::event async_smart_free(sycl::queue &exec_q, cgh.depends_on(depends); cgh.host_task([ptrs, dels]() { - for (size_t i = 0; i < ptrs.size(); ++i) { + for (std::size_t i = 0; i < ptrs.size(); ++i) { dels[i](ptrs[i]); } }); From 87e36f83fd50ec3204c93259b0e08bee9accc9aa Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 6 Jan 2025 06:29:02 -0600 Subject: [PATCH 5/9] Replaced int8_t->std::int8_t, same for other integral types Added missing `#include ` --- .../tensor/libtensor/include/kernels/clip.hpp | 2 +- .../kernels/elementwise_functions/conj.hpp | 2 +- .../include/utils/type_dispatch_building.hpp | 55 ++++++++++--------- .../source/integer_advanced_indexing.cpp | 4 +- .../source/integer_advanced_indexing.hpp | 4 +- 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index 0f6b3c502d..d808fc5bcf 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -113,7 +113,7 @@ class ClipContigFunctor const std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; const std::size_t gid = ndit.get_global_linear_id(); - const uint16_t nelems_per_sg = sgSize * nelems_per_wi; + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; const std::size_t start = (gid / sgSize) * (nelems_per_sg - sgSize) + gid; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index a57a91fb17..0f41decb2b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -103,7 +103,7 @@ using ConjStridedFunctor = elementwise_common:: template struct ConjOutputType { using value_type = typename std::disjunction< - td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, td_ns::TypeMapResultEntry, td_ns::TypeMapResultEntry, td_ns::TypeMapResultEntry, diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp index 1cd378f83e..c11674cca7 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -26,6 +26,7 @@ #pragma once #include +#include #include #include @@ -69,14 +70,14 @@ class DispatchTableBuilder { std::vector per_dstTy = { factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), factory{}.get(), factory{}.get(), factory{}.get(), @@ -93,14 +94,14 @@ class DispatchTableBuilder void populate_dispatch_table(funcPtrT table[][_num_types]) const { const auto map_by_dst_type = {row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), row_per_dst_type(), row_per_dst_type(), row_per_dst_type(), @@ -108,9 +109,9 @@ class DispatchTableBuilder row_per_dst_type>()}; assert(map_by_dst_type.size() == _num_types); int dst_id = 0; - for (auto &row : map_by_dst_type) { + for (const auto &row : map_by_dst_type) { int src_id = 0; - for (auto &fn_ptr : row) { + for (const auto &fn_ptr : row) { table[dst_id][src_id] = fn_ptr; ++src_id; } @@ -139,14 +140,14 @@ class 
DispatchVectorBuilder void populate_dispatch_vector(funcPtrT vector[]) const { const auto fn_map_by_type = {func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), func_per_type(), func_per_type(), func_per_type(), @@ -154,7 +155,7 @@ class DispatchVectorBuilder func_per_type>()}; assert(fn_map_by_type.size() == _num_types); int ty_id = 0; - for (auto &fn : fn_map_by_type) { + for (const auto &fn : fn_map_by_type) { vector[ty_id] = fn; ++ty_id; } diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 6745948c5e..aa620690d3 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -245,7 +245,7 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, const py::object &py_ind, const dpctl::tensor::usm_ndarray &dst, int axis_start, - uint8_t mode, + std::uint8_t mode, sycl::queue &exec_q, const std::vector &depends) { @@ -560,7 +560,7 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, const py::object &py_ind, const dpctl::tensor::usm_ndarray &val, int axis_start, - uint8_t mode, + std::uint8_t mode, sycl::queue &exec_q, const std::vector &depends) { diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp index bee4883920..77bde0f49f 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -43,7 +43,7 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &, const py::object &, const dpctl::tensor::usm_ndarray &, int, - uint8_t, + std::uint8_t, sycl::queue &, const std::vector & = {}); @@ -52,7 +52,7 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &, const py::object &, const dpctl::tensor::usm_ndarray &, int, - uint8_t, + std::uint8_t, sycl::queue &, const std::vector & = {}); From affe2db10ea22ac71f605b4481cf01250a4a4b69 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 6 Jan 2025 07:39:43 -0600 Subject: [PATCH 6/9] Use C++ std:: qualified fixed width integral types in C++ implementation --- .../source/dpctl_sycl_queue_interface.cpp | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 3db14451ea..1c4ac2b516 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -31,9 +31,13 @@ #include "dpctl_sycl_device_interface.h" #include "dpctl_sycl_device_manager.h" #include "dpctl_sycl_type_casters.hpp" + +#include +#include + +#include #include #include -#include #include #include /* SYCL headers */ #include @@ -45,49 +49,49 @@ using namespace sycl; switch ((ARGTY)) { \ case DPCTL_INT8_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_UINT8_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_INT16_T: \ { \ - auto la = local_accessor(R, CGH); 
\ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_UINT16_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_INT32_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_UINT32_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_INT64_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ case DPCTL_UINT64_T: \ { \ - auto la = local_accessor(R, CGH); \ + auto la = local_accessor(R, CGH); \ CGH.set_arg(IDX, la); \ return true; \ } \ @@ -119,8 +123,8 @@ using namespace dpctl::syclinterface; typedef struct complex { - uint64_t real; - uint64_t imag; + std::uint64_t real; + std::uint64_t imag; } complexNumber; void set_dependent_events(handler &cgh, @@ -177,28 +181,28 @@ bool set_kernel_arg(handler &cgh, switch (ArgTy) { case DPCTL_INT8_T: - cgh.set_arg(idx, *(int8_t *)Arg); + cgh.set_arg(idx, *(std::int8_t *)Arg); break; case DPCTL_UINT8_T: - cgh.set_arg(idx, *(uint8_t *)Arg); + cgh.set_arg(idx, *(std::uint8_t *)Arg); break; case DPCTL_INT16_T: - cgh.set_arg(idx, *(int16_t *)Arg); + cgh.set_arg(idx, *(std::int16_t *)Arg); break; case DPCTL_UINT16_T: - cgh.set_arg(idx, *(uint16_t *)Arg); + cgh.set_arg(idx, *(std::uint16_t *)Arg); break; case DPCTL_INT32_T: - cgh.set_arg(idx, *(int32_t *)Arg); + cgh.set_arg(idx, *(std::int32_t *)Arg); break; case DPCTL_UINT32_T: - cgh.set_arg(idx, *(uint32_t *)Arg); + cgh.set_arg(idx, *(std::uint32_t *)Arg); break; case DPCTL_INT64_T: - cgh.set_arg(idx, *(int64_t *)Arg); + cgh.set_arg(idx, *(std::int64_t *)Arg); break; case DPCTL_UINT64_T: - cgh.set_arg(idx, *(uint64_t *)Arg); + cgh.set_arg(idx, *(std::uint64_t *)Arg); break; case DPCTL_FLOAT32_T: cgh.set_arg(idx, *(float *)Arg); From 73cd278fcedf92c096c0ac1b0cb431a391fdc0c5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 6 Jan 2025 08:40:35 -0600 Subject: [PATCH 7/9] Use qualified C++ integral types in test_sycl_queue_submit_local_accessor_arg.cpp --- .../tests/test_sycl_queue_submit.cpp | 101 +++++++++--------- ...t_sycl_queue_submit_local_accessor_arg.cpp | 76 ++++++------- 2 files changed, 90 insertions(+), 87 deletions(-) diff --git a/libsyclinterface/tests/test_sycl_queue_submit.cpp b/libsyclinterface/tests/test_sycl_queue_submit.cpp index 21aa7653e3..5cf3a941b9 100644 --- a/libsyclinterface/tests/test_sycl_queue_submit.cpp +++ b/libsyclinterface/tests/test_sycl_queue_submit.cpp @@ -32,17 +32,20 @@ #include "dpctl_sycl_queue_interface.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_sycl_usm_interface.h" + +#include + +#include #include #include #include #include -#include #include #include namespace { -constexpr size_t SIZE = 1024; +constexpr std::size_t SIZE = 1024; static_assert(SIZE % 8 == 0); using namespace dpctl::syclinterface; @@ -51,15 +54,15 @@ template void submit_kernel(DPCTLSyclQueueRef QRef, DPCTLSyclKernelBundleRef KBRef, std::vector spirvBuffer, - size_t spirvFileSize, + std::size_t spirvFileSize, DPCTLKernelArgType kernelArgTy, std::string kernelName) { T scalarVal = 3; - constexpr size_t NARGS = 4; - constexpr size_t RANGE_NDIMS_1 = 1; - constexpr size_t RANGE_NDIMS_2 = 2; - constexpr size_t RANGE_NDIMS_3 = 3; + constexpr std::size_t NARGS = 4; + 
constexpr std::size_t RANGE_NDIMS_1 = 1; + constexpr std::size_t RANGE_NDIMS_2 = 2; + constexpr std::size_t RANGE_NDIMS_3 = 3; ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str())); auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str()); @@ -73,7 +76,7 @@ void submit_kernel(DPCTLSyclQueueRef QRef, ASSERT_TRUE(c != nullptr); // Create kernel args for vector_add - size_t Range[] = {SIZE}; + std::size_t Range[] = {SIZE}; void *args[NARGS] = {unwrap(a), unwrap(b), unwrap(c), (void *)&scalarVal}; DPCTLKernelArgType addKernelArgTypes[] = {DPCTL_VOID_PTR, DPCTL_VOID_PTR, @@ -84,7 +87,7 @@ void submit_kernel(DPCTLSyclQueueRef QRef, ASSERT_TRUE(E1Ref != nullptr); // Create kernel args for vector_add - size_t Range2D[] = {SIZE, 1}; + std::size_t Range2D[] = {SIZE, 1}; DPCTLSyclEventRef DepEvs[] = {E1Ref}; auto E2Ref = DPCTLQueue_SubmitRange(kernel, QRef, args, addKernelArgTypes, NARGS, @@ -92,7 +95,7 @@ void submit_kernel(DPCTLSyclQueueRef QRef, ASSERT_TRUE(E2Ref != nullptr); // Create kernel args for vector_add - size_t Range3D[] = {SIZE, 1, 1}; + std::size_t Range3D[] = {SIZE, 1, 1}; DPCTLSyclEventRef DepEvs2[] = {E1Ref, E2Ref}; auto E3Ref = DPCTLQueue_SubmitRange(kernel, QRef, args, addKernelArgTypes, NARGS, @@ -174,7 +177,7 @@ void submit_kernel( } template -void driver(size_t N) +void driver(std::size_t N) { sycl::queue q; auto *a = sycl::malloc_shared(N, q); @@ -191,19 +194,19 @@ void driver(size_t N) int main(int argc, const char **argv) { - size_t N = 0; + std::size_t N = 0; std::cout << "Enter problem size in N:\n"; std::cin >> N; std::cout << "Executing with N = " << N << std::endl; - driver(N); - driver(N); - driver(N); - driver(N); - driver(N); - driver(N); - driver(N); - driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); + driver(N); driver(N); driver(N); @@ -214,7 +217,7 @@ int main(int argc, const char **argv) struct TestQueueSubmit : public ::testing::Test { std::ifstream spirvFile; - size_t spirvFileSize_; + std::size_t spirvFileSize_; std::vector spirvBuffer_; DPCTLSyclQueueRef QRef = nullptr; DPCTLSyclKernelBundleRef KBRef = nullptr; @@ -255,7 +258,7 @@ struct TestQueueSubmit : public ::testing::Test struct TestQueueSubmitFP64 : public ::testing::Test { std::ifstream spirvFile; - size_t spirvFileSize_; + std::size_t spirvFileSize_; std::vector spirvBuffer_; DPCTLSyclDeviceRef DRef = nullptr; DPCTLSyclQueueRef QRef = nullptr; @@ -294,58 +297,58 @@ struct TestQueueSubmitFP64 : public ::testing::Test TEST_F(TestQueueSubmit, CheckForInt8) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT8_T, - "_ZTS11RangeKernelIaE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT8_T, + "_ZTS11RangeKernelIaE"); } TEST_F(TestQueueSubmit, CheckForUInt8) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT8_T, - "_ZTS11RangeKernelIhE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT8_T, + "_ZTS11RangeKernelIhE"); } TEST_F(TestQueueSubmit, CheckForInt16) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT16_T, - "_ZTS11RangeKernelIsE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT16_T, + "_ZTS11RangeKernelIsE"); } TEST_F(TestQueueSubmit, CheckForUInt16) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT16_T, - 
"_ZTS11RangeKernelItE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT16_T, + "_ZTS11RangeKernelItE"); } TEST_F(TestQueueSubmit, CheckForInt32) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT32_T, - "_ZTS11RangeKernelIiE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT32_T, + "_ZTS11RangeKernelIiE"); } TEST_F(TestQueueSubmit, CheckForUInt32) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT32_T, - "_ZTS11RangeKernelIjE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT32_T, + "_ZTS11RangeKernelIjE"); } TEST_F(TestQueueSubmit, CheckForInt64) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT64_T, - "_ZTS11RangeKernelIlE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT64_T, + "_ZTS11RangeKernelIlE"); } TEST_F(TestQueueSubmit, CheckForUInt64) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT64_T, - "_ZTS11RangeKernelImE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT64_T, + "_ZTS11RangeKernelImE"); } TEST_F(TestQueueSubmit, CheckForFloat) @@ -368,9 +371,9 @@ TEST_F(TestQueueSubmit, CheckForUnsupportedArgTy) { int scalarVal = 3; - size_t Range[] = {SIZE}; - size_t RANGE_NDIMS = 1; - constexpr size_t NARGS = 4; + std::size_t Range[] = {SIZE}; + std::size_t RANGE_NDIMS = 1; + constexpr std::size_t NARGS = 4; auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS11RangeKernelIdE"); void *args[NARGS] = {unwrap(nullptr), unwrap(nullptr), diff --git a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp index 539fdbd4c5..5fd6f628b8 100644 --- a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp +++ b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp @@ -41,7 +41,7 @@ namespace { -constexpr size_t SIZE = 100; +constexpr std::size_t SIZE = 100; using namespace dpctl::syclinterface; @@ -49,12 +49,12 @@ template void submit_kernel(DPCTLSyclQueueRef QRef, DPCTLSyclKernelBundleRef KBRef, std::vector spirvBuffer, - size_t spirvFileSize, + std::size_t spirvFileSize, DPCTLKernelArgType kernelArgTy, std::string kernelName) { - constexpr size_t NARGS = 2; - constexpr size_t RANGE_NDIMS = 1; + constexpr std::size_t NARGS = 2; + constexpr std::size_t RANGE_NDIMS = 1; ASSERT_TRUE(DPCTLKernelBundle_HasKernel(KBRef, kernelName.c_str())); auto kernel = DPCTLKernelBundle_GetKernel(KBRef, kernelName.c_str()); @@ -70,8 +70,8 @@ void submit_kernel(DPCTLSyclQueueRef QRef, auto la1 = MDLocalAccessor{1, kernelArgTy, SIZE / 10, 1, 1}; // Create kernel args for vector_add - size_t gRange[] = {SIZE}; - size_t lRange[] = {SIZE / 10}; + std::size_t gRange[] = {SIZE}; + std::size_t lRange[] = {SIZE / 10}; void *args_1d[NARGS] = {unwrap(a), (void *)&la1}; DPCTLKernelArgType addKernelArgTypes[] = {DPCTL_VOID_PTR, DPCTL_LOCAL_ACCESSOR}; @@ -174,7 +174,7 @@ void submit_kernel(sycl::queue q, const unsigned long N, T *a) } template -void driver(size_t N) +void driver(std::size_t N) { sycl::queue q; auto *a = sycl::malloc_shared(N, q); @@ -185,7 +185,7 @@ void driver(size_t N) int main(int argc, const char **argv) { - size_t N = 0; + std::size_t N = 0; std::cout << "Enter problem size in N:\n"; std::cin >> N; std::cout 
<< "Executing with N = " << N << std::endl; @@ -209,7 +209,7 @@ int main(int argc, const char **argv) struct TestQueueSubmitWithLocalAccessor : public ::testing::Test { std::ifstream spirvFile; - size_t spirvFileSize_; + std::size_t spirvFileSize_; std::vector spirvBuffer_; DPCTLSyclQueueRef QRef = nullptr; DPCTLSyclKernelBundleRef KBRef = nullptr; @@ -250,7 +250,7 @@ struct TestQueueSubmitWithLocalAccessor : public ::testing::Test struct TestQueueSubmitWithLocalAccessorFP64 : public ::testing::Test { std::ifstream spirvFile; - size_t spirvFileSize_; + std::size_t spirvFileSize_; std::vector spirvBuffer_; DPCTLSyclDeviceRef DRef = nullptr; DPCTLSyclQueueRef QRef = nullptr; @@ -289,58 +289,58 @@ struct TestQueueSubmitWithLocalAccessorFP64 : public ::testing::Test TEST_F(TestQueueSubmitWithLocalAccessor, CheckForInt8) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT8_T, - "_ZTS14SyclKernel_SLMIaE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT8_T, + "_ZTS14SyclKernel_SLMIaE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUInt8) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT8_T, - "_ZTS14SyclKernel_SLMIhE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT8_T, + "_ZTS14SyclKernel_SLMIhE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForInt16) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT16_T, - "_ZTS14SyclKernel_SLMIsE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT16_T, + "_ZTS14SyclKernel_SLMIsE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUInt16) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT16_T, - "_ZTS14SyclKernel_SLMItE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT16_T, + "_ZTS14SyclKernel_SLMItE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForInt32) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT32_T, - "_ZTS14SyclKernel_SLMIiE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT32_T, + "_ZTS14SyclKernel_SLMIiE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUInt32) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT32_T, - "_ZTS14SyclKernel_SLMIjE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT32_T, + "_ZTS14SyclKernel_SLMIjE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForInt64) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_INT64_T, - "_ZTS14SyclKernel_SLMIlE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_INT64_T, + "_ZTS14SyclKernel_SLMIlE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUInt64) { - submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, - DPCTLKernelArgType::DPCTL_UINT64_T, - "_ZTS14SyclKernel_SLMImE"); + submit_kernel(QRef, KBRef, spirvBuffer_, spirvFileSize_, + DPCTLKernelArgType::DPCTL_UINT64_T, + "_ZTS14SyclKernel_SLMImE"); } TEST_F(TestQueueSubmitWithLocalAccessor, CheckForFloat) @@ -361,10 +361,10 @@ TEST_F(TestQueueSubmitWithLocalAccessorFP64, CheckForDouble) TEST_F(TestQueueSubmitWithLocalAccessor, CheckForUnsupportedArgTy) { - size_t gRange[] = {SIZE}; - size_t lRange[] = {SIZE / 10}; - size_t 
RANGE_NDIMS = 1; - constexpr size_t NARGS = 2; + std::size_t gRange[] = {SIZE}; + std::size_t lRange[] = {SIZE / 10}; + std::size_t RANGE_NDIMS = 1; + constexpr std::size_t NARGS = 2; auto la = MDLocalAccessor{1, DPCTL_UNSUPPORTED_KERNEL_ARG, SIZE / 10, 1, 1}; auto kernel = DPCTLKernelBundle_GetKernel(KBRef, "_ZTS14SyclKernel_SLMImE"); From 5c92afebe9fcded34d587c5cc2ace37607f49ca2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 6 Jan 2025 12:34:15 -0600 Subject: [PATCH 8/9] Separate include of C header from include of C++ headers by a new line --- libsyclinterface/tests/test_sycl_context_interface.cpp | 4 +++- libsyclinterface/tests/test_sycl_device_aspects.cpp | 4 +++- libsyclinterface/tests/test_sycl_device_interface.cpp | 4 +++- libsyclinterface/tests/test_sycl_device_manager.cpp | 4 +++- libsyclinterface/tests/test_sycl_device_subdevices.cpp | 5 +++-- libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp | 4 +++- libsyclinterface/tests/test_sycl_kernel_interface.cpp | 4 +++- libsyclinterface/tests/test_sycl_queue_interface.cpp | 4 +++- .../tests/test_sycl_queue_submit_local_accessor_arg.cpp | 4 +++- libsyclinterface/tests/test_sycl_usm_interface.cpp | 4 +++- 10 files changed, 30 insertions(+), 11 deletions(-) diff --git a/libsyclinterface/tests/test_sycl_context_interface.cpp b/libsyclinterface/tests/test_sycl_context_interface.cpp index 65995178db..2fda9435c0 100644 --- a/libsyclinterface/tests/test_sycl_context_interface.cpp +++ b/libsyclinterface/tests/test_sycl_context_interface.cpp @@ -29,8 +29,10 @@ #include "dpctl_sycl_device_interface.h" #include "dpctl_sycl_device_selector_interface.h" #include "dpctl_sycl_types.h" -#include + #include + +#include #include #include diff --git a/libsyclinterface/tests/test_sycl_device_aspects.cpp b/libsyclinterface/tests/test_sycl_device_aspects.cpp index a535a66e69..c806cff4fb 100644 --- a/libsyclinterface/tests/test_sycl_device_aspects.cpp +++ b/libsyclinterface/tests/test_sycl_device_aspects.cpp @@ -30,8 +30,10 @@ #include "dpctl_sycl_enum_types.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_utils_helper.h" -#include + #include + +#include #include #include diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp index 5698752afa..5d3c4d810a 100644 --- a/libsyclinterface/tests/test_sycl_device_interface.cpp +++ b/libsyclinterface/tests/test_sycl_device_interface.cpp @@ -29,8 +29,10 @@ #include "dpctl_sycl_platform_interface.h" #include "dpctl_utils.h" #include "dpctl_utils_helper.h" -#include + #include + +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_device_manager.cpp b/libsyclinterface/tests/test_sycl_device_manager.cpp index 745ba5ed1c..b5f1fbc725 100644 --- a/libsyclinterface/tests/test_sycl_device_manager.cpp +++ b/libsyclinterface/tests/test_sycl_device_manager.cpp @@ -30,8 +30,10 @@ #include "dpctl_sycl_device_selector_interface.h" #include "dpctl_utils.h" #include "dpctl_utils_helper.h" -#include + #include + +#include #include using dpctl::syclinterface::dpctl_default_selector; diff --git a/libsyclinterface/tests/test_sycl_device_subdevices.cpp b/libsyclinterface/tests/test_sycl_device_subdevices.cpp index edc832982c..5d5ab7b933 100644 --- a/libsyclinterface/tests/test_sycl_device_subdevices.cpp +++ b/libsyclinterface/tests/test_sycl_device_subdevices.cpp @@ -1,4 +1,3 @@ - //===--- test_sycl_device_interface.cpp - Test cases for device interface 
===// // // Data Parallel Control (dpCtl) @@ -32,8 +31,10 @@ #include "dpctl_sycl_type_casters.hpp" #include "dpctl_utils.h" #include "dpctl_utils_helper.h" -#include + #include + +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp index de755a3b23..aa35856d7f 100644 --- a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp +++ b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp @@ -33,11 +33,13 @@ #include "dpctl_sycl_kernel_bundle_interface.h" #include "dpctl_sycl_kernel_interface.h" #include "dpctl_sycl_queue_interface.h" + +#include + #include #include #include #include -#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_kernel_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_interface.cpp index c2c0d8bda0..dfcbc377a6 100644 --- a/libsyclinterface/tests/test_sycl_kernel_interface.cpp +++ b/libsyclinterface/tests/test_sycl_kernel_interface.cpp @@ -32,9 +32,11 @@ #include "dpctl_sycl_kernel_interface.h" #include "dpctl_sycl_queue_interface.h" #include "dpctl_utils.h" + +#include + #include #include -#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 1de3b90891..d5a5b58bd9 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -33,8 +33,10 @@ #include "dpctl_sycl_queue_interface.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_sycl_usm_interface.h" -#include + #include + +#include #include using namespace sycl; diff --git a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp index 5fd6f628b8..86f37b3c51 100644 --- a/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp +++ b/libsyclinterface/tests/test_sycl_queue_submit_local_accessor_arg.cpp @@ -32,10 +32,12 @@ #include "dpctl_sycl_queue_interface.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_sycl_usm_interface.h" + +#include + #include #include #include -#include #include #include diff --git a/libsyclinterface/tests/test_sycl_usm_interface.cpp b/libsyclinterface/tests/test_sycl_usm_interface.cpp index fc54e3a411..8c3b39a2c7 100644 --- a/libsyclinterface/tests/test_sycl_usm_interface.cpp +++ b/libsyclinterface/tests/test_sycl_usm_interface.cpp @@ -31,9 +31,11 @@ #include "dpctl_sycl_queue_interface.h" #include "dpctl_sycl_type_casters.hpp" #include "dpctl_sycl_usm_interface.h" + +#include + #include #include -#include #include using namespace sycl; From 919d77247899a90fa3e84dfb1db00ba52fc50fe8 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 6 Jan 2025 16:07:53 -0800 Subject: [PATCH 9/9] Add gh-1950 to the changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7138a6a73..12f20edeff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Add support of CV-qualifiers in `is_complex` helper [gh-1900](https://github.com/IntelPython/dpctl/pull/1900) * Tuning work for elementwise functions with modest performance gains (under 10%) [gh-1889](https://github.com/IntelPython/dpctl/pull/1889) * Support for Python 3.13 for `dpctl` [gh-1941](https://github.com/IntelPython/dpctl/pull/1941) +* Change 
libtensor to use `std::size_t` and `dpctl::tensor::ssize_t` throughout and fix missing includes for `std::size_t` and `size_t` [gh-1950](https://github.com/IntelPython/dpctl/pull/1950) ## [0.18.3] - Dec. 07, 2024
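
For readers skimming the series, the convention summarized in the changelog entry above can be illustrated outside the diffs. The snippet below is not part of any patch in this series; it is a minimal sketch (the helper name and its use of std::vector are invented for illustration) of the spelling the series standardizes on: std::-qualified size and fixed-width integer types, with the defining headers included explicitly rather than relying on transitive includes or on the compiler also exposing the unqualified C names.

// Illustrative sketch only -- not taken from the patches above.
#include <cstddef> // std::size_t
#include <cstdint> // std::uint8_t, std::int64_t, ...
#include <vector>

// Hypothetical helper showing the qualified type spellings the series adopts.
std::size_t count_nonzero(const std::vector<std::uint8_t> &mask)
{
    std::size_t n = 0;
    for (std::size_t i = 0; i < mask.size(); ++i) {
        if (mask[i] != 0) {
            ++n;
        }
    }
    return n;
}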