diff --git a/thrust/system/cuda/detail/binary_search.h b/thrust/system/cuda/detail/binary_search.h index 3400515dc..fb769a4ac 100644 --- a/thrust/system/cuda/detail/binary_search.h +++ b/thrust/system/cuda/detail/binary_search.h @@ -1,782 +1,19 @@ -/****************************************************************************** - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ -#pragma once - -#if 0 - -#include - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#if 1 -# define BS_SIMPLE -#endif - -THRUST_NAMESPACE_BEGIN -namespace cuda_cub { - -namespace __binary_search { - - template - struct lbf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct lbf - - template - struct ubf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::upper_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct ubf - - template - struct bsf - { - typedef bool result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION bool - operator()(It begin, It end, T const& value, CompareOp comp) - { - HaystackIt iter = system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp); - - detail::wrapped_function wrapped_comp(comp); - - return iter != end && !wrapped_comp(value, *iter); - } - }; // struct bsf - - template - THRUST_DEVICE_FUNCTION Size - merge_path(KeysIt1 keys1, - KeysIt2 keys2, - Size keys1_count, - Size keys2_count, - Size diag, - BinaryPred binary_pred) - { - typedef typename iterator_traits::value_type key1_type; - typedef typename iterator_traits::value_type key2_type; - - Size keys1_begin = thrust::max(0, diag - keys2_count); - Size keys1_end = thrust::min(diag, keys1_count); - - while (keys1_begin < keys1_end) - { - Size mid = (keys1_begin + keys1_end) >> 1; - key1_type key1 = keys1[mid]; - key2_type key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - if (pred) - { - keys1_end = mid; - } - else - { - keys1_begin = mid + 1; - } - } - return keys1_begin; - } - - template - THRUST_DEVICE_FUNCTION void - serial_merge(It keys_shared, - int keys1_beg, - int keys2_beg, - int keys1_count, - int keys2_count, - T2 (&output)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD], - CompareOp compare_op) - { - int keys1_end = keys1_beg + keys1_count; - int keys2_end = keys2_beg + keys2_count; - - typedef typename iterator_value::type key_type; - - key_type key1 = keys_shared[keys1_beg]; - key_type key2 = keys_shared[keys2_beg]; - - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && - ((keys1_beg >= keys1_end) || - compare_op(key2,key1)); - - output[ITEM] = p ? key2 : key1; - indices[ITEM] = p ? keys2_beg++ : keys1_beg++; - - if (p) - { - key2 = keys_shared[keys2_beg]; - } - else - { - key1 = keys_shared[keys1_beg]; - } - } - } - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; // PtxPolicy - - template - struct Tuning; - - template - struct Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct VectorizedBinarySearchAgent - { - typedef typename iterator_traits::value_type needle_type; - typedef typename iterator_traits::value_type haystack_type; - typedef typename SearchOp::result_type result_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type NeedlesLoadIt; - typedef typename core::LoadIterator::type HaystackLoadIt; - - typedef typename core::BlockLoad::type BlockLoadNeedles; - - typedef typename core::BlockStore::type BlockStoreResult; - - union TempStorage - { - typename BlockLoadNeedles::TempStorage load_needles; - typename BlockStoreResult::TempStorage store_result; - -#ifndef BS_SIMPLE - core::uninitialized_array needles_shared; - core::uninitialized_array result_shared; - core::uninitialized_array indices_shared; -#endif - }; // union TempStorage - }; - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::NeedlesLoadIt NeedlesLoadIt; - typedef typename ptx_plan::HaystackLoadIt HaystackLoadIt; - typedef typename ptx_plan::BlockLoadNeedles BlockLoadNeedles; - typedef typename ptx_plan::BlockStoreResult BlockStoreResult; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - TempStorage& storage; - NeedlesLoadIt needles_load_it; - HaystackLoadIt haystack_load_it; - Size needles_count; - Size haystack_size; - OutputIt result; - CompareOp compare_op; - SearchOp search_op; - - THRUST_DEVICE_FUNCTION - void stable_odd_even_sort(needle_type (&needles)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int I = 0; I < ITEMS_PER_THREAD; ++I) - { -#pragma unroll - for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2) - { - if (compare_op(needles[J + 1], needles[J])) - { - using thrust::swap; - swap(needles[J], needles[J + 1]); - swap(indices[J], indices[J + 1]); - } - } // inner loop - } // outer loop - } - - THRUST_DEVICE_FUNCTION void - block_mergesort(int tid, - int count, - needle_type (&needles_loc)[ITEMS_PER_THREAD], - int (&indices_loc)[ITEMS_PER_THREAD]) - { - using core::sync_threadblock; - - // stable sort items in a single thread - // - stable_odd_even_sort(needles_loc,indices_loc); - - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // -#pragma unroll - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.needles_shared[idx] = needles_loc[ITEM]; - } - - sync_threadblock(); - - int indices[ITEMS_PER_THREAD]; - - int list = ~(coop - 1) & tid; - int start = ITEMS_PER_THREAD * list; - int size = ITEMS_PER_THREAD * (coop >> 1); - - int diag = min(count, ITEMS_PER_THREAD * ((coop - 1) & tid)); +/* +* Copyright 2021 NVIDIA Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ - int keys1_beg = min(count, start); - int keys1_end = min(count, keys1_beg + size); - int keys2_beg = keys1_end; - int keys2_end = min(count, keys2_beg + size); - - int keys1_count = keys1_end - keys1_beg; - int keys2_count = keys2_end - keys2_beg; - - int partition_diag = merge_path(&storage.needles_shared[keys1_beg], - &storage.needles_shared[keys2_beg], - keys1_count, - keys2_count, - diag, - compare_op); - - int keys1_beg_loc = keys1_beg + partition_diag; - int keys1_end_loc = keys1_end; - int keys2_beg_loc = keys2_beg + diag - partition_diag; - int keys2_end_loc = keys2_end; - int keys1_count_loc = keys1_end_loc - keys1_beg_loc; - int keys2_count_loc = keys2_end_loc - keys2_beg_loc; - serial_merge(&storage.needles_shared[0], - keys1_beg_loc, - keys2_beg_loc, - keys1_count_loc, - keys2_count_loc, - needles_loc, - indices, - compare_op); - - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.indices_shared[idx] = indices_loc[ITEM]; - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - indices_loc[ITEM] = storage.indices_shared[indices[ITEM]]; - } - } - } // func block_merge_sort - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size tile_idx, - Size tile_base, - int num_remaining) - { - using core::sync_threadblock; - - needle_type needles_loc[ITEMS_PER_THREAD]; - BlockLoadNeedles(storage.load_needles) - .Load(needles_load_it + tile_base, needles_loc, num_remaining); - -#ifdef BS_SIMPLE - - result_type results_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - results_loc[ITEM] = search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - -#else - - if (IS_LAST_TILE) - { - needle_type max_value = needles_loc[0]; -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) - { - max_value = compare_op(max_value, needles_loc[ITEM]) - ? needles_loc[ITEM] - : max_value; - } - else - { - needles_loc[ITEM] = max_value; - } - } - } - - sync_threadblock(); - - int indices_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - indices_loc[ITEM] = idx; - } - - if (IS_LAST_TILE) - { - block_mergesort(tid, - num_remaining, - needles_loc, - indices_loc); - } - else - { - block_mergesort(tid, - ITEMS_PER_TILE, - needles_loc, - indices_loc); - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = indices_loc[ITEM]; - storage.result_shared[idx] = - search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - sync_threadblock(); - - result_type results_loc[ITEMS_PER_THREAD]; -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - results_loc[ITEM] = storage.result_shared[idx]; - } - - sync_threadblock(); -#endif - - BlockStoreResult(storage.store_result) - .Store(result + tile_base, results_loc, num_remaining); - } - - THRUST_DEVICE_FUNCTION - impl(TempStorage& storage_, - NeedlesIt needles_it_, - HaystackIt haystack_it_, - Size needles_count_, - Size haystack_size_, - OutputIt result_, - CompareOp compare_op_, - SearchOp search_op_) - : storage(storage_), - needles_load_it(core::make_load_iterator(ptx_plan(), needles_it_)), - haystack_load_it(core::make_load_iterator(ptx_plan(), haystack_it_)), - needles_count(needles_count_), - haystack_size(haystack_size_), - result(result_), - compare_op(compare_op_), - search_op(search_op_) - { - int tid = threadIdx.x; - Size tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = min(needles_count - tile_base, ITEMS_PER_TILE); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); - } - else - { - consume_tile(tid, tile_idx, tile_base, items_in_tile); - } - } - }; // struct impl - - - THRUST_AGENT_ENTRY(NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(storage, - needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - } - }; // struct VectorizedBinarySearchAgent - - template - cudaError_t THRUST_RUNTIME_FUNCTION - doit_pass(void* d_temp_storage, - size_t& temp_storage_size, - NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - cudaStream_t stream, - bool debug_sync) - { - if (needles_count == 0) - return cudaErrorNotSupported; - - cudaError_t status = cudaSuccess; - - using core::AgentPlan; - using core::AgentLauncher; - - - typedef AgentLauncher< - VectorizedBinarySearchAgent > - search_agent; - - AgentPlan search_plan = search_agent::get_plan(stream); - - temp_storage_size = 1; - if (d_temp_storage == NULL) - { - return status; - } - - search_agent sa(search_plan, needles_count, stream, "binary_search::search_agent", debug_sync); - sa.launch(needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - return status; - } - - template - OutputIt THRUST_RUNTIME_FUNCTION - doit(execution_policy& policy, - HaystackIt haystack_begin, - HaystackIt haystack_end, - NeedlesIt needles_begin, - NeedlesIt needles_end, - OutputIt result, - CompareOp compare_op, - SearchOp search_op) - { - typedef typename iterator_traits::difference_type size_type; - - size_type needles_count = thrust::distance(needles_begin, needles_end); - size_type haystack_size = thrust::distance(haystack_begin, haystack_end); - - if (needles_count == 0) - return result; - - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - - cudaError status; - status = doit_pass(NULL, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 1st call"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - - status = doit_pass(ptr, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call"); - - status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "binary_search: failed to synchronize"); - - return result + needles_count; - } - - struct less - { - template - THRUST_DEVICE_FUNCTION bool - operator()(const T1& lhs, const T2& rhs) const - { - return lhs < rhs; - } - }; -} // namespace __binary_search - -//------------------------- -// Thrust API entry points -//------------------------- - -__thrust_exec_check_disable__ -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result, - CompareOp compare_op) -{ - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __binary_search::doit(policy, - first, - last, - values_first, - values_last, - result, - compare_op, - __binary_search::lbf()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::lower_bound(cvt_to_seq(derived_cast(policy)), - first, - last, - values_first, - values_last, - result); -#endif - } - return ret; -} - - -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result) -{ - return cuda_cub::lower_bound(policy, - first, - last, - values_first, - values_last, - result, - __binary_search::less()); -} - -} // namespace cuda_cub -THRUST_NAMESPACE_END -#endif +#pragma once -#endif +// this system has no special version of this algorithm diff --git a/thrust/system/cuda/execution_policy.h b/thrust/system/cuda/execution_policy.h index 39bbb7927..c171ac3d9 100644 --- a/thrust/system/cuda/execution_policy.h +++ b/thrust/system/cuda/execution_policy.h @@ -26,59 +26,6 @@ ******************************************************************************/ #pragma once -// histogram -// sort (radix-sort, merge-sort) - #include #include #include - -// pass -// ---------------- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fail -// ---------------- -// fails with mixed types -#include - -// mixed types are not compiling, commented in testing/scan.cu -#include - -// stubs passed -// ---------------- -#include -#include -#include -#include -#include - -// work in progress -