From 75812e3a9f7bc882d6c9ce91b16ec8f0d48d7621 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 9 Nov 2021 13:44:27 -0500 Subject: [PATCH] Overhaul CUB test suite. # Missing test cases Issue #399 reported that we were missing several test cases in CUB that were ifdef'd out. This patch enables most of those tests, though CDP tests are not added here. Some other deficiencies were addressed as they were noticed, for instance, adding value_types other than unsigned char to test_block_histogram. # Removed MINIMAL and BENCHMARK variants The way we split up tests into "BENCHMARK", "MINIMAL", and "THOROUGH" variants wasn't well suited for regression testing, as a lot of redundant code paths were generated between the various test executables. These have been removed, leaving only the "THOROUGH" tests, which should capture all test cases. Benchmarks should go into the new `thrust_benchmark` project. # New %PARAM% mechanism Some tests also took an excessively long time to build, especially after enabling the missing test cases from #399. This patch adds a new mechanism that allows a test to include a comment such as: ``` // %PARAM% TEST_FOO foo 0:1:2 // %PARAM% TEST_BAR bar 4:8 ``` CMake will parse these out, and generate multiple test executables for each combination of parameters, e.g: ``` cub.test.baz.foo_0.bar_4 -DTEST_FOO=0 -DTEST_BAR=4 cub.test.baz.foo_0.bar_8 -DTEST_FOO=0 -DTEST_BAR=8 cub.test.baz.foo_1.bar_4 -DTEST_FOO=1 -DTEST_BAR=4 cub.test.baz.foo_1.bar_8 -DTEST_FOO=1 -DTEST_BAR=8 cub.test.baz.foo_2.bar_4 -DTEST_FOO=2 -DTEST_BAR=4 cub.test.baz.foo_2.bar_8 -DTEST_FOO=2 -DTEST_BAR=8 ``` This can be used to quickly split up problematically large tests. See the note at the top of cub/test/CMakeLists.txt for more details. The PrintNinjaBuildTimes.cmake file from Thrust was used to identify tests that needed to be split. # Remove non-CUB test code Several tests were testing Thrust APIs. This isn't necessary, as Thrust has it's own test suite. These tests have been removed. # Removed `g_repeat` options This isn't needed for regression testing and has been removed. Some of the other command line options could also be removed now that benchmarking isn't handled by these regression tests, but this is a start. # Missing input precondition for block_histogram Extended testing revealed that the cub::BlockHistogram algorithm's behavior is undefined when input values are outside of [0, BINS). Added this info to the algorithm docs. # test_device_histogram runtime Reduced runtime from 15 minutes to ~35 seconds by guarding some writes to stdout with g_verbose and reducing the number of test cases. --- CMakeLists.txt | 3 - CONTRIBUTING.md | 6 - cub/block/block_histogram.cuh | 5 + cub/iterator/tex_ref_input_iterator.cuh | 19 + test/CMakeLists.txt | 210 +++++++--- test/test_block_histogram.cu | 86 ++-- test/test_block_load_store.cu | 32 +- ..._radix_sort.h => test_block_radix_sort.cu} | 66 ++-- test/test_block_radix_sort_160.cu | 61 --- test/test_block_radix_sort_32.cu | 72 ---- test/test_block_radix_sort_64.cu | 61 --- test/test_block_reduce.cu | 124 ++---- test/test_block_scan.cu | 148 +++---- test/test_device_histogram.cu | 316 +++------------ test/test_device_merge_sort.cu | 6 +- test/test_device_radix_sort.cu | 287 ++++---------- test/test_device_reduce.cu | 210 ++-------- test/test_device_reduce_by_key.cu | 186 ++------- test/test_device_run_length_encode.cu | 153 +------- test/test_device_scan.cu | 359 +++-------------- test/test_device_scan_by_key.cu | 350 +++-------------- test/test_device_select_if.cu | 371 +----------------- test/test_device_select_unique.cu | 152 +------ test/test_iterator.cu | 178 --------- test/test_util.h | 10 +- test/test_warp_reduce.cu | 30 +- test/test_warp_scan.cu | 33 +- 27 files changed, 719 insertions(+), 2815 deletions(-) rename test/{test_block_radix_sort.h => test_block_radix_sort.cu} (95%) delete mode 100644 test/test_block_radix_sort_160.cu delete mode 100644 test/test_block_radix_sort_32.cu delete mode 100644 test/test_block_radix_sort_64.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d0cf765e8..8e57218aa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,9 +50,6 @@ endif() option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) -option(CUB_ENABLE_THOROUGH_TESTING "Build CUB thorough test variants." ON) -option(CUB_ENABLE_BENCHMARK_TESTING "Build CUB benchmark test variants." ON) -option(CUB_ENABLE_MINIMAL_TESTING "Build CUB minimal test variants." ON) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) # This is needed for NVCXX QA, which requires a static set of executable names. # Only a single dialect may be enabled when this is off. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d901098b1..f02910dffb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -43,12 +43,6 @@ The configuration options for CUB are: - Whether to test compile public headers. Default is `ON`. - `CUB_ENABLE_TESTING={ON, OFF}` - Whether to build unit tests. Default is `ON`. -- `CUB_ENABLE_THOROUGH_TESTS={ON, OFF}` - - Whether to build the thorough test variants. -- `CUB_ENABLE_BENCHMARK_TESTS={ON, OFF}` - - Whether to build the benchmark test variants. -- `CUB_ENABLE_MINIMAL_TESTS={ON, OFF}` - - Whether to build the minimal test variants. - `CUB_ENABLE_EXAMPLES={ON, OFF}` - Whether to build examples. Default is `ON`. - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}` diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh index 02f6e83f23..b3971b6484 100644 --- a/cub/block/block_histogram.cuh +++ b/cub/block/block_histogram.cuh @@ -99,6 +99,10 @@ enum BlockHistogramAlgorithm * \par Overview * - A histogram * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - The `T` type must be implicitly castable to an integer type. + * - BlockHistogram expects each integral `input[i]` value to satisfy + * `0 <= input[i] < BINS`. Values outside of this range result in undefined + * behavior. * - BlockHistogram can be optionally specialized to use different algorithms: * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) @@ -136,6 +140,7 @@ enum BlockHistogramAlgorithm * \endcode * * \par Performance and Usage Considerations + * - All input values must fall between [0, BINS), or behavior is undefined. * - The histogram output can be constructed in shared or device-accessible memory * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives * diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh index 2080bd41dc..1f565c71b1 100644 --- a/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/iterator/tex_ref_input_iterator.cuh @@ -76,6 +76,11 @@ struct CUB_DEPRECATED IteratorTexRef CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 1215 #endif /// And by unique ID @@ -141,6 +146,10 @@ template typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; // Re-enable deprecation warnings: +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#endif + #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC #pragma warning(default:4996) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \ @@ -233,6 +242,11 @@ class CUB_DEPRECATED TexRefInputIterator #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #endif +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 1215 +#endif + public: // Required iterator traits @@ -399,6 +413,11 @@ public: } // Re-enable deprecation warnings: + +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#endif + #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC #pragma warning(default:4996) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 444e0de6e1..26ceb8ef27 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,3 +1,116 @@ +# Note on "%PARAM%" comments: +# +# CUB's legacy tests are often very large and need to be split into multiple +# executables to take advantage of parallel computation and reduce memory usage. +# +# To help split these tests up, a test may define comments such as: +# // %PARAM% [definition] [label] [value]:[...]:[value] +# CMake will parse the file and extract these comments, using them to generate +# multiple test executables for the full cartesian product of values. +# - `definition` will be used as a preprocessor definition name. By convention, +# these begin with `TEST_`. +# - `label` is a short, human-readable label that will be used in the test +# executable's name to identify the variant. +# - `values` are a colon-separated list of values used during test generation. +# Only numeric values have been tested. +# +# For example, if `test_baz.cu` contains the following lines: +# +# ``` +# // %PARAM% TEST_FOO foo 0:1:2 +# // %PARAM% TEST_BAR bar 4:8 +# ``` +# +# Six executables and CTest targets will be generated with unique definitions: +# +# ``` +# cub.test.baz.foo_0.bar_4 -DTEST_FOO=0 -DTEST_BAR=4 +# cub.test.baz.foo_0.bar_8 -DTEST_FOO=0 -DTEST_BAR=8 +# cub.test.baz.foo_1.bar_4 -DTEST_FOO=1 -DTEST_BAR=4 +# cub.test.baz.foo_1.bar_8 -DTEST_FOO=1 -DTEST_BAR=8 +# cub.test.baz.foo_2.bar_4 -DTEST_FOO=2 -DTEST_BAR=4 +# cub.test.baz.foo_2.bar_8 -DTEST_FOO=2 -DTEST_BAR=8 +# ``` +# +# There is no limit on how many %PARAM% entries may exist nor the number of +# values, but be wary of the combinatorial explosion lurking in the number of +# generated executables. +# +# Ideally, only parameters that directly influence kernel template parameters +# should be split out in this way. If changing a parameter doesn't change +# the kernel template type, the same kernel will be compiled into multiple +# executables. This defeats the purpose of splitting up the test since the +# compiler will generate redundant code across the new split executables. +# +# The best candidate parameters for splitting are input value types, rather than +# integral parameters like BLOCK_THREADS, etc. Splitting by value type allows +# more infrastructure (data generation, validation) to be reused and will save +# more time. Over-splitting non-type parameters can cause build times to +# increase since that infrastructure has to be rebuilt for each TU. +# +# Note that CMake will need to be re-run when the %PARAM% comments change -- +# it will not automatically detect changes during incremental builds. +# +# The function below reads the filepath `src`, extracts the %PARAM% comments, +# and fills `labels_var` with a list of `label1_value1.label2_value2...` +# strings, and puts the corresponding `DEFINITION=value1:DEFINITION=value2` +# entries into `defs_var`. +function(cub_get_test_params src labels_var defs_var) + file(READ "${src}" file_data) + string(REGEX MATCHALL + "//[ ]+%PARAM%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)" + matches + "${file_data}" + ) + + set(variant_labels) + set(variant_defs) + + foreach(match IN LISTS matches) + string(REGEX MATCH + "//[ ]+%PARAM%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)" + unused + "${match}" + ) + set(def ${CMAKE_MATCH_1}) + set(label ${CMAKE_MATCH_2}) + set(values "${CMAKE_MATCH_3}") + string(REPLACE ":" ";" values "${values}") + + if (NOT variant_labels) + foreach(value IN LISTS values) + list(APPEND variant_labels ${label}_${value}) + endforeach() + else() + set(tmp_labels) + foreach(old_label IN LISTS variant_labels) + foreach(value IN LISTS values) + list(APPEND tmp_labels ${old_label}.${label}_${value}) + endforeach() + endforeach() + set(variant_labels "${tmp_labels}") + endif() + + if (NOT variant_defs) + foreach(value IN LISTS values) + list(APPEND variant_defs ${def}=${value}) + endforeach() + else() + set(tmp_defs) + foreach(old_def IN LISTS variant_defs) + foreach(value IN LISTS values) + list(APPEND tmp_defs ${old_def}:${def}=${value}) + endforeach() + endforeach() + set(variant_defs "${tmp_defs}") + endif() + + endforeach() + + set(${labels_var} "${variant_labels}" PARENT_SCOPE) + set(${defs_var} "${variant_defs}" PARENT_SCOPE) +endfunction() + # Create meta targets that build all tests for a single configuration: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) @@ -58,80 +171,53 @@ function(cub_add_test target_name_var test_name test_src cub_target) ) endfunction() -# Sets HAS_BENCHMARK_VARIANT / HAS_MINIMAL_VARIANT / NO_VARIANTS to True/False in -# the calling scope. -# Used to detect variants of unit tests depending on whether a source file -# contains the strings "CUB_TEST_BENCHMARK" or "CUB_TEST_MINIMAL". -function(cub_check_for_test_variants src) - file(READ "${src}" data) - - string(FIND "${data}" "CUB_TEST_BENCHMARK" benchmark_loc) - set(HAS_BENCHMARK_VARIANT False PARENT_SCOPE) - if (NOT benchmark_loc EQUAL -1) - set(HAS_BENCHMARK_VARIANT True PARENT_SCOPE) - endif() - - string(FIND "${data}" "CUB_TEST_MINIMAL" minimal_loc) - set(HAS_MINIMAL_VARIANT False PARENT_SCOPE) - if (NOT minimal_loc EQUAL -1) - set(HAS_MINIMAL_VARIANT True PARENT_SCOPE) - endif() - - set(NO_VARIANTS False PARENT_SCOPE) - if (NOT (HAS_BENCHMARK_VARIANT OR HAS_MINIMAL_VARIANT)) - set(NO_VARIANTS True PARENT_SCOPE) - endif() -endfunction() - foreach (test_src IN LISTS test_srcs) - # TODO: Per-test flags. - get_filename_component(test_name "${test_src}" NAME_WE) string(REGEX REPLACE "^test_" "" test_name "${test_name}") - # Some tests change behavior based on whether the compiler defs BENCHMARK - # and/or MINIMAL_TEST are defined. Detect these and build variants for each - # configuration: - cub_check_for_test_variants("${test_src}") + cub_get_test_params("${test_src}" variant_labels variant_defs) + list(LENGTH variant_labels num_variants) + + # Subtract 1 to support the inclusive endpoint of foreach(...RANGE...): + math(EXPR range_end "${num_variants} - 1") + + # Verbose output: + if (NOT num_variants EQUAL 0) + message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':") + foreach(var_idx RANGE ${range_end}) + math(EXPR i "${var_idx} + 1") + list(GET variant_labels ${var_idx} label) + list(GET variant_defs ${var_idx} defs) + message(VERBOSE " ${i}: ${test_name} ${label} ${defs}") + endforeach() + endif() foreach(cub_target IN LISTS CUB_TARGETS) - if (NO_VARIANTS) + cub_get_target_property(config_prefix ${cub_target} PREFIX) + + if (num_variants EQUAL 0) # Only one version of this test. cub_add_test(test_target ${test_name} "${test_src}" ${cub_target}) else() - # Multiple test variants requested, so we need to give the targets and - # binaries suffixes. - - if (CUB_ENABLE_THOROUGH_TESTING) - cub_add_test(test_target_thorough - ${test_name}.thorough + # Meta target to build all parametrizations of the current test: + set(variant_meta_target ${config_prefix}.test.${test_name}.all) + add_custom_target(${variant_meta_target}) + + # Generate multiple tests, one per variant. + # See `cub_get_test_params` for details. + foreach(var_idx RANGE ${range_end}) + list(GET variant_labels ${var_idx} label) + list(GET variant_defs ${var_idx} defs) + string(REPLACE ":" ";" defs "${defs}") + + cub_add_test(test_target + ${test_name}.${label} "${test_src}" ${cub_target} ) - target_compile_definitions(${test_target_thorough} PRIVATE CUB_TEST_THOROUGH) - endif() - - if (CUB_ENABLE_BENCHMARK_TESTING) - if (HAS_BENCHMARK_VARIANT) - cub_add_test(test_target_benchmark - ${test_name}.benchmark - "${test_src}" - ${cub_target} - ) - target_compile_definitions(${test_target_benchmark} PRIVATE CUB_TEST_BENCHMARK) - endif() - endif() - - if (CUB_ENABLE_MINIMAL_TESTING) - if (HAS_MINIMAL_VARIANT) - cub_add_test(test_target_minimal - ${test_name}.minimal - "${test_src}" - ${cub_target} - ) - target_compile_definitions(${test_target_minimal} PRIVATE CUB_TEST_MINIMAL) - endif() - endif() + add_dependencies(${variant_meta_target} ${test_target}) + target_compile_definitions(${test_target} PRIVATE ${defs}) + endforeach() endif() endforeach() endforeach() diff --git a/test/test_block_histogram.cu b/test/test_block_histogram.cu index bafcd9021a..d6359e6dd4 100644 --- a/test/test_block_histogram.cu +++ b/test/test_block_histogram.cu @@ -33,11 +33,6 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include -#include -#include - #include #include #include @@ -45,6 +40,12 @@ #include "test_util.h" +#include +#include +#include +#include +#include + using namespace cub; @@ -53,8 +54,6 @@ using namespace cub; //--------------------------------------------------------------------- bool g_verbose = false; -int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -90,6 +89,19 @@ __global__ void BlockHistogramKernel( BlockHistogram(temp_storage).Histogram(data, d_histogram); } +// WAR warning "pointless comparison of unsigned with zero" +template +typename std::enable_if::value, T>::type +clamp_input(T val, int bins) +{ + return val % bins; +} +template +typename std::enable_if::value, T>::type +clamp_input(T val, int bins) +{ + return (val < 0 ? -val : val) % bins; +} /** * Initialize problem (and solution) @@ -109,20 +121,31 @@ void Initialize( h_histograms_linear[bin] = 0; } - if (g_verbose) printf("Samples: \n"); + if (g_verbose) + { + printf("Samples: \n"); + } // Initialize interleaved channel samples and histogram them correspondingly for (int i = 0; i < num_samples; ++i) { - InitValue(gen_mode, h_samples[i], i); - h_samples[i] %= BINS; + SampleT sample; + InitValue(gen_mode, sample, i); + sample = clamp_input(sample, BINS); - if (g_verbose) std::cout << CoutCast(h_samples[i]) << ", "; + if (g_verbose) + { + std::cout << CoutCast(sample) << ", "; + } - h_histograms_linear[h_samples[i]]++; + h_samples[i] = sample; + h_histograms_linear[sample]++; } - if (g_verbose) printf("\n\n"); + if (g_verbose) + { + printf("\n\n"); + } } @@ -252,9 +275,16 @@ void Test() Test(); } - - - +/** + * Test different BINS + */ +template +void Test() +{ + Test(); + Test(); + Test(); +} //--------------------------------------------------------------------- // Main @@ -268,15 +298,12 @@ int main(int argc, char** argv) // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " - "[--n= " "[--device=] " - "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); @@ -285,23 +312,10 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); -#ifdef CUB_TEST_BENCHMARK - - // Compile/run quick tests - Test(RANDOM); - Test(RANDOM); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - Test(); - Test(); - Test(); - } - -#endif + Test(); + Test(); + Test(); + Test(); return 0; } diff --git a/test/test_block_load_store.cu b/test/test_block_load_store.cu index ec1c2bd02c..9096f529b7 100644 --- a/test/test_block_load_store.cu +++ b/test/test_block_load_store.cu @@ -388,15 +388,9 @@ void TestPointerType( typedef BlockLoad BlockLoad; typedef BlockStore BlockStore; -#if defined(SM100) || defined(SM110) || defined(SM130) - static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 16; - static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 16; - static const bool sufficient_threads = BLOCK_THREADS <= 512; -#else static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 48; static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48; static const bool sufficient_threads = BLOCK_THREADS <= 1024; -#endif static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads; @@ -474,8 +468,6 @@ void TestItemsPerThread( Int2Type is_warp_multiple; TestStrategy(grid_size, fraction_valid, is_warp_multiple); - TestStrategy(grid_size, fraction_valid, is_warp_multiple); - TestStrategy(grid_size, fraction_valid, is_warp_multiple); TestStrategy(grid_size, fraction_valid, is_warp_multiple); } @@ -483,19 +475,17 @@ void TestItemsPerThread( /** * Evaluate different thread block sizes */ -template + template void TestThreads( int grid_size, float fraction_valid) -{ + { TestItemsPerThread(grid_size, fraction_valid); TestItemsPerThread(grid_size, fraction_valid); - TestItemsPerThread(grid_size, fraction_valid); - TestItemsPerThread(grid_size, fraction_valid); + TestItemsPerThread(grid_size, fraction_valid); TestItemsPerThread(grid_size, fraction_valid); } - /** * Main */ @@ -522,25 +512,19 @@ int main(int argc, char** argv) int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); -#ifdef CUB_TEST_BENCHMARK - - // Compile/run quick tests - TestNative< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type()); - TestIterator< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type()); - -#else + // %PARAM% TEST_VALUE_TYPES types 0:1:2 // Compile/run thorough tests +#if TEST_VALUE_TYPES == 0 TestThreads(2, 0.8f); TestThreads(2, 0.8f); TestThreads(2, 0.8f); +#elif TEST_VALUE_TYPES == 1 TestThreads(2, 0.8f); - - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - TestThreads(2, 0.8f); + TestThreads(2, 0.8f); +#elif TEST_VALUE_TYPES == 2 TestThreads(2, 0.8f); TestThreads(2, 0.8f); - #endif return 0; diff --git a/test/test_block_radix_sort.h b/test/test_block_radix_sort.cu similarity index 95% rename from test/test_block_radix_sort.h rename to test/test_block_radix_sort.cu index 6de78cd81c..8a309cb9f8 100644 --- a/test/test_block_radix_sort.h +++ b/test/test_block_radix_sort.cu @@ -486,11 +486,7 @@ void Test() // Check size of smem storage for the target arch to make sure it will fit typedef BlockRadixSort BlockRadixSortT; -#if defined(SM100) || defined(SM110) || defined(SM130) - Int2Type fits_smem_capacity; -#else Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity; -#endif // Sort-ascending, to-striped TestValid(fits_smem_capacity); @@ -554,12 +550,6 @@ template < BlockScanAlgorithm INNER_SCAN_ALGORITHM> void Test() { - // Get ptx version - int ptx_version = 0; - CubDebugExit(PtxVersion(ptx_version)); - -#ifdef TEST_KEYS_ONLY - // Test unsigned types with keys-only TestKeys(); TestKeys(); @@ -567,8 +557,6 @@ void Test() TestKeys(); TestKeys(); -#else - // Test signed and fp types with paired values TestKeysAndPairs(); TestKeysAndPairs(); @@ -576,13 +564,7 @@ void Test() TestKeysAndPairs(); TestKeysAndPairs(); TestKeysAndPairs(); - if (ptx_version > 120) - { - // Don't check doubles on PTX120 or below because they're down-converted - TestKeysAndPairs(); - } - -#endif + TestKeysAndPairs(); } @@ -614,32 +596,32 @@ void Test() Test(); } - /** - * Test radix bits + * Main */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD> -void Test() +int main(int argc, char** argv) { - Test(); - Test(); - Test(); -} + // Initialize command line + CommandLineArgs args(argc, argv); + g_verbose = args.CheckCmdLineFlag("v"); + // Print usage + if (args.CheckCmdLineFlag("help")) + { + printf("%s " + "[--device=] " + "[--v] " + "\n", argv[0]); + exit(0); + } -/** - * Test items per thread - */ -template -void Test() -{ - Test(); -#if defined(SM100) || defined(SM110) || defined(SM130) - // Open64 compiler can't handle the number of test cases -#else - Test(); -#endif - Test(); + // Initialize device + CubDebugExit(args.DeviceInit()); + + // %PARAM% TEST_BLOCK_THREADS threads 32:160 + // %PARAM% TEST_ITEMS_PER_BLOCK items 1:11 + // %PARAM% TEST_BITS_PER_DIGIT digit 1:5 + Test(); + + return 0; } diff --git a/test/test_block_radix_sort_160.cu b/test/test_block_radix_sort_160.cu deleted file mode 100644 index 4ffd80f4c7..0000000000 --- a/test/test_block_radix_sort_160.cu +++ /dev/null @@ -1,61 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Test of BlockRadixSort utilities - ******************************************************************************/ - -#include "test_block_radix_sort.h" - -/** - * Main - */ -int main(int argc, char** argv) -{ - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--device=] " - "[--v] " - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - - // Compile/run thorough tests - Test<160>(); - - return 0; -} diff --git a/test/test_block_radix_sort_32.cu b/test/test_block_radix_sort_32.cu deleted file mode 100644 index e80d4566c4..0000000000 --- a/test/test_block_radix_sort_32.cu +++ /dev/null @@ -1,72 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Test of BlockRadixSort utilities - ******************************************************************************/ - -#include "test_block_radix_sort.h" - -/** - * Main - */ -int main(int argc, char** argv) -{ - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--device=] " - "[--v] " - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - -#ifdef CUB_TEST_BENCHMARK - - { - typedef float T; - TestDriver<32, 4, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(INTEGER_SEED, 0, 0, sizeof(T) * 8); - } - -#else - - // Compile/run thorough tests - Test<32>(); - -#endif // CUB_TEST_BENCHMARK - - return 0; -} diff --git a/test/test_block_radix_sort_64.cu b/test/test_block_radix_sort_64.cu deleted file mode 100644 index 11ee2866e9..0000000000 --- a/test/test_block_radix_sort_64.cu +++ /dev/null @@ -1,61 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Test of BlockRadixSort utilities - ******************************************************************************/ - -#include "test_block_radix_sort.h" - -/** - * Main - */ -int main(int argc, char** argv) -{ - // Initialize command line - CommandLineArgs args(argc, argv); - g_verbose = args.CheckCmdLineFlag("v"); - - // Print usage - if (args.CheckCmdLineFlag("help")) - { - printf("%s " - "[--device=] " - "[--v] " - "\n", argv[0]); - exit(0); - } - - // Initialize device - CubDebugExit(args.DeviceInit()); - - // Compile/run thorough tests - Test<64>(); - - return 0; -} diff --git a/test/test_block_reduce.cu b/test/test_block_reduce.cu index 81d9f31a31..bad7c8af86 100644 --- a/test/test_block_reduce.cu +++ b/test/test_block_reduce.cu @@ -54,7 +54,6 @@ using namespace cub; //--------------------------------------------------------------------- bool g_verbose = false; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -163,12 +162,7 @@ __global__ void FullTileReduceKernel( T block_aggregate = DeviceTest(block_reduce, data, reduction_op); // Stop cycle timer - #if CUB_PTX_ARCH == 100 - // Bug: recording stop clock causes mis-write of running prefix value - clock_t stop = 0; -#else clock_t stop = clock(); -#endif // CUB_PTX_ARCH == 100 clock_t elapsed = (start > stop) ? start - stop : stop - start; // Loop over input tiles @@ -189,12 +183,7 @@ __global__ void FullTileReduceKernel( T tile_aggregate = DeviceTest(block_reduce, data, reduction_op); // Stop cycle timer -#if CUB_PTX_ARCH == 100 - // Bug: recording stop clock causes mis-write of running prefix value - clock_t stop = 0; -#else clock_t stop = clock(); -#endif // CUB_PTX_ARCH == 100 elapsed += (start > stop) ? start - stop : stop - start; // Reduce thread block aggregate @@ -255,12 +244,7 @@ __global__ void PartialTileReduceKernel( T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items); // Stop cycle timer -#if CUB_PTX_ARCH == 100 - // Bug: recording stop clock causes mis-write of running prefix value - clock_t stop = 0; -#else clock_t stop = clock(); -#endif // CUB_PTX_ARCH == 100 clock_t elapsed = (start > stop) ? start - stop : stop - start; @@ -685,17 +669,11 @@ template < void Test( ReductionOp reduction_op) { - (void)reduction_op; -#ifdef TEST_RAKING - Test(reduction_op); - Test(reduction_op); -#endif -#ifdef TEST_WARP_REDUCTIONS - Test(reduction_op); -#endif + Test(reduction_op); + Test(reduction_op); + Test(reduction_op); } - /** * Run battery of tests for different block sizes */ @@ -707,10 +685,8 @@ void Test( { Test<7, T>(reduction_op); Test<32, T>(reduction_op); - Test<63, T>(reduction_op); - Test<97, T>(reduction_op); + Test<65, T>(reduction_op); Test<128, T>(reduction_op); - Test<238, T>(reduction_op); } @@ -724,6 +700,13 @@ void Test() Test(Max()); } +template +void Test() +{ + Test(Sum()); + Test(Max()); +} + /** * Main @@ -733,14 +716,12 @@ int main(int argc, char** argv) // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " - "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); @@ -749,61 +730,34 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); - // Get ptx version - int ptx_version = 0; - CubDebugExit(PtxVersion(ptx_version)); - -#ifdef CUB_TEST_BENCHMARK - - // Compile/run quick tests - - - printf("\n full tile ------------------------\n\n"); - - TestFullTile(RANDOM, 1, Sum()); - TestFullTile(RANDOM, 1, Sum()); - TestFullTile(RANDOM, 1, Sum()); - - TestFullTile(RANDOM, 1, Sum()); - TestFullTile(RANDOM, 1, Sum()); - TestFullTile(RANDOM, 1, Sum()); - - printf("\n partial tile ------------------------\n\n"); - - TestPartialTile(RANDOM, 7, Sum()); - TestPartialTile(RANDOM, 7, Sum()); - TestPartialTile(RANDOM, 7, Sum()); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // primitives - Test(); - Test(); - Test(); - Test(); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(); - - Test(); - - // vector types - Test(); - Test(); - Test(); - Test(); - - Test(); - Test(); - Test(); - Test(); - - // Complex types - Test(); - Test(); - } + // %PARAM% TEST_VALUE_TYPES types 0:1:2:3 + + // primitives +#if TEST_VALUE_TYPES == 0 + Test(); + Test(); + Test(); + Test(); +#elif TEST_VALUE_TYPES == 1 + Test(); + Test(); + + // vector types + Test(); + Test(); +#elif TEST_VALUE_TYPES == 2 + Test(); + Test(); + + Test(); + Test(); +#elif TEST_VALUE_TYPES == 3 + Test(); + Test(); + + // Complex types + Test(); + Test(); #endif diff --git a/test/test_block_scan.cu b/test/test_block_scan.cu index 03592c8929..d358f7f338 100644 --- a/test/test_block_scan.cu +++ b/test/test_block_scan.cu @@ -54,7 +54,6 @@ using namespace cub; //--------------------------------------------------------------------- bool g_verbose = false; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -728,18 +727,9 @@ void Test( ScanOpT scan_op, T initial_value) { - (void)gen_mode; - (void)scan_op; - (void)initial_value; -#ifdef TEST_RAKING Test(gen_mode, scan_op, initial_value); -#endif -#ifdef TEST_RAKING_MEMOIZE Test(gen_mode, scan_op, initial_value); -#endif -#ifdef TEST_WARP_SCANS Test(gen_mode, scan_op, initial_value); -#endif } @@ -795,70 +785,24 @@ void Test( Test(RANDOM, scan_op, identity, initial_value); } - -/** - * Run tests for different data types and scan ops - */ -template < - int BLOCK_THREADS, - int ITEMS_PER_THREAD> -void Test() +// Dispatch ITEMS_PER_THREAD +template +void Test(ScanOpT op, T identity, T initial_value) { - // Get ptx version - int ptx_version = 0; - CubDebugExit(PtxVersion(ptx_version)); - - // primitive - Test(Sum(), (unsigned char) 0, (unsigned char) 99); - Test(Sum(), (unsigned short) 0, (unsigned short) 99); - Test(Sum(), (unsigned int) 0, (unsigned int) 99); - Test(Sum(), (unsigned long long) 0, (unsigned long long) 99); - Test(Sum(), (float) 0, (float) 99); - - // primitive (alternative scan op) - Test(Max(), std::numeric_limits::min(), (char) 99); - Test(Max(), std::numeric_limits::min(), (short) 99); - Test(Max(), std::numeric_limits::min(), (int) 99); - Test(Max(), std::numeric_limits::min(), (long long) 99); - - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - Test(Max(), std::numeric_limits::max() * -1, (double) 99); - - // vec-1 - Test(Sum(), make_uchar1(0), make_uchar1(17)); - - // vec-2 - Test(Sum(), make_uchar2(0, 0), make_uchar2(17, 21)); - Test(Sum(), make_ushort2(0, 0), make_ushort2(17, 21)); - Test(Sum(), make_uint2(0, 0), make_uint2(17, 21)); - Test(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21)); - - // vec-4 - Test(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); - Test(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85)); - Test(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85)); - Test(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85)); - - // complex - Test(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85)); - Test(Sum(), TestBar(0, 0), TestBar(17, 21)); - + Test(op, identity, initial_value); + Test(op, identity, initial_value); } - -/** - * Run tests for different items per thread - */ -template -void Test() +// Dispatch BLOCK_THREADS +template +void Test(ScanOpT op, T identity, T initial_value) { - Test(); - Test(); - Test(); + Test<17>(op, identity, initial_value); + Test<32>(op, identity, initial_value); + Test<65>(op, identity, initial_value); + Test<96>(op, identity, initial_value); } - - /** * Main */ @@ -867,14 +811,12 @@ int main(int argc, char** argv) // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " - "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); @@ -883,32 +825,58 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); -#ifdef CUB_TEST_BENCHMARK + // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5:6:7:8:9 - Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0)); +#if TEST_VALUE_TYPES == 0 - // Compile/run quick tests - Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0)); - Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), int(0)); - Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING_MEMOIZE>(UNIFORM, Sum(), int(0)); + // primitive + Test(Sum(), static_cast(0), static_cast(99)); + Test(Sum(), static_cast(0), static_cast(99)); +#elif TEST_VALUE_TYPES == 1 + Test(Sum(), static_cast(0), static_cast(99)); + Test(Sum(), static_cast(0), static_cast(99)); - Test<128, 1, 1, 2, INCLUSIVE, PREFIX, BLOCK_SCAN_RAKING>(INTEGER_SEED, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85)); - Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), make_longlong4(17, 21, 32, 85)); +#elif TEST_VALUE_TYPES == 2 + // primitive (alternative scan op) + Test(Max(), std::numeric_limits::lowest(), static_cast(99)); + Test(Max(), std::numeric_limits::lowest(), static_cast(99)); +#elif TEST_VALUE_TYPES == 3 + Test(Max(), std::numeric_limits::lowest(), static_cast(99)); + Test(Max(), std::numeric_limits::lowest(), static_cast(99)); -#else +#elif TEST_VALUE_TYPES == 4 - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Run tests for different thread block sizes - Test<17>(); - Test<32>(); - Test<62>(); - Test<65>(); -// Test<96>(); // TODO: file bug for UNREACHABLE error for Test<96, 9, BASIC, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), NullType(), make_ulonglong2(17, 21)); - Test<128>(); - } + // Floats + Test(Sum(), static_cast(0), static_cast(99)); + Test(Max(), std::numeric_limits::lowest(), static_cast(99)); + +#elif TEST_VALUE_TYPES == 5 + + // vec-1 + Test(Sum(), make_uchar1(0), make_uchar1(17)); + + // vec-2 + Test(Sum(), make_uchar2(0, 0), make_uchar2(17, 21)); + Test(Sum(), make_ushort2(0, 0), make_ushort2(17, 21)); +#elif TEST_VALUE_TYPES == 6 + Test(Sum(), make_uint2(0, 0), make_uint2(17, 21)); + Test(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21)); + +#elif TEST_VALUE_TYPES == 7 + + // vec-4 + Test(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); + Test(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85)); +#elif TEST_VALUE_TYPES == 8 + Test(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85)); + Test(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85)); + +#elif TEST_VALUE_TYPES == 9 + + // complex + Test(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85)); + Test(Sum(), TestBar(0, 0), TestBar(17, 21)); #endif diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu index 7db4a8cd70..399da69a4f 100644 --- a/test/test_device_histogram.cu +++ b/test/test_device_histogram.cu @@ -63,7 +63,6 @@ enum Backend bool g_verbose_input = false; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -585,7 +584,12 @@ void TestEven( NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n"; + { + std::cout << "\tChannel " << channel << ": " + << num_levels[channel] - 1 << " bins " + << "[" << lower_level[channel] << ", " + << upper_level[channel] << ")\n"; + } fflush(stdout); // Allocate and initialize host and device data @@ -837,11 +841,17 @@ void TestRange( std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { - printf("Channel %d: %d bins [", channel, num_levels[channel] - 1); - std::cout << levels[channel][0]; - for (int level = 1; level < num_levels[channel]; ++level) - std::cout << ", " << levels[channel][level]; - printf("]\n"); + printf("Channel %d: %d bins", channel, num_levels[channel] - 1); + if (g_verbose) + { + std::cout << "[ " << levels[channel][0]; + for (int level = 1; level < num_levels[channel]; ++level) + { + std::cout << ", " << levels[channel][level]; + } + printf("]"); + } + printf("\n"); } fflush(stdout); @@ -1139,15 +1149,6 @@ void Test( { int num_levels[NUM_ACTIVE_CHANNELS]; -// Unnecessary testing -// // All the same level -// for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) -// { -// num_levels[channel] = max_num_levels; -// } -// Test( -// num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); - // All different levels num_levels[0] = max_num_levels; for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel) @@ -1177,9 +1178,6 @@ void Test( LevelT max_level, int max_num_levels) { - Test( - num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels); - Test( num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels); @@ -1249,19 +1247,6 @@ void Test( cols, rows, max_level, max_num_levels); } } - - // Randomly select linear problem size between 1:10,000,000 - unsigned int max_int = (unsigned int) -1; - for (int i = 0; i < 4; ++i) - { - unsigned int num_items; - RandomBits(num_items); - num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int)); - num_items = CUB_MAX(1, num_items); - - Test( - OffsetT(num_items), 1, max_level, max_num_levels); - } } @@ -1269,82 +1254,62 @@ void Test( /** * Test different channel interleavings (valid specialiation) */ -template < - typename SampleT, - typename CounterT, - typename LevelT, - typename OffsetT> -void TestChannels( - LevelT max_level, - int max_num_levels, - Int2Type /*is_valid_tag*/) +template +void TestChannels(LevelT max_level, + int max_num_levels, + Int2Type /*is_valid_tag*/, + Int2Type /*test_extra_channels*/) { - Test(max_level, max_num_levels); - Test(max_level, max_num_levels); - Test(max_level, max_num_levels); - Test(max_level, max_num_levels); + Test(max_level, max_num_levels); + Test(max_level, max_num_levels); } - -/** - * Test different channel interleavings (invalid specialiation) - */ -template < - typename SampleT, - typename CounterT, - typename LevelT, - typename OffsetT> -void TestChannels( - LevelT /*max_level*/, - int /*max_num_levels*/, - Int2Type /*is_valid_tag*/) +template +void TestChannels(LevelT max_level, + int max_num_levels, + Int2Type /*is_valid_tag*/, + Int2Type /*test_extra_channels*/) +{ + Test(max_level, max_num_levels); + Test(max_level, max_num_levels); + Test(max_level, max_num_levels); + Test(max_level, max_num_levels); +} +template +void TestChannels(LevelT /*max_level*/, + int /*max_num_levels*/, + Int2Type /*is_valid_tag*/, + TestExtraChannels) {} - - //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- - - - /** * Main */ int main(int argc, char** argv) { - int num_row_pixels = -1; - int entropy_reduction = 0; - int num_rows = 1; - // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_verbose_input = args.CheckCmdLineFlag("v2"); - args.GetCmdLineArgument("n", num_row_pixels); - int row_stride_pixels = num_row_pixels; - - args.GetCmdLineArgument("rows", num_rows); - args.GetCmdLineArgument("stride", row_stride_pixels); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); - args.GetCmdLineArgument("entropy", entropy_reduction); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " - "[--n=] " - "[--rows=] " - "[--stride=] " "[--i=] " "[--device=] " - "[--repeat=]" - "[--entropy=]" "[--v] " - "[--cdp]" + "[--v2] " "\n", argv[0]); exit(0); } @@ -1352,192 +1317,19 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); - // Get ptx version - int ptx_version = 0; - CubDebugExit(PtxVersion(ptx_version)); - - if (num_row_pixels < 0) - { - num_row_pixels = 1920 * 1080; - row_stride_pixels = num_row_pixels; - } - -#if defined(CUB_TEST_MINIMAL) - - // Compile/run quick tests - { - // HistogramEven: unsigned char 256 bins - typedef unsigned char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramRange: signed char 256 bins - typedef signed char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestRange(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - - -#elif defined(CUB_TEST_BENCHMARK) - - // Compile/run quick tests - { - // HistogramEven: unsigned char 256 bins - typedef unsigned char SampleT; - typedef int LevelT; + using true_t = Int2Type; + using false_t = Int2Type; - LevelT max_level = 256; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramEven: 4/4 multichannel Unsigned char 256 bins - typedef unsigned char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[4] = {257, 257, 257, 257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramEven: 3/4 multichannel Unsigned char 256 bins - typedef unsigned char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[3] = {257, 257, 257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramEven: short [0,1024] 256 bins - typedef unsigned short SampleT; - typedef unsigned short LevelT; - - LevelT max_level = 1024; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramEven: float [0,1.0] 256 bins - typedef float SampleT; - typedef float LevelT; - - LevelT max_level = 1.0; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - #if !defined(__ICC) + TestChannels (256, 256 + 1, true_t{}, true_t{}); + TestChannels (8192, 8192 + 1, true_t{}, false_t{}); +#if !defined(__ICC) // Fails with ICC for unknown reasons, see #332. - { - // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins - typedef float SampleT; - typedef float LevelT; - - LevelT max_level = 1.0; - int num_levels[3] = {257, 257, 257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - #endif - - { - // HistogramRange: signed char 256 bins - typedef signed char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[1] = {257}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestRange(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64) - typedef unsigned char SampleT; - typedef int LevelT; - - LevelT max_level = 256; - int num_levels[3] = {257, 129, 65}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 4; - - TestRange(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - { - // HistogramEven: double [0,1.0] 64 bins - typedef double SampleT; - typedef double LevelT; - - LevelT max_level = 1.0; - int num_levels[1] = {65}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - - { - // HistogramEven: short [0,1024] 512 bins - typedef unsigned short SampleT; - typedef unsigned short LevelT; - - LevelT max_level = 1024; - int num_levels[1] = {513}; - int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1; - - TestEven(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]); - } - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - TestChannels (256, 256 + 1, Int2Type()); - TestChannels (256, 256 + 1, Int2Type()); - TestChannels (128, 128 + 1, Int2Type()); - TestChannels (8192, 8192 + 1, Int2Type()); - #if !defined(__ICC) - // Fails with ICC for unknown reasons, see #332. - TestChannels (1.0, 256 + 1, Int2Type()); - #endif - - // Test down-conversion of size_t offsets to int - TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>()); - } - + TestChannels (1.0, 256 + 1, true_t{}, false_t{}); #endif + // Test down-conversion of size_t offsets to int + TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{}); + return 0; } diff --git a/test/test_device_merge_sort.cu b/test/test_device_merge_sort.cu index f8115f82d6..ad416a1cf7 100644 --- a/test/test_device_merge_sort.cu +++ b/test/test_device_merge_sort.cu @@ -319,10 +319,12 @@ void AllocateAndTestIterators(unsigned int num_items) template void Test(thrust::default_random_engine &rng) { - for (int pow2 = 9; pow2 < 22; pow2 += 2) + for (unsigned int pow2 = 9; pow2 < 22; pow2 += 2) { - const int num_items = 1 << pow2; + const unsigned int num_items = 1 << pow2; AllocateAndTestIterators(num_items); + + TestHelper::AllocateAndTest(rng, num_items); Test(rng, num_items); } diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index 74bf47f9aa..7927ed0714 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -53,10 +53,6 @@ #include "test_util.h" -#include -#include -#include - using namespace cub; @@ -66,7 +62,6 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types @@ -78,10 +73,29 @@ enum Backend CUB_SEGMENTED, // CUB method (allows overwriting of input) CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input) - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; +static const char* BackendToString(Backend b) +{ + switch (b) + { + case CUB: + return "CUB"; + case CUB_NO_OVERWRITE: + return "CUB_NO_OVERWRITE"; + case CUB_SEGMENTED: + return "CUB_SEGMENTED"; + case CUB_SEGMENTED_NO_OVERWRITE: + return "CUB_SEGMENTED_NO_OVERWRITE"; + case CDP: + return "CDP"; + default: + break; + } + + return ""; +} //--------------------------------------------------------------------- // Dispatch to different DeviceRadixSort entrypoints @@ -381,104 +395,6 @@ cudaError_t Dispatch( return retval; } - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch keys-only to Thrust sorting entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*is_descending*/, - Int2Type /*dispatch_to*/, - int */*d_selector*/, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &/*d_values*/, - int num_items, - int /*num_segments*/, - BeginOffsetIteratorT /*d_segment_begin_offsets*/, - EndOffsetIteratorT /*d_segment_end_offsets*/, - int /*begin_bit*/, - int /*end_bit*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_wrapper(d_keys.Current()); - - if (IS_DESCENDING) THRUST_NS_QUALIFIER::reverse(d_keys_wrapper, d_keys_wrapper + num_items); - THRUST_NS_QUALIFIER::sort(d_keys_wrapper, d_keys_wrapper + num_items); - if (IS_DESCENDING) THRUST_NS_QUALIFIER::reverse(d_keys_wrapper, d_keys_wrapper + num_items); - } - - return cudaSuccess; -} - - -/** - * Dispatch key-value pairs to Thrust sorting entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*is_descending*/, - Int2Type /*dispatch_to*/, - int */*d_selector*/, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - int num_items, - int /*num_segments*/, - BeginOffsetIteratorT /*d_segment_begin_offsets*/, - EndOffsetIteratorT /*d_segment_end_offsets*/, - int /*begin_bit*/, - int /*end_bit*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_wrapper(d_keys.Current()); - THRUST_NS_QUALIFIER::device_ptr d_values_wrapper(d_values.Current()); - - if (IS_DESCENDING) { - THRUST_NS_QUALIFIER::reverse(d_keys_wrapper, d_keys_wrapper + num_items); - THRUST_NS_QUALIFIER::reverse(d_values_wrapper, d_values_wrapper + num_items); - } - - THRUST_NS_QUALIFIER::sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper); - - if (IS_DESCENDING) { - THRUST_NS_QUALIFIER::reverse(d_keys_wrapper, d_keys_wrapper + num_items); - THRUST_NS_QUALIFIER::reverse(d_values_wrapper, d_values_wrapper + num_items); - } - } - - return cudaSuccess; -} - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -747,7 +663,7 @@ void Test( const bool KEYS_ONLY = Equals::VALUE; printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n", - (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + BackendToString(BACKEND), (KEYS_ONLY) ? "keys-only" : "key-value", num_items, num_segments, (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(), @@ -917,29 +833,24 @@ void TestBackend( } } -#ifdef SEGMENTED_SORT - // Test multi-segment implementations Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); -#else // SEGMENTED_SORT + if (num_segments == 1) { - // Test single-segment implementations Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); - #ifdef CUB_CDP + + #ifdef CUB_CDP // FIXME: Enable this Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); #endif } -#endif // SEGMENTED_SORT if (h_values) delete[] h_values; if (h_reference_values) delete[] h_reference_values; } - - /** * Test value type */ @@ -960,21 +871,33 @@ void TestValueTypes( KeyT *h_reference_keys = NULL; InitializeSolution(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys); +#if TEST_VALUE_TYPE == 0 + // Test keys-only TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); +#elif TEST_VALUE_TYPE == 1 + // Test with 8b value TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); +#elif TEST_VALUE_TYPE == 2 + // Test with 32b value TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); +#elif TEST_VALUE_TYPE == 3 + // Test with 64b value TestBackend(h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); // Test with non-trivially-constructable value + // These are cheap to build (less work for OCG?), so lump them in with + // the 64b value tests. TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks); +#endif + // Cleanup if (h_reference_ranks) delete[] h_reference_ranks; if (h_reference_keys) delete[] h_reference_keys; @@ -1066,7 +989,8 @@ void TestSegmentIterators( InitializeSegments(num_items, num_segments, h_segment_offsets); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice)); - // Test with segment pointer + // Test with segment pointer. + // This is also used to test non-segmented sort. TestBits(h_keys, num_items, num_segments, h_segment_offsets, d_segment_offsets, d_segment_offsets + 1); // Test with segment iterator @@ -1101,7 +1025,6 @@ void TestSegments( int *d_segment_offsets = nullptr; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (max_segments + 1))); -#ifdef SEGMENTED_SORT for (int num_segments = max_segments; num_segments > 1; num_segments = (num_segments + 32 - 1) / 32) { if (num_items / num_segments < 128 * 1000) { @@ -1109,13 +1032,12 @@ void TestSegments( TestSegmentIterators(h_keys, num_items, num_segments, h_segment_offsets, d_segment_offsets); } } -#else + // Test single segment if (num_items < 128 * 1000) { // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment TestSegmentIterators(h_keys, num_items, 1, h_segment_offsets, d_segment_offsets); } -#endif if (h_segment_offsets) delete[] h_segment_offsets; if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); @@ -1273,7 +1195,6 @@ int main(int argc, char** argv) args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("s", num_segments); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("bits", bits); args.GetCmdLineArgument("entropy", entropy_reduction); @@ -1286,7 +1207,6 @@ int main(int argc, char** argv) "[--s= " "[--i= " "[--device=] " - "[--repeat=]" "[--v] " "[--entropy=]" "\n", argv[0]); @@ -1300,108 +1220,69 @@ int main(int argc, char** argv) int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); -#ifdef CUB_TEST_MINIMAL - - enum { - IS_DESCENDING = false - }; + // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8 + // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3 + // 0->Keys only + // 1->uchar + // 2->uint + // 3->[ull,TestBar] (TestBar is cheap to build, included here to + // reduce total number of targets) + + // To reduce testing time, some key types are only tested when not + // testing pairs: +#if TEST_VALUE_TYPE == 0 +#define TEST_EXTENDED_KEY_TYPES +#endif - // Compile/run basic CUB test - if (num_items < 0) num_items = 24000000; - if (num_segments < 0) num_segments = 5000; + // Compile/run thorough tests +#if TEST_KEY_BYTES == 1 - Test(num_items, num_segments, RANDOM, entropy_reduction, 0, bits); + TestGen (num_items, num_segments); - printf("\n-------------------------------\n"); +#ifdef TEST_EXTENDED_KEY_TYPES + TestGen (num_items, num_segments); + TestGen (num_items, num_segments); + TestGen (num_items, num_segments); +#endif // TEST_EXTENDED_KEY_TYPES - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); +#elif TEST_KEY_BYTES == 2 + TestGen (num_items, num_segments); - printf("\n-------------------------------\n"); +#ifdef TEST_EXTENDED_KEY_TYPES + TestGen (num_items, num_segments); #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); -#endif + TestGen (num_items, num_segments); +#endif // CTK >= 9 + #if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ #if !defined(__ICC) // Fails with `-0 != 0` with ICC for unknown reasons. See #333. - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); -#endif -#endif - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - - printf("\n-------------------------------\n"); - - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test(num_items, 1, RANDOM, entropy_reduction, 0, bits); - -#elif defined(CUB_TEST_BENCHMARK) - - // Compile/run quick tests - if (num_items < 0) num_items = 48000000; - if (num_segments < 0) num_segments = 5000; + TestGen (num_items, num_segments); +#endif // !ICC +#endif // CTK >= 11 - // Compare CUB and thrust on 32b keys-only - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); +#endif // TEST_EXTENDED_KEY_TYPES - // Compare CUB and thrust on 64b keys-only - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); +#elif TEST_KEY_BYTES == 4 + TestGen (num_items, num_segments); - // Compare CUB and thrust on 32b key-value pairs - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - - // Compare CUB and thrust on 64b key + 32b value pairs - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - Test ( num_items, 1, RANDOM, entropy_reduction, 0, bits); - - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - TestGen (num_items, num_segments); +#ifdef TEST_EXTENDED_KEY_TYPES + TestGen (num_items, num_segments); + TestGen (num_items, num_segments); +#endif // TEST_EXTENDED_KEY_TYPES - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); +#elif TEST_KEY_BYTES == 8 - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); + TestGen (num_items, num_segments); - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); +#ifdef TEST_EXTENDED_KEY_TYPES + TestGen (num_items, num_segments); + TestGen (num_items, num_segments); +#endif // TEST_EXTENDED_KEY_TYPES - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); - - TestGen (num_items, num_segments); - TestGen (num_items, num_segments); - -#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ - TestGen (num_items, num_segments); -#endif -#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ -#if !defined(__ICC) - // Fails with `-0 != 0` with ICC for unknown reasons. See #333. - TestGen (num_items, num_segments); -#endif -#endif - TestGen (num_items, num_segments); - - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - TestGen (num_items, num_segments); - - } - -#endif +#endif // TEST_KEY_BYTES switch return 0; } diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index fb467249df..e70e70ce69 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -37,9 +37,6 @@ #include #include -#include -#include - #include #include #include @@ -63,7 +60,6 @@ double g_device_giga_bandwidth; bool g_verbose = false; bool g_verbose_input = false; int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -73,7 +69,6 @@ enum Backend CUB, // CUB method CUB_SEGMENTED, // CUB segmented method CUB_CDP, // GPU-based (dynamic parallelism) dispatch to CUB method - THRUST, // Thrust method }; @@ -524,108 +519,6 @@ cudaError_t Dispatch( } -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to reduction entrypoint (min or max specialization) - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - int timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - int /*max_segments*/, - BeginOffsetIteratorT /*d_segment_begin_offsets*/, - EndOffsetIteratorT /*d_segment_end_offsets*/, - ReductionOpT reduction_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - OutputT init; - CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost)); - - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - OutputT retval; - for (int i = 0; i < timing_iterations; ++i) - { - retval = THRUST_NS_QUALIFIER::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op); - } - - if (!Equals >::VALUE) - CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - -/** - * Dispatch to reduction entrypoint (sum specialization) - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - int timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - int /*max_segments*/, - BeginOffsetIteratorT /*d_segment_begin_offsets*/, - EndOffsetIteratorT /*d_segment_end_offsets*/, - Sum /*reduction_op*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - OutputT retval; - for (int i = 0; i < timing_iterations; ++i) - { - retval = THRUST_NS_QUALIFIER::reduce(d_in_wrapper, d_in_wrapper + num_items); - } - - if (!Equals >::VALUE) - CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - //--------------------------------------------------------------------- // CUDA nested-parallelism test kernel //--------------------------------------------------------------------- @@ -968,7 +861,7 @@ void SolveAndTest( typedef typename SolutionT::OutputT OutputT; printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n", - (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB", + (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB", typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments); fflush(stdout); @@ -976,9 +869,9 @@ void SolveAndTest( OutputT *h_reference = new OutputT[num_segments]; SolutionT::Solve(h_in, h_reference, num_segments, h_segment_begin_offsets, h_segment_end_offsets, reduction_op); -// // Run with discard iterator -// DiscardOutputIterator discard_itr; -// Test(Int2Type(), d_in, discard_itr, num_items, num_segments, d_segment_offsets, reduction_op, h_reference); + // Run with discard iterator + DiscardOutputIterator discard_itr; + Test(Int2Type(), d_in, discard_itr, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference); // Run with output data OutputT *d_out = NULL; @@ -1321,7 +1214,6 @@ int main(int argc, char** argv) args.GetCmdLineArgument("n", max_items); args.GetCmdLineArgument("s", max_segments); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) @@ -1331,7 +1223,6 @@ int main(int argc, char** argv) "[--s= " "[--i= " "[--device=] " - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -1348,81 +1239,26 @@ int main(int argc, char** argv) // Get SM count g_sm_count = args.deviceProp.multiProcessorCount; -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic test - - - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - - printf("\n-------------------------------\n"); - - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - - printf("\n-------------------------------\n"); - - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - TestProblem( max_items, 1, RANDOM_BIT, Sum()); - - printf("\n-------------------------------\n"); - - TestProblem(max_items, max_segments, RANDOM_BIT, Sum()); - - -#elif defined(CUB_TEST_BENCHMARK) - - // Compile/run quick comparison tests - - TestProblem( max_items * 4, 1, UNIFORM, Sum()); - TestProblem( max_items * 4, 1, UNIFORM, Sum()); - - printf("\n----------------------------\n"); - TestProblem( max_items * 2, 1, UNIFORM, Sum()); - TestProblem( max_items * 2, 1, UNIFORM, Sum()); - - printf("\n----------------------------\n"); - TestProblem( max_items, 1, UNIFORM, Sum()); - TestProblem( max_items, 1, UNIFORM, Sum()); - - printf("\n----------------------------\n"); - TestProblem( max_items / 2, 1, UNIFORM, Sum()); - TestProblem( max_items / 2, 1, UNIFORM, Sum()); - - printf("\n----------------------------\n"); - TestProblem( max_items / 4, 1, UNIFORM, Max()); - TestProblem( max_items / 4, 1, UNIFORM, Max()); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input types - TestType(max_items, max_segments); - - TestType(max_items, max_segments); - - TestType(max_items, max_segments); - - TestType(max_items, max_segments); - TestType(max_items, max_segments); - TestType(max_items, max_segments); - TestType(max_items, max_segments); - - TestType(max_items, max_segments); - TestType(max_items, max_segments); - TestType(max_items, max_segments); - TestType(max_items, max_segments); - - TestType(max_items, max_segments); - TestType(max_items, max_segments); - } - + // %PARAM% TEST_TYPES types 0:1:2:3 + +#if TEST_TYPES == 0 + TestType(max_items, max_segments); + TestType(max_items, max_segments); + TestType(max_items, max_segments); +#elif TEST_TYPES == 1 + TestType(max_items, max_segments); + TestType(max_items, max_segments); + TestType(max_items, max_segments); + TestType(max_items, max_segments); +#elif TEST_TYPES == 2 + TestType(max_items, max_segments); + TestType(max_items, max_segments); + TestType(max_items, max_segments); + TestType(max_items, max_segments); +#else // TEST_TYPES == 3 + TestType(max_items, max_segments); + TestType(max_items, max_segments); #endif - - printf("\n"); return 0; } diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu index c08b228885..4e4e0a9e41 100644 --- a/test/test_device_reduce_by_key.cu +++ b/test/test_device_reduce_by_key.cu @@ -36,10 +36,6 @@ #include #include -#include -#include -#include - #include #include #include @@ -57,14 +53,12 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -124,92 +118,6 @@ cudaError_t Dispatch( return error; } - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to reduce-by-key entrypoint - */ -template < - typename KeyInputIteratorT, - typename KeyOutputIteratorT, - typename ValueInputIteratorT, - typename ValueOutputIteratorT, - typename NumRunsIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT> -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, - NumRunsIteratorT d_num_runs, - EqualityOpT /*equality_op*/, - ReductionOpT /*reduction_op*/, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input keys type - typedef typename std::iterator_traits::value_type KeyInputT; - - // The output keys type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type - - // The input values type - typedef typename std::iterator_traits::value_type ValueInputT; - - // The output values type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type ValueOuputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_keys_out_wrapper(d_keys_out); - - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - THRUST_NS_QUALIFIER::device_ptr d_values_out_wrapper(d_values_out); - - THRUST_NS_QUALIFIER::pair, THRUST_NS_QUALIFIER::device_ptr > d_out_ends; - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_ends = THRUST_NS_QUALIFIER::reduce_by_key( - d_keys_in_wrapper, - d_keys_in_wrapper + num_items, - d_values_in_wrapper, - d_keys_out_wrapper, - d_values_out_wrapper); - } - - OffsetT num_segments = OffsetT(d_out_ends.first - d_keys_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice)); - - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -541,7 +449,7 @@ void TestPointer( int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items); printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Sum" : "Max", num_items, num_segments, float(num_items) / num_segments, typeid(KeyT).name(), typeid(ValueT).name(), @@ -600,7 +508,7 @@ void TestIterator( int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items); printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Sum" : "Max", num_items, num_segments, float(num_items) / num_segments, typeid(KeyT).name(), typeid(ValueT).name(), @@ -750,7 +658,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("maxseg", maxseg); args.GetCmdLineArgument("entropy", entropy_reduction); @@ -763,7 +670,6 @@ int main(int argc, char** argv) "[--device=] " "[--maxseg=]" "[--entropy=]" - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -778,73 +684,27 @@ int main(int argc, char** argv) int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - TestIterator(num_items, entropy_reduction, maxseg, cub::Sum()); - -#elif defined(CUB_TEST_BENCHMARK) - - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; - - printf("---- RLE int ---- \n"); - TestIterator(num_items, entropy_reduction, maxseg, cub::Sum()); - - printf("---- RLE long long ---- \n"); - TestIterator(num_items, entropy_reduction, maxseg, cub::Sum()); - - printf("---- int ---- \n"); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - - printf("---- float ---- \n"); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - { - printf("---- double ---- \n"); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - TestPointer(num_items, entropy_reduction, maxseg, cub::Sum()); - } - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - - // Test different input types - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted - TestOp(num_items); - - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - TestOp(num_items); - - } - -#endif + // Test different input types + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); + TestOp(num_items); return 0; } diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu index e90ee63089..02b941eea6 100644 --- a/test/test_device_run_length_encode.cu +++ b/test/test_device_run_length_encode.cu @@ -36,10 +36,6 @@ #include #include -#include -#include -#include - #include #include #include @@ -57,14 +53,12 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -177,89 +171,6 @@ cudaError_t Dispatch( return error; } - - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to run-length encode entrypoint - */ -template < - typename InputIteratorT, - typename UniqueOutputIteratorT, - typename OffsetsOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsIterator, - typename OffsetT> -cudaError_t Dispatch( - Int2Type /*method*/, - Int2Type /*dispatch_to*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - UniqueOutputIteratorT d_unique_out, - OffsetsOutputIteratorT /*d_offsets_out*/, - LengthsOutputIteratorT d_lengths_out, - NumRunsIterator d_num_runs, - cub::Equality /*equality_op*/, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type UniqueT; // ... else the output iterator's value type - - // The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_unique_out_wrapper(d_unique_out); - THRUST_NS_QUALIFIER::device_ptr d_lengths_out_wrapper(d_lengths_out); - - THRUST_NS_QUALIFIER::pair, THRUST_NS_QUALIFIER::device_ptr > d_out_ends; - - LengthT one_val; - InitValue(INTEGER_SEED, one_val, 1); - THRUST_NS_QUALIFIER::constant_iterator constant_one(one_val); - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_ends = THRUST_NS_QUALIFIER::reduce_by_key( - d_in_wrapper, - d_in_wrapper + num_items, - constant_one, - d_unique_out_wrapper, - d_lengths_out_wrapper); - } - - OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -627,7 +538,7 @@ void TestPointer( printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n", (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(), max_segment, entropy_reduction); @@ -680,7 +591,7 @@ void TestIterator( printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n", (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name()); fflush(stdout); @@ -812,7 +723,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("maxseg", max_segment); args.GetCmdLineArgument("entropy", entropy_reduction); @@ -825,7 +735,6 @@ int main(int argc, char** argv) "[--device=] " "[--maxseg=]" "[--entropy=]" - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -840,48 +749,22 @@ int main(int argc, char** argv) int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - - TestPointer( num_items, entropy_reduction, max_segment); - TestPointer( num_items, entropy_reduction, max_segment); - TestIterator( num_items, Int2Type::PRIMITIVE>()); - - -#elif defined(CUB_TEST_BENCHMARK) - - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; - - TestPointer( num_items, entropy_reduction, max_segment); - TestPointer( num_items, entropy_reduction, max_segment); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input types - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - TestSize(num_items); - } - -#endif + // Test different input types + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); + TestSize(num_items); return 0; } diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index b9bb2b2fae..63fa85d598 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -36,9 +36,6 @@ #include #include -#include -#include - #include #include #include @@ -55,7 +52,6 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; double g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); @@ -63,7 +59,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -213,199 +208,6 @@ cudaError_t Dispatch( return error; } -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to exclusive scan entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op); - } - } - - return cudaSuccess; -} - - -/** - * Dispatch to exclusive sum entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - InitialValueT /*initial_value*/, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper); - } - } - - return cudaSuccess; -} - - -/** - * Dispatch to inclusive scan entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, scan_op); - } - } - - return cudaSuccess; -} - - -/** - * Dispatch to inclusive sum entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper); - } - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -748,7 +550,7 @@ template < typename OutputT, typename ScanOpT, typename InitialValueT> -typename std::enable_if::value && BACKEND != THRUST>::type +typename std::enable_if::value>::type TestFutureInitValue( DeviceInputIteratorT d_in, OutputT *h_reference, @@ -775,7 +577,7 @@ template < typename OutputT, typename ScanOpT, typename InitialValueT> -typename std::enable_if::value || BACKEND == THRUST>::type +typename std::enable_if::value>::type TestFutureInitValue( DeviceInputIteratorT, OutputT *, @@ -787,12 +589,12 @@ TestFutureInitValue( } template < - Backend BACKEND, - typename DeviceInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT> -typename std::enable_if::value && BACKEND != THRUST>::type + Backend BACKEND, + typename DeviceInputIteratorT, + typename OutputT, + typename ScanOpT, + typename InitialValueT> +typename std::enable_if::value>::type TestFutureInitValueIter( DeviceInputIteratorT d_in, OutputT *h_reference, @@ -812,7 +614,7 @@ template < typename OutputT, typename ScanOpT, typename InitialValueT> -typename std::enable_if::value || BACKEND == THRUST>::type +typename std::enable_if::value>::type TestFutureInitValueIter( DeviceInputIteratorT, OutputT *, @@ -871,7 +673,7 @@ void TestPointer( InitialValueT initial_value) { printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Inclusive" : "Exclusive", (Equals::VALUE) ? "Sum" : "Scan", num_items, @@ -936,7 +738,7 @@ void TestIterator( InitialValueT initial_value) { printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Inclusive" : "Exclusive", (Equals::VALUE) ? "Sum" : "Scan", num_items, @@ -1079,7 +881,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) @@ -1088,7 +889,6 @@ int main(int argc, char** argv) "[--n= " "[--i= " "[--device=] " - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -1100,92 +900,57 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0)); - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0)); - - printf("----------------------------\n"); - - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0)); - TestPointer( num_items , RANDOM_BIT, Sum(), (long long) (0)); - - printf("----------------------------\n"); - - TestPointer( num_items , RANDOM_BIT, Sum(), (float) (0)); - TestPointer( num_items , RANDOM_BIT, Sum(), (double) (0)); - - -#elif defined(CUB_TEST_BENCHMARK) - - // Get device ordinal - int device_ordinal; - CubDebugExit(cudaGetDevice(&device_ordinal)); - - // Get device SM version - int sm_version = 0; - CubDebugExit(SmVersion(sm_version, device_ordinal)); - - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; - - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0)); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0)); - - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0)); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0)); - - printf("----------------------------\n"); - TestPointer( num_items , UNIFORM, Sum(), (int) (0)); - TestPointer( num_items , UNIFORM, Sum(), (int) (0)); - - printf("----------------------------\n"); - TestPointer( num_items / 2, UNIFORM, Sum(), (long long) (0)); - TestPointer(num_items / 2, UNIFORM, Sum(), (long long) (0)); - - printf("----------------------------\n"); - TestPointer( num_items / 4, UNIFORM, Sum(), TestBar()); - TestPointer( num_items / 4, UNIFORM, Sum(), TestBar()); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input+output data types - TestSize(num_items, (int) 0, (int) 99); - - // Test same intput+output data types - TestSize(num_items, (unsigned char) 0, (unsigned char) 99); - TestSize(num_items, (char) 0, (char) 99); - TestSize(num_items, (unsigned short) 0, (unsigned short)99); - TestSize(num_items, (unsigned int) 0, (unsigned int) 99); - TestSize(num_items, (unsigned long long) 0, (unsigned long long) 99); - - TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); - TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); - TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); - TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); - TestSize(num_items, make_ulonglong2(0, 0), make_ulonglong2(17, 21)); - TestSize(num_items, make_uchar4(0, 0, 0, 0), make_uchar4(17, 21, 32, 85)); - TestSize(num_items, make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); - - TestSize(num_items, make_ushort4(0, 0, 0, 0), make_ushort4(17, 21, 32, 85)); - TestSize(num_items, make_uint4(0, 0, 0, 0), make_uint4(17, 21, 32, 85)); - TestSize(num_items, make_ulonglong4(0, 0, 0, 0), make_ulonglong4(17, 21, 32, 85)); - - TestSize(num_items, - TestFoo::MakeTestFoo(0, 0, 0, 0), - TestFoo::MakeTestFoo(1ll << 63, 1 << 31, static_cast(1 << 15), static_cast(1 << 7))); - - TestSize(num_items, - TestBar(0, 0), - TestBar(1ll << 63, 1 << 31)); - } + // %PARAM% TEST_VALUE_TYPES types 0:1:2 + +#if TEST_VALUE_TYPES == 0 + + // Test different input+output data types + TestSize(num_items, (int)0, (int)99); + + // Test same input+output data types + TestSize(num_items, (unsigned char)0, (unsigned char)99); + TestSize(num_items, (char)0, (char)99); + TestSize(num_items, (unsigned short)0, (unsigned short)99); + TestSize(num_items, (unsigned int)0, (unsigned int)99); + TestSize(num_items, + (unsigned long long)0, + (unsigned long long)99); + +#elif TEST_VALUE_TYPES == 1 + + TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); + TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); + TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); + TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); + TestSize(num_items, + make_ulonglong2(0, 0), + make_ulonglong2(17, 21)); + TestSize(num_items, + make_uchar4(0, 0, 0, 0), + make_uchar4(17, 21, 32, 85)); +#elif TEST_VALUE_TYPES == 2 + TestSize(num_items, + make_char4(0, 0, 0, 0), + make_char4(17, 21, 32, 85)); + + TestSize(num_items, + make_ushort4(0, 0, 0, 0), + make_ushort4(17, 21, 32, 85)); + TestSize(num_items, + make_uint4(0, 0, 0, 0), + make_uint4(17, 21, 32, 85)); + TestSize(num_items, + make_ulonglong4(0, 0, 0, 0), + make_ulonglong4(17, 21, 32, 85)); + + TestSize(num_items, + TestFoo::MakeTestFoo(0, 0, 0, 0), + TestFoo::MakeTestFoo(1ll << 63, + 1 << 31, + static_cast(1 << 15), + static_cast(1 << 7))); + + TestSize(num_items, TestBar(0, 0), TestBar(1ll << 63, 1 << 31)); #endif diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu index cb91814647..ed625bd7b0 100644 --- a/test/test_device_scan_by_key.cu +++ b/test/test_device_scan_by_key.cu @@ -35,9 +35,6 @@ #include #include -#include -#include - #include #include #include @@ -56,7 +53,6 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; double g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); @@ -64,7 +60,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -247,219 +242,6 @@ cudaError_t Dispatch( return error; } -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to exclusive scan entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input key type - typedef typename std::iterator_traits::value_type KeyT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - THRUST_NS_QUALIFIER::device_ptr d_values_out_wrapper(d_values_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::exclusive_scan_by_key(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_values_out_wrapper, initial_value, equality_op, scan_op); - } - } - - return cudaSuccess; -} - - -/** - * Dispatch to exclusive sum entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - Sum /*scan_op*/, - InitialValueT /*initial_value*/, - OffsetT num_items, - EqualityOpT /*equality_op*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input key type - typedef typename std::iterator_traits::value_type KeyT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - THRUST_NS_QUALIFIER::device_ptr d_values_out_wrapper(d_values_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::exclusive_scan_by_key(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_values_out_wrapper); - } - } - - return cudaSuccess; -} - - -/** - * Dispatch to inclusive scan entrypoint - */ -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - NullType /*initial_value*/, - OffsetT num_items, - EqualityOpT equality_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input key type - typedef typename std::iterator_traits::value_type KeyT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - THRUST_NS_QUALIFIER::device_ptr d_values_out_wrapper(d_values_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::inclusive_scan(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_values_out_wrapper, scan_op); - } - } - - return cudaSuccess; -} - -template -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - Sum /*scan_op*/, - NullType /*initial_value*/, - OffsetT num_items, - EqualityOpT equality_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input key type - typedef typename std::iterator_traits::value_type KeyT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - THRUST_NS_QUALIFIER::device_ptr d_values_out_wrapper(d_values_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - THRUST_NS_QUALIFIER::inclusive_scan(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_values_out_wrapper); - } - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -885,7 +667,7 @@ void TestPointer( EqualityOpT equality_op) { printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Inclusive" : "Exclusive", (Equals::VALUE) ? "Sum" : "Scan", num_items, @@ -911,11 +693,7 @@ void TestPointer( Equals::VALUE && !Equals::VALUE) { - if (BACKEND == THRUST) { - Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, Equality{}); - } else { - Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, equality_op); - } + Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, equality_op); } else { @@ -963,7 +741,7 @@ void TestIterator( EqualityOpT equality_op) { printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", (Equals::VALUE) ? "Inclusive" : "Exclusive", (Equals::VALUE) ? "Sum" : "Scan", num_items, @@ -1071,8 +849,8 @@ void TestKeyTAndEqualityOp( OutputT identity, OutputT initial_value) { - TestOp(num_items, identity, initial_value, Equality()); - TestOp( num_items, identity, initial_value, Mod2Equality()); + TestOp(num_items, identity, initial_value, Equality()); + TestOp( num_items, identity, initial_value, Mod2Equality()); } /** @@ -1096,7 +874,7 @@ void TestSize( // Randomly select problem size between 1:10,000,000 unsigned int max_int = (unsigned int) -1; - for (int i = 0; i < 10; ++i) + for (int i = 0; i < 4; ++i) { unsigned int num; RandomBits(num); @@ -1129,7 +907,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) @@ -1138,7 +915,6 @@ int main(int argc, char** argv) "[--n= " "[--i= " "[--device=] " - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -1150,91 +926,69 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); -#ifdef CUB_TEST_MINIMAL + // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5 - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; +#if TEST_VALUE_TYPES == 0 - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0), Equality()); - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0), Equality()); + // Test different input+output data types + TestSize(num_items, (int)0, (int)99); - printf("----------------------------\n"); + // Test same input+output data types + TestSize(num_items, (unsigned char)0, (unsigned char)99); + TestSize(num_items, (char)0, (char)99); - TestPointer( num_items , RANDOM_BIT, Sum(), (int) (0), Equality()); - TestPointer( num_items , RANDOM_BIT, Sum(), (long long) (0), Equality()); +#elif TEST_VALUE_TYPES == 1 - printf("----------------------------\n"); + TestSize(num_items, (unsigned short)0, (unsigned short)99); + TestSize(num_items, (unsigned int)0, (unsigned int)99); + TestSize(num_items, + (unsigned long long)0, + (unsigned long long)99); - TestPointer( num_items , RANDOM_BIT, Sum(), (float) (0), Equality()); - TestPointer( num_items , RANDOM_BIT, Sum(), (double) (0), Equality()); +#elif TEST_VALUE_TYPES == 2 + TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); + TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); + TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); -#elif defined(CUB_TEST_BENCHMARK) +#elif TEST_VALUE_TYPES == 3 - // Get device ordinal - int device_ordinal; - CubDebugExit(cudaGetDevice(&device_ordinal)); + TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); + TestSize(num_items, + make_ulonglong2(0, 0), + make_ulonglong2(17, 21)); + TestSize(num_items, + make_uchar4(0, 0, 0, 0), + make_uchar4(17, 21, 32, 85)); - // Get device SM version - int sm_version = 0; - CubDebugExit(SmVersion(sm_version, device_ordinal)); +#elif TEST_VALUE_TYPES == 4 - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; + TestSize(num_items, + make_char4(0, 0, 0, 0), + make_char4(17, 21, 32, 85)); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0), Equality()); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0), Equality()); + TestSize(num_items, + make_ushort4(0, 0, 0, 0), + make_ushort4(17, 21, 32, 85)); + TestSize(num_items, + make_uint4(0, 0, 0, 0), + make_uint4(17, 21, 32, 85)); - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0), Equality()); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0), Equality()); +#elif TEST_VALUE_TYPES == 5 - printf("----------------------------\n"); - TestPointer( num_items , UNIFORM, Sum(), (int) (0), Equality()); - TestPointer( num_items , UNIFORM, Sum(), (int) (0), Equality()); + TestSize(num_items, + make_ulonglong4(0, 0, 0, 0), + make_ulonglong4(17, 21, 32, 85)); - printf("----------------------------\n"); - TestPointer( num_items / 2, UNIFORM, Sum(), (long long) (0), Equality()); - TestPointer(num_items / 2, UNIFORM, Sum(), (long long) (0), Equality()); + TestSize(num_items, + TestFoo::MakeTestFoo(0, 0, 0, 0), + TestFoo::MakeTestFoo(1ll << 63, + 1 << 31, + static_cast(1 << 15), + static_cast(1 << 7))); - printf("----------------------------\n"); - TestPointer( num_items / 4, UNIFORM, Sum(), TestBar(), Equality()); - TestPointer( num_items / 4, UNIFORM, Sum(), TestBar(), Equality()); + TestSize(num_items, TestBar(0, 0), TestBar(1ll << 63, 1 << 31)); -#else - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input+output data types - TestSize(num_items, (int) 0, (int) 99); - - // Test same intput+output data types - TestSize(num_items, (unsigned char) 0, (unsigned char) 99); - TestSize(num_items, (char) 0, (char) 99); - TestSize(num_items, (unsigned short) 0, (unsigned short)99); - TestSize(num_items, (unsigned int) 0, (unsigned int) 99); - TestSize(num_items, (unsigned long long) 0, (unsigned long long) 99); - - TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21)); - TestSize(num_items, make_char2(0, 0), make_char2(17, 21)); - TestSize(num_items, make_ushort2(0, 0), make_ushort2(17, 21)); - TestSize(num_items, make_uint2(0, 0), make_uint2(17, 21)); - TestSize(num_items, make_ulonglong2(0, 0), make_ulonglong2(17, 21)); - TestSize(num_items, make_uchar4(0, 0, 0, 0), make_uchar4(17, 21, 32, 85)); - TestSize(num_items, make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); - - TestSize(num_items, make_ushort4(0, 0, 0, 0), make_ushort4(17, 21, 32, 85)); - TestSize(num_items, make_uint4(0, 0, 0, 0), make_uint4(17, 21, 32, 85)); - TestSize(num_items, make_ulonglong4(0, 0, 0, 0), make_ulonglong4(17, 21, 32, 85)); - - TestSize(num_items, - TestFoo::MakeTestFoo(0, 0, 0, 0), - TestFoo::MakeTestFoo(1ll << 63, 1 << 31, static_cast(1 << 15), static_cast(1 << 7))); - - TestSize(num_items, - TestBar(0, 0), - TestBar(1ll << 63, 1 << 31)); - } #endif return 0; diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu index fb77072487..c3cc1d8e2a 100644 --- a/test/test_device_select_if.cu +++ b/test/test_device_select_if.cu @@ -36,11 +36,6 @@ #include #include -#include -#include -#include -#include - #include #include #include @@ -57,7 +52,6 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; float g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); @@ -65,7 +59,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -221,261 +214,6 @@ cudaError_t Dispatch( return error; } - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - -/** - * Dispatch to select if entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_flagged*/, - Int2Type /*is_partition*/, - int timing_timing_iterations, - size_t* /*d_temp_storage_bytes*/, - cudaError_t* /*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT /*d_flags*/, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT select_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper_end; - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::copy_if(d_in_wrapper, - d_in_wrapper + num_items, - d_out_wrapper, select_op); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - -/** - * Dispatch to partition if entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_flagged*/, - Int2Type /*is_partition*/, - int timing_timing_iterations, - size_t* /*d_temp_storage_bytes*/, - cudaError_t* /*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT /*d_flags*/, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT select_op, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - typedef THRUST_NS_QUALIFIER::reverse_iterator > ReverseOutputIteratorT; - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::pair, ReverseOutputIteratorT> d_out_wrapper_end; - - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - - ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items); - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::partition_copy( - d_in_wrapper, - d_in_wrapper + num_items, - d_out_wrapper, - d_out_unselected, - select_op); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - -/** - * Dispatch to select flagged entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_flagged*/, - Int2Type /*is_partition*/, - int timing_timing_iterations, - size_t* /*d_temp_storage_bytes*/, - cudaError_t* /*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT /*select_op*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The flag type - typedef typename std::iterator_traits::value_type FlagT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper_end; - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - THRUST_NS_QUALIFIER::device_ptr d_flags_wrapper(d_flags); - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, CastOp()); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - -/** - * Dispatch to partition flagged entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_flagged*/, - Int2Type /*is_partition*/, - int timing_timing_iterations, - size_t* /*d_temp_storage_bytes*/, - cudaError_t* /*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT /*select_op*/, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The flag type - typedef typename std::iterator_traits::value_type FlagT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - typedef THRUST_NS_QUALIFIER::reverse_iterator > ReverseOutputIteratorT; - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::pair, ReverseOutputIteratorT> d_out_wrapper_end; - - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - THRUST_NS_QUALIFIER::device_ptr d_flags_wrapper(d_flags); - ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items); - - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::partition_copy( - d_in_wrapper, - d_in_wrapper + num_items, - d_flags_wrapper, - d_out_wrapper, - d_out_unselected, - CastOp()); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - } - - return cudaSuccess; -} - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -768,7 +506,7 @@ void TestPointer( printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n", (IS_PARTITION) ? "DevicePartition" : "DeviceSelect", (IS_FLAGGED) ? "Flagged" : "If", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T)); fflush(stdout); @@ -828,7 +566,7 @@ void TestIterator( printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n", (IS_PARTITION) ? "DevicePartition" : "DeviceSelect", (IS_FLAGGED) ? "Flagged" : "If", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T)); fflush(stdout); @@ -914,32 +652,6 @@ void Test( } } -/** - * Test select/partition on pointer types - */ -template -void ComparePointer( - int num_items, - float select_ratio) -{ - printf("-- Select-if ----------------------------\n"); - TestPointer(num_items, select_ratio); - TestPointer(num_items, select_ratio); - - printf("-- Partition-if ----------------------------\n"); - TestPointer(num_items, select_ratio); - TestPointer(num_items, select_ratio); - - printf("-- Select-flagged ----------------------------\n"); - TestPointer(num_items, select_ratio); - TestPointer(num_items, select_ratio); - - printf("-- Partition-flagged ----------------------------\n"); - TestPointer(num_items, select_ratio); - TestPointer(num_items, select_ratio); - -} - //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- @@ -957,7 +669,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("ratio", select_ratio); // Print usage @@ -968,7 +679,6 @@ int main(int argc, char** argv) "[--i= " "[--device=] " "[--ratio=] " - "[--repeat=] " "[--v] " "[--cdp] " "\n", argv[0]); @@ -980,72 +690,23 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - - printf("-- Select-if ----------------------------\n"); - TestPointer(num_items, select_ratio); - - printf("-- Partition-if ----------------------------\n"); - TestPointer(num_items, select_ratio); - - printf("-- Select-flagged ----------------------------\n"); - TestPointer(num_items, select_ratio); - - printf("-- Partition-flagged ----------------------------\n"); - TestPointer(num_items, select_ratio); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); -#elif defined(CUB_TEST_BENCHMARK) + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); - // Get device ordinal - int device_ordinal; - CubDebugExit(cudaGetDevice(&device_ordinal)); - - // Get device SM version - int sm_version = 0; - CubDebugExit(SmVersion(sm_version, device_ordinal)); - - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; - - printf("-- Iterator ----------------------------\n"); - TestIterator(num_items, select_ratio); - - ComparePointer( num_items * ((sm_version <= 130) ? 1 : 4), select_ratio); - ComparePointer( num_items * ((sm_version <= 130) ? 1 : 2), select_ratio); - ComparePointer( num_items, select_ratio); - ComparePointer( num_items / 2, select_ratio); - ComparePointer( num_items / 4, select_ratio); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input types - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - } - -#endif + Test(num_items); + Test(num_items); return 0; } diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu index a25fea8ead..d33e4895ac 100644 --- a/test/test_device_select_unique.cu +++ b/test/test_device_select_unique.cu @@ -36,16 +36,10 @@ #include #include -#include -#include - #include #include #include -#include -#include - #include "test_util.h" using namespace cub; @@ -57,7 +51,6 @@ using namespace cub; bool g_verbose = false; int g_timing_iterations = 0; -int g_repeat = 0; float g_device_giga_bandwidth; CachingDeviceAllocator g_allocator(true); @@ -65,7 +58,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -103,64 +95,6 @@ cudaError_t Dispatch( return error; } - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - - -/** - * Dispatch to unique entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper_end; - THRUST_NS_QUALIFIER::device_ptr d_in_wrapper(d_in); - THRUST_NS_QUALIFIER::device_ptr d_out_wrapper(d_out); - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::unique_copy(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -429,7 +363,7 @@ void TestPointer( int num_selected = Solve(h_in, h_reference, num_items); printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(T).name(), (int) sizeof(T), @@ -472,7 +406,7 @@ void TestIterator( int num_selected = Solve(h_in, h_reference, num_items); printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(T).name(), (int) sizeof(T)); @@ -560,7 +494,6 @@ int main(int argc, char** argv) g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("i", g_timing_iterations); - args.GetCmdLineArgument("repeat", g_repeat); args.GetCmdLineArgument("maxseg", maxseg); args.GetCmdLineArgument("entropy", entropy_reduction); @@ -573,7 +506,6 @@ int main(int argc, char** argv) "[--device=] " "[--maxseg=]" "[--entropy=]" - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -585,74 +517,24 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - TestPointer( num_items, entropy_reduction, maxseg); - -#elif defined(CUB_TEST_BENCHMARK) - - // Get device ordinal - int device_ordinal; - CubDebugExit(cudaGetDevice(&device_ordinal)); + // Test different input types + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); - // Get device SM version - int sm_version = 0; - CubDebugExit(SmVersion(sm_version, device_ordinal)); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); - printf("-- Iterator ----------------------------\n"); - TestIterator( num_items); - - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items, entropy_reduction, maxseg); - TestPointer( num_items, entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items / 2, entropy_reduction, maxseg); - TestPointer(num_items / 2, entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer( num_items / 4, entropy_reduction, maxseg); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input types - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - } - -#endif + Test(num_items); + Test(num_items); return 0; } diff --git a/test/test_iterator.cu b/test/test_iterator.cu index bd7efe126a..1eb6f10730 100644 --- a/test/test_iterator.cu +++ b/test/test_iterator.cu @@ -51,9 +51,6 @@ #include "test_util.h" -#include -#include - using namespace cub; @@ -68,7 +65,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -199,33 +195,6 @@ void TestConstant(T base) T h_reference[8] = {base, base, base, base, base, base, base, base}; ConstantInputIterator d_itr(base); Test(d_itr, h_reference); - -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - int copy_items = 100; - T *h_copy = new T[copy_items]; - T *d_copy = NULL; - - for (int i = 0; i < copy_items; ++i) - h_copy[i] = d_itr[i]; - - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - - THRUST_NS_QUALIFIER::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - if (h_copy) delete[] h_copy; - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION } @@ -254,33 +223,6 @@ void TestCounting(T base) CountingInputIterator d_itr(base); Test(d_itr, h_reference); - -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - unsigned long long max_items = ((1ull << ((sizeof(T) * 8) - 1)) - 1); - size_t copy_items = (size_t) CUB_MIN(max_items - base, 100); // potential issue with differencing overflows when T is a smaller type than can handle the offset - T *h_copy = new T[copy_items]; - T *d_copy = NULL; - - for (unsigned long long i = 0; i < copy_items; ++i) - h_copy[i] = d_itr[i]; - - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - THRUST_NS_QUALIFIER::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - if (h_copy) delete[] h_copy; - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION } @@ -328,29 +270,6 @@ void TestModified() Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - T *d_copy = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); - - CacheModifiedInputIterator d_in_itr((CastT*) d_data); - CacheModifiedOutputIterator d_out_itr((CastT*) d_copy); - - THRUST_NS_QUALIFIER::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp()); - - int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - // Cleanup - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION - if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } @@ -397,32 +316,6 @@ void TestTransform() TransformInputIterator, CastT*> d_itr((CastT*) d_data, op); Test(d_itr, h_reference); -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - T *h_copy = new T[TEST_VALUES]; - for (int i = 0; i < TEST_VALUES; ++i) - h_copy[i] = op(h_data[i]); - - T *d_copy = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - - THRUST_NS_QUALIFIER::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - // Cleanup - if (h_copy) delete[] h_copy; - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION - if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } @@ -476,30 +369,6 @@ void TestTexObj() Test(d_obj_itr, h_reference); -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - T *d_copy = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - - CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - // Cleanup - CubDebugExit(d_obj_itr.UnbindTexture()); - - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION - if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy)); @@ -560,27 +429,6 @@ void TestTexRef() Test(d_ref_itr, h_reference); -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - T *d_copy = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - - CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION - CubDebugExit(d_ref_itr.UnbindTexture()); CubDebugExit(d_ref_itr2.UnbindTexture()); @@ -639,32 +487,6 @@ void TestTexTransform() Test(xform_itr, h_reference); -#if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer - - // - // Test with thrust::copy_if() - // - - T *h_copy = new T[TEST_VALUES]; - for (int i = 0; i < TEST_VALUES; ++i) - h_copy[i] = op(h_data[i]); - - T *d_copy = NULL; - CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); - THRUST_NS_QUALIFIER::device_ptr d_copy_wrapper(d_copy); - - THRUST_NS_QUALIFIER::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); - - int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose); - printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); - AssertEquals(0, compare); - - // Cleanup - if (h_copy) delete[] h_copy; - if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); - -#endif // THRUST_VERSION - CubDebugExit(d_tex_itr.UnbindTexture()); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); diff --git a/test/test_util.h b/test/test_util.h index ec6fecd0f6..40987356a7 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -1496,11 +1496,11 @@ int CompareDeviceResults( */ template int CompareDeviceResults( - S *h_reference, - CUB_NS_QUALIFIER::DiscardOutputIterator d_data, - std::size_t num_items, - bool verbose = true, - bool display_data = false) + S */*h_reference*/, + CUB_NS_QUALIFIER::DiscardOutputIterator /*d_data*/, + std::size_t /*num_items*/, + bool /*verbose*/ = true, + bool /*display_data*/ = false) { return 0; } diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index f5451e789a..18308c1dba 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -48,7 +48,6 @@ using namespace cub; //--------------------------------------------------------------------- bool g_verbose = false; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -799,14 +798,12 @@ int main(int argc, char** argv) // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " - "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); @@ -815,28 +812,11 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); -#ifdef CUB_TEST_BENCHMARK - - // Compile/run quick tests - TestReduce<1, 32, int>(UNIFORM, Sum()); - - TestReduce<1, 32, double>(UNIFORM, Sum()); - TestReduce<2, 16, TestBar>(UNIFORM, Sum()); - TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum()); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test logical warp sizes - Test<32>(); - Test<16>(); - Test<9>(); - Test<7>(); - } - -#endif + // Test logical warp sizes + Test<32>(); + Test<16>(); + Test<9>(); + Test<7>(); return 0; } diff --git a/test/test_warp_scan.cu b/test/test_warp_scan.cu index 913ffb6bb9..81577e4fe2 100644 --- a/test/test_warp_scan.cu +++ b/test/test_warp_scan.cu @@ -51,7 +51,6 @@ static const int NUM_WARPS = 2; bool g_verbose = false; -int g_repeat = 0; CachingDeviceAllocator g_allocator(true); @@ -611,14 +610,12 @@ int main(int argc, char** argv) // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); - args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " - "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); @@ -627,31 +624,11 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); -#ifdef CUB_TEST_BENCHMARK - - // Compile/run quick tests - Test<32, AGGREGATE, int>(UNIFORM, Sum(), (int) 0); - Test<32, AGGREGATE, float>(UNIFORM, Sum(), (float) 0); - Test<32, AGGREGATE, long long>(UNIFORM, Sum(), (long long) 0); - Test<32, AGGREGATE, double>(UNIFORM, Sum(), (double) 0); - - typedef KeyValuePair T; - cub::Sum sum_op; - Test<32, AGGREGATE, T>(UNIFORM, ReduceBySegmentOp(sum_op), T()); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test logical warp sizes - Test<32>(); - Test<16>(); - Test<9>(); - Test<2>(); - } - -#endif + // Test logical warp sizes + Test<32>(); + Test<16>(); + Test<9>(); + Test<2>(); return 0; }