From 52b7a9e6d9423990f8c1155804c5f57031e4d7d3 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 20 Oct 2021 18:37:13 -0400 Subject: [PATCH] Rename strings/array_tests.cu to strings/array_tests.cpp (#9480) This PR renames `cpp/tests/strings/array_tests.cu` to a `.cpp` file and cleans up several test cases. The gtests that required a `.cu` extension called the `cudf::strings::detail::scatter` function directly. This was created when `cudf::scatter` had not yet supported strings columns. Changing this to just call `cudf::scatter` allows removing device code dependencies. Also, the `GatherTooBig` test case was taking over 16s to run. Too much time was spent building fake data to just verify an exception. This PR fixes the fake data (which is never actually read) to speed up its creation. Finally, this PR includes removing the `hash_string.cu` since it is redundant with the `hashing/hash_test.cpp` which already covers strings inputs. The `hash_string.cu` had been created before strings support was added to `cudf::hash`. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9480 --- cpp/tests/CMakeLists.txt | 3 +- .../{array_tests.cu => array_tests.cpp} | 114 ++++++------------ cpp/tests/strings/hash_string.cu | 79 ------------ 3 files changed, 36 insertions(+), 160 deletions(-) rename cpp/tests/strings/{array_tests.cu => array_tests.cpp} (61%) delete mode 100644 cpp/tests/strings/hash_string.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eb4e8735638..1bd1bb04b57 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -351,7 +351,7 @@ ConfigureTest(DISPATCHER_TEST types/type_dispatcher_test.cu) ################################################################################################### # - strings test ---------------------------------------------------------------------------------- ConfigureTest(STRINGS_TEST - strings/array_tests.cu + strings/array_tests.cpp strings/attrs_tests.cpp strings/booleans_tests.cpp strings/case_tests.cpp @@ -371,7 +371,6 @@ ConfigureTest(STRINGS_TEST strings/find_multiple_tests.cpp strings/fixed_point_tests.cpp strings/floats_tests.cpp - strings/hash_string.cu strings/integers_tests.cpp strings/ipv4_tests.cpp strings/json_tests.cpp diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cpp similarity index 61% rename from cpp/tests/strings/array_tests.cu rename to cpp/tests/strings/array_tests.cpp index a4d8ecb2bec..8b61999f93e 100644 --- a/cpp/tests/strings/array_tests.cu +++ b/cpp/tests/strings/array_tests.cpp @@ -15,25 +15,17 @@ */ #include + #include #include #include -#include -#include #include -#include -#include #include #include #include -#include -#include #include #include -#include - -#include #include @@ -120,7 +112,7 @@ TEST_P(SliceParmsTest, SliceAllEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } 
-INSTANTIATE_TEST_CASE_P(SliceParms, +INSTANTIATE_TEST_CASE_P(StringsColumnTest, SliceParmsTest, testing::ValuesIn(std::array{5, 6, 7})); @@ -161,98 +153,62 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn) cudf::test::expect_strings_empty(results.front()->view()); } -struct column_to_string_view_vector { - cudf::column_device_view const d_strings; - __device__ cudf::string_view operator()(cudf::size_type idx) const - { - cudf::string_view d_str{nullptr, 0}; - if (d_strings.is_valid(idx)) d_str = d_strings.element(idx); - return d_str; - } -}; - TEST_F(StringsColumnTest, GatherTooBig) { - cudf::test::strings_column_wrapper strings({"0123456789012345678901234567890123456789"}); + std::vector h_chars(3000000); + cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); + cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); + auto input = cudf::column_view( + cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); auto map = thrust::constant_iterator(0); - cudf::test::fixed_width_column_wrapper gather_map( - map, map + std::numeric_limits::max() / 20); - EXPECT_THROW(cudf::gather(cudf::table_view{{strings}}, gather_map), cudf::logic_error); + cudf::test::fixed_width_column_wrapper gather_map(map, map + 1000); + EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), cudf::logic_error); } TEST_F(StringsColumnTest, Scatter) { - std::vector h_strings1{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings1( - h_strings1.begin(), - h_strings1.end(), - thrust::make_transform_iterator(h_strings1.begin(), [](auto str) { return str != nullptr; })); - auto target = cudf::strings_column_view(strings1); - std::vector h_strings2{"1", "22"}; - cudf::test::strings_column_wrapper strings2( - h_strings2.begin(), - h_strings2.end(), - thrust::make_transform_iterator(h_strings2.begin(), [](auto str) { return str != nullptr; })); - auto source = 
cudf::strings_column_view(strings2); - - std::vector h_scatter_map({4, 1}); - auto scatter_map = cudf::detail::make_device_uvector_sync(h_scatter_map); - - auto source_column = cudf::column_device_view::create(source.parent()); - auto begin = - cudf::detail::make_counting_transform_iterator(0, column_to_string_view_vector{*source_column}); + cudf::test::strings_column_wrapper target({"eee", "bb", "", "", "aa", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper source({"1", "22"}); - auto results = - cudf::strings::detail::scatter(begin, begin + source.size(), scatter_map.begin(), target); + cudf::test::fixed_width_column_wrapper scatter_map({4, 1}); - std::vector h_expected{"eee", "22", nullptr, "", "1", "bbb", "ééé"}; - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto results = cudf::scatter(cudf::table_view({source}), scatter_map, cudf::table_view({target})); + + cudf::test::strings_column_wrapper expected({"eee", "22", "", "", "1", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } TEST_F(StringsColumnTest, ScatterScalar) { - std::vector h_strings1{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings1( - h_strings1.begin(), - h_strings1.end(), - thrust::make_transform_iterator(h_strings1.begin(), [](auto str) { return str != nullptr; })); - auto target = cudf::strings_column_view(strings1); + cudf::test::strings_column_wrapper target({"eee", "bb", "", "", "aa", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); - std::vector h_scatter_map({0, 5}); - auto scatter_map = cudf::detail::make_device_uvector_sync(h_scatter_map); + cudf::test::fixed_width_column_wrapper scatter_map({0, 5}); cudf::string_scalar scalar("__"); - auto begin = 
thrust::make_constant_iterator(cudf::string_view(scalar.data(), scalar.size())); - - auto results = - cudf::strings::detail::scatter(begin, begin + scatter_map.size(), scatter_map.begin(), target); + auto source = std::vector>({scalar}); + auto results = cudf::scatter(source, scatter_map, cudf::table_view({target})); - std::vector h_expected{"__", "bb", nullptr, "", "aa", "__", "ééé"}; - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + cudf::test::strings_column_wrapper expected({"__", "bb", "", "", "aa", "__", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } TEST_F(StringsColumnTest, ScatterZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto source = cudf::strings_column_view(zero_size_strings_column); - cudf::column_view values(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto target = cudf::strings_column_view(values); + cudf::column_view source(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + cudf::column_view target(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + cudf::column_view scatter_map(cudf::data_type{cudf::type_id::INT8}, 0, nullptr, nullptr, 0); - rmm::device_uvector scatter_map(0, rmm::cuda_stream_default); - cudf::string_scalar scalar(""); - auto begin = thrust::make_constant_iterator(cudf::string_view(scalar.data(), scalar.size())); + auto results = cudf::scatter(cudf::table_view({source}), scatter_map, cudf::table_view({target})); + cudf::test::expect_strings_empty(results->view().column(0)); - auto results = cudf::strings::detail::scatter(begin, begin, scatter_map.begin(), target); - cudf::test::expect_strings_empty(results->view()); + 
cudf::string_scalar scalar(""); + auto scalar_source = std::vector>({scalar}); + results = cudf::scatter(scalar_source, scatter_map, cudf::table_view({target})); + cudf::test::expect_strings_empty(results->view().column(0)); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/hash_string.cu b/cpp/tests/strings/hash_string.cu deleted file mode 100644 index b5298d39bda..00000000000 --- a/cpp/tests/strings/hash_string.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "./utilities.h" -#include "rmm/exec_policy.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -struct StringsHashTest : public cudf::test::BaseFixture { -}; - -struct hash_string_fn { - cudf::column_device_view d_strings; - uint32_t __device__ operator()(uint32_t idx) - { - if (d_strings.is_null(idx)) return 0; - auto item = d_strings.element(idx); - return MurmurHash3_32{}(item); - } -}; - -TEST_F(StringsHashTest, HashTest) -{ - std::vector h_strings{"abcdefghijklmnopqrstuvwxyz", - "abcdefghijklmnopqrstuvwxyz", - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "0123456789", - "4", - "", - nullptr, - "last one"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto strings_view = cudf::strings_column_view(strings); - auto strings_column = cudf::column_device_view::create(strings_view.parent()); - auto d_view = *strings_column; - - rmm::device_uvector d_values(strings_view.size(), rmm::cuda_stream_default); - thrust::transform(rmm::exec_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_view.size()), - d_values.begin(), - hash_string_fn{d_view}); - - uint32_t h_expected[] = { - 2739798893, 2739798893, 3506676360, 1891213601, 3778137224, 0, 0, 1551088011}; - auto h_values = cudf::detail::make_host_vector_sync(d_values); - for (uint32_t idx = 0; idx < h_values.size(); ++idx) - EXPECT_EQ(h_values[idx], h_expected[idx]); -}