diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 2b39662456b..673442902cc 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -30,10 +30,13 @@ namespace strings { /** * @brief Repeat the given string scalar by a given number of times. * - * For a given string scalar, an output string scalar is generated by repeating the input string by - * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positive - * value, an empty (valid) string scalar will be returned. An invalid input scalar will always - * result in an invalid output scalar regardless of the value of `repeat_times` parameter. + * An output string scalar is generated by repeating the input string by a number of times given by + * the @p `repeat_times` parameter. + * + * In special cases: + * - If @p `repeat_times` is not a positive value, an empty (valid) string scalar will be returned. + * - An invalid input scalar will always result in an invalid output scalar regardless of the + * value of @p `repeat_times` parameter. * * @code{.pseudo} * Example: @@ -47,11 +50,11 @@ namespace strings { * (i.e., `input.size() * repeat_times > numeric_limits::max()`). * * @param input The scalar containing the string to repeat. - * @param repeat_times The number of times the `input` string is copied to the output. + * @param repeat_times The number of times the input string is repeated. * @param mr Device memory resource used to allocate the returned string scalar. - * @return New string scalar in which the string is repeated from the input. + * @return New string scalar in which the input string is repeated. 
*/ -std::unique_ptr repeat_strings( +std::unique_ptr repeat_string( string_scalar const& input, size_type repeat_times, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -59,14 +62,17 @@ std::unique_ptr repeat_strings( /** * @brief Repeat each string in the given strings column by a given number of times. * - * For a given strings column, an output strings column is generated by repeating each string from - * the input by a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not - * a positive value, all the rows of the output strings column will be an empty string. Any null row - * will result in a null row regardless of the value of `repeat_times` parameter. + * An output strings column is generated by repeating each string from the input strings column by a + * number of times given by the @p `repeat_times` parameter. + * + * In special cases: + * - If @p `repeat_times` is not a positive number, a non-null input string will always result in + * an empty output string. + * - A null input string will always result in a null output string regardless of the value of the + * @p `repeat_times` parameter. * - * Note that this function cannot handle the cases when the size of the output column exceeds the - * maximum value that can be indexed by size_type (offset_type). In such situations, an exception - * may be thrown, or the output result is undefined. + * The caller is responsible for checking the output column size will not exceed the maximum size of + * a strings column (number of total characters is less than the max size_type value). * * @code{.pseudo} * Example: @@ -76,15 +82,89 @@ std::unique_ptr repeat_strings( * @endcode * * @param input The column containing strings to repeat. - * @param repeat_times The number of times each input string is copied to the output. + * @param repeat_times The number of times each input string is repeated. 
* @param mr Device memory resource used to allocate the returned strings column. - * @return New column with concatenated results. + * @return New column containing the repeated strings. */ std::unique_ptr repeat_strings( strings_column_view const& input, size_type repeat_times, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Repeat each string in the given strings column by the numbers of times given in another + * numeric column. + * + * An output strings column is generated by repeating each input string by a number of times + * given by the corresponding row in a @p `repeat_times` numeric column. The computational time can + * be reduced if sizes of the output strings are known and provided. + * + * In special cases: + * - Any null row (from either the input strings column or the `repeat_times` column) will always + * result in a null output string. + * - If any value in the `repeat_times` column is not a positive number and its corresponding input + * string is not null, the output string will be an empty string. + * + * The caller is responsible for checking the output column size will not exceed the maximum size of + * a strings column (number of total characters is less than the max size_type value). + * + * @code{.pseudo} + * Example: + * strs = ['aa', null, '', 'bbc-'] + * repeat_times = [ 1, 2, 3, 4 ] + * out = repeat_strings(strs, repeat_times) + * out is ['aa', null, '', 'bbc-bbc-bbc-bbc-'] + * @endcode + * + * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. + * @throw cudf::logic_error if the input columns have different sizes. + * + * @param input The column containing strings to repeat. + * @param repeat_times The column containing numbers of times that the corresponding input strings + * are repeated. + * @param output_strings_sizes The optional column containing pre-computed sizes of the output + * strings. 
+ * @param mr Device memory resource used to allocate the returned strings column. + * @return New column containing the repeated strings. + */ +std::unique_ptr repeat_strings( + strings_column_view const& input, + column_view const& repeat_times, + std::optional output_strings_sizes = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute sizes of the output strings if each string in the input strings column + * is repeated by the numbers of times given in another numeric column. + * + * The output column storing string output sizes is not nullable. These string sizes are + * also summed up and returned (in an `int64_t` value), which can be used to detect if the input + * strings column can be safely repeated without data corruption due to overflow in string indexing. + * + * @code{.pseudo} + * Example: + * strs = ['aa', null, '', 'bbc-'] + * repeat_times = [ 1, 2, 3, 4 ] + * [output_sizes, total_size] = repeat_strings_output_sizes(strs, repeat_times) + * output_sizes is [2, 0, 0, 16], and total_size = 18 + * @endcode + * + * @throw cudf::logic_error if the input `repeat_times` column has data type other than integer. + * @throw cudf::logic_error if the input columns have different sizes. + * + * @param input The column containing strings to repeat. + * @param repeat_times The column containing numbers of times that the corresponding input strings + * are repeated. + * @param mr Device memory resource used to allocate the returned sizes column. + * @return A pair whose first item is an int32_t column containing sizes of the output strings, + * and whose second item is an int64_t number containing the total size (in bytes) of the + * output strings column. 
+ */ +std::pair, int64_t> repeat_strings_output_sizes( + strings_column_view const& input, + column_view const& repeat_times, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index dd91fe0e49d..d495b412b87 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -16,26 +16,27 @@ #include #include +#include #include #include #include -#include #include #include #include #include +#include #include +#include namespace cudf { namespace strings { namespace detail { - -std::unique_ptr repeat_strings(string_scalar const& input, - size_type repeat_times, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr repeat_string(string_scalar const& input, + size_type repeat_times, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (!input.is_valid(stream)) { return std::make_unique("", false, stream, mr); } if (input.size() == 0 || repeat_times <= 0) { @@ -64,7 +65,7 @@ std::unique_ptr repeat_strings(string_scalar const& input, namespace { /** - * @brief Generate a strings column in which each row is an empty or null string. + * @brief Generate a strings column in which each row is an empty string or a null. * * The output strings column has the same bitmask as the input column. */ @@ -92,7 +93,7 @@ auto generate_empty_output(strings_column_view const& input, } /** - * @brief Functor to compute string sizes and repeat the input strings. + * @brief Functor to compute output string sizes and repeat the input strings. * * This functor is called only when `repeat_times > 0`. 
In addition, the total number of threads * running this functor is `repeat_times * strings_count` (instead of `string_count`) for maximizing @@ -156,13 +157,8 @@ std::unique_ptr repeat_strings(strings_column_view const& input, auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()}; - // Repeat the strings in each row. - // Note that this cannot handle the cases when the size of the output column exceeds the maximum - // value that can be indexed by size_type (offset_type). - // In such situations, an exception may be thrown, or the output result is undefined. auto [offsets_column, chars_column] = make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); - return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -172,14 +168,226 @@ std::unique_ptr repeat_strings(strings_column_view const& input, mr); } +namespace { +/** + * @brief Functor to compute string sizes and repeat the input strings, each string is repeated by a + * separate number of times. + */ +template +struct compute_size_and_repeat_separately_fn { + column_device_view const strings_dv; + column_device_view const repeat_times_dv; + Iterator const repeat_times_iter; + bool const strings_has_nulls; + bool const rtimes_has_nulls; + + offset_type* d_offsets{nullptr}; + + // If d_chars == nullptr: only compute sizes of the output strings. + // If d_chars != nullptr: only repeat strings. + char* d_chars{nullptr}; + + __device__ int64_t operator()(size_type const idx) const noexcept + { + auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); + auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); + + // Any null input (either string or repeat_times value) will result in a null output. 
+ auto const is_valid = string_is_valid && rtimes_is_valid; + + // When either input is null, `repeat_times` and `string_size` are both set to 0. + // This makes sure that if `repeat_times > 0` then we will always have a valid input string, + // and if `repeat_times <= 0` we will never copy anything to the output. + auto const repeat_times = is_valid ? repeat_times_iter[idx] : size_type{0}; + auto const string_size = + is_valid ? strings_dv.element(idx).size_bytes() : size_type{0}; + + // The output_size is returned, and it needs to be an int64_t number to prevent overflow. + auto const output_size = + repeat_times > 0 ? static_cast(repeat_times) * static_cast(string_size) + : int64_t{0}; + + if (!d_chars) { + // If overflow happens, the stored value of output string size will be incorrect due to + // downcasting. In such cases, the entire output string size array should be discarded. + d_offsets[idx] = static_cast(output_size); + } else if (repeat_times > 0 && string_size > 0) { + auto const d_str = strings_dv.element(idx); + auto const input_ptr = d_str.data(); + auto output_ptr = d_chars + d_offsets[idx]; + for (size_type repeat_idx = 0; repeat_idx < repeat_times; ++repeat_idx) { + output_ptr = copy_and_increment(output_ptr, input_ptr, string_size); + } + } + + // The output_size value may be used to sum up to detect overflow at the caller site. + // The caller can detect overflow easily by checking `SUM(output_size) > INT_MAX`. + return output_size; + } +}; + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as creating the output. + * + * This function is similar to `strings::detail::make_strings_children`, except that it accepts an + * optional input `std::optional` that can contain the precomputed sizes of the output + * strings. 
+ */ +template +auto make_strings_children(Func fn, + size_type exec_size, + size_type strings_count, + std::optional output_strings_sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template data(); + fn.d_offsets = d_offsets; + + // This may be called twice -- once for offsets and once for chars. + auto for_each_fn = [exec_size, stream](Func& fn) { + thrust::for_each_n( + rmm::exec_policy(stream), thrust::make_counting_iterator(0), exec_size, fn); + }; + + if (!output_strings_sizes.has_value()) { + // Compute the output sizes only if they are not given. + for_each_fn(fn); + + // Compute the offsets values. + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + } else { + // Compute the offsets values from the provided output string sizes. + auto const string_sizes = output_strings_sizes.value(); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value())); + thrust::inclusive_scan(rmm::exec_policy(stream), + string_sizes.template begin(), + string_sizes.template end(), + d_offsets + 1); + } + + // Now build the chars column + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + auto chars_column = create_chars_child_column(bytes, stream, mr); + + // Execute the function fn again to fill the chars column. + // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. 
+ if (bytes > 0) { + fn.d_chars = chars_column->mutable_view().template data(); + for_each_fn(fn); + } + + return std::make_pair(std::move(offsets_column), std::move(chars_column)); +} + +} // namespace + +std::unique_ptr repeat_strings(strings_column_view const& input, + column_view const& repeat_times, + std::optional output_strings_sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); + CUDF_EXPECTS(cudf::is_index_type(repeat_times.type()), + "repeat_strings expects an integer type for the `repeat_times` input column."); + if (output_strings_sizes.has_value()) { + auto const output_sizes = output_strings_sizes.value(); + CUDF_EXPECTS(input.size() == output_sizes.size() && + (!output_sizes.nullable() || !output_sizes.has_nulls()), + "The given column of output string sizes is invalid."); + } + + auto const strings_count = input.size(); + if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); } + + auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); + auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); + auto const strings_has_nulls = input.has_nulls(); + auto const rtimes_has_nulls = repeat_times.has_nulls(); + auto const repeat_times_iter = + cudf::detail::indexalator_factory::make_input_iterator(repeat_times); + auto const fn = compute_size_and_repeat_separately_fn{ + *strings_dv_ptr, *repeat_times_dv_ptr, repeat_times_iter, strings_has_nulls, rtimes_has_nulls}; + + auto [offsets_column, chars_column] = + make_strings_children(fn, strings_count, strings_count, output_strings_sizes, stream, mr); + + // We generate new bitmask by AND of the input columns' bitmasks. + // Note that if the input columns are nullable, the output column will also be nullable (which may + // not have nulls). 
+ auto null_mask = + cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); + + return make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + UNKNOWN_NULL_COUNT, + std::move(null_mask), + stream, + mr); +} + +std::pair, int64_t> repeat_strings_output_sizes( + strings_column_view const& input, + column_view const& repeat_times, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size."); + CUDF_EXPECTS( + cudf::is_index_type(repeat_times.type()), + "repeat_strings_output_sizes expects an integer type for the `repeat_times` input column."); + + auto const strings_count = input.size(); + if (strings_count == 0) { + return std::make_pair(make_empty_column(data_type{type_to_id()}), int64_t{0}); + } + + auto output_sizes = make_numeric_column( + data_type{type_to_id()}, strings_count, mask_state::UNALLOCATED, stream, mr); + + auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); + auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); + auto const strings_has_nulls = input.has_nulls(); + auto const rtimes_has_nulls = repeat_times.has_nulls(); + auto const repeat_times_iter = + cudf::detail::indexalator_factory::make_input_iterator(repeat_times); + + auto const fn = compute_size_and_repeat_separately_fn{ + *strings_dv_ptr, + *repeat_times_dv_ptr, + repeat_times_iter, + strings_has_nulls, + rtimes_has_nulls, + output_sizes->mutable_view().template begin()}; + + auto const total_bytes = + thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + fn, + int64_t{0}, + thrust::plus{}); + + return std::make_pair(std::move(output_sizes), total_bytes); +} + } // namespace detail -std::unique_ptr repeat_strings(string_scalar const& input, - size_type repeat_times, - 
rmm::mr::device_memory_resource* mr) +std::unique_ptr repeat_string(string_scalar const& input, + size_type repeat_times, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings(input, repeat_times, rmm::cuda_stream_default, mr); + return detail::repeat_string(input, repeat_times, rmm::cuda_stream_default, mr); } std::unique_ptr repeat_strings(strings_column_view const& input, @@ -190,5 +398,24 @@ std::unique_ptr repeat_strings(strings_column_view const& input, return detail::repeat_strings(input, repeat_times, rmm::cuda_stream_default, mr); } +std::unique_ptr repeat_strings(strings_column_view const& input, + column_view const& repeat_times, + std::optional output_strings_sizes, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::repeat_strings( + input, repeat_times, output_strings_sizes, rmm::cuda_stream_default, mr); +} + +std::pair, int64_t> repeat_strings_output_sizes( + strings_column_view const& input, + column_view const& repeat_times, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::repeat_strings_output_sizes(input, repeat_times, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp index 98f91cb2a0a..69d0494c253 100644 --- a/cpp/tests/strings/repeat_strings_tests.cpp +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,219 +26,681 @@ using namespace cudf::test::iterators; namespace { -using STR_COL = cudf::test::strings_column_wrapper; +using strs_col = cudf::test::strings_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +constexpr int32_t null{0}; // mark for null elements in a column of int32_t values constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; } // namespace -struct 
RepeatJoinStringTest : public cudf::test::BaseFixture { +struct RepeatStringsTest : public cudf::test::BaseFixture { }; -TEST_F(RepeatJoinStringTest, InvalidStringScalar) +template +struct RepeatStringsTypedTest : public cudf::test::BaseFixture { +}; + +// Test for signed types only, as we will need to use non-positive values. +using TypesForTest = cudf::test::Types; +TYPED_TEST_SUITE(RepeatStringsTypedTest, TypesForTest); + +TYPED_TEST(RepeatStringsTypedTest, InvalidStringScalar) { auto const str = cudf::string_scalar("", false); - auto const result = cudf::strings::repeat_strings(str, 3); + auto const result = cudf::strings::repeat_string(str, 3); EXPECT_EQ(result->is_valid(), false); } -TEST_F(RepeatJoinStringTest, ZeroSizeStringScalar) +TYPED_TEST(RepeatStringsTypedTest, ZeroSizeStringScalar) { auto const str = cudf::string_scalar(""); - auto const result = cudf::strings::repeat_strings(str, 3); + auto const result = cudf::strings::repeat_string(str, 3); EXPECT_EQ(result->is_valid(), true); EXPECT_EQ(result->size(), 0); } -TEST_F(RepeatJoinStringTest, ValidStringScalar) +TYPED_TEST(RepeatStringsTypedTest, ValidStringScalar) { auto const str = cudf::string_scalar("abc123xyz-"); { - auto const result = cudf::strings::repeat_strings(str, 3); - auto const expected = cudf::string_scalar("abc123xyz-abc123xyz-abc123xyz-"); - CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), result->data(), expected.size()); + auto const result = cudf::strings::repeat_string(str, 3); + auto const expected_strs = cudf::string_scalar("abc123xyz-abc123xyz-abc123xyz-"); + CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected_strs.data(), result->data(), expected_strs.size()); } // Repeat once. { - auto const result = cudf::strings::repeat_strings(str, 1); + auto const result = cudf::strings::repeat_string(str, 1); CUDF_TEST_EXPECT_EQUAL_BUFFERS(str.data(), result->data(), str.size()); } // Zero repeat times. 
{ - auto const result = cudf::strings::repeat_strings(str, 0); + auto const result = cudf::strings::repeat_string(str, 0); EXPECT_EQ(result->is_valid(), true); EXPECT_EQ(result->size(), 0); } // Negative repeat times. { - auto const result = cudf::strings::repeat_strings(str, -10); + auto const result = cudf::strings::repeat_string(str, -10); EXPECT_EQ(result->is_valid(), true); EXPECT_EQ(result->size(), 0); } // Repeat too many times. { - EXPECT_THROW(cudf::strings::repeat_strings(str, std::numeric_limits::max() / 2), + EXPECT_THROW(cudf::strings::repeat_string(str, std::numeric_limits::max() / 2), cudf::logic_error); } } -TEST_F(RepeatJoinStringTest, ZeroSizeStringsColumn) +TYPED_TEST(RepeatStringsTypedTest, ZeroSizeStringsColumnWithScalarRepeatTimes) { - auto const strs = STR_COL{}; + auto const strs = strs_col{}; auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } -TEST_F(RepeatJoinStringTest, AllEmptyStringsColumn) +TYPED_TEST(RepeatStringsTypedTest, ZeroSizeStringsColumnWithColumnRepeatTimes) { - auto const strs = STR_COL{"", "", "", "", ""}; + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{}; + auto const repeat_times = ints_col{}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); +} + +TYPED_TEST(RepeatStringsTypedTest, AllEmptyStringsColumnWithScalarRepeatTimes) +{ + auto const strs = strs_col{"", "", "", "", ""}; auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } -TEST_F(RepeatJoinStringTest, AllNullStringsColumn) +TYPED_TEST(RepeatStringsTypedTest, AllEmptyStringsColumnWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{"", "", "", "", ""}; + auto 
const repeat_times = ints_col{-2, -1, 0, 1, 2}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); +} + +TYPED_TEST(RepeatStringsTypedTest, AllNullStringsColumnWithScalarRepeatTimes) { - auto const strs = STR_COL{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}; + auto const strs = strs_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}; auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } -TEST_F(RepeatJoinStringTest, ZeroSizeAndNullStringsColumn) +TYPED_TEST(RepeatStringsTypedTest, AllNullStringsColumnWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}; + auto const strs_cv = cudf::strings_column_view(strs); + + // The repeat_times column contains all valid numbers. + { + auto const repeat_times = ints_col{-1, 0, 1}; + auto const results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); + } + + // The repeat_times column also contains some nulls and some valid numbers. + { + auto const repeat_times = ints_col{{null, 1, null}, nulls_at({0, 2})}; + auto const results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); + } + + // The repeat_times column also contains all nulls. 
+ { + auto const repeat_times = ints_col{{null, null, null}, all_nulls()}; + auto const results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithAllNullColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{"ABC", "abc", "xyz"}; + auto const repeat_times = ints_col{{null, null, null}, all_nulls()}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), repeat_times); + auto const expected_strs = strs_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); +} + +TYPED_TEST(RepeatStringsTypedTest, ZeroSizeAndNullStringsColumnWithScalarRepeatTimes) { auto const strs = - STR_COL{{"" /*NULL*/, "", "" /*NULL*/, "", "", "" /*NULL*/}, nulls_at({0, 2, 5})}; + strs_col{{"" /*NULL*/, "", "" /*NULL*/, "", "", "" /*NULL*/}, nulls_at({0, 2, 5})}; auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } -TEST_F(RepeatJoinStringTest, StringsColumnNoNull) +TYPED_TEST(RepeatStringsTypedTest, ZeroSizeAndNullStringsColumnWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = + strs_col{{"" /*NULL*/, "", "" /*NULL*/, "", "", "" /*NULL*/}, nulls_at({0, 2, 5})}; + auto const repeat_times = ints_col{1, 2, 3, 4, 5, 6}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); +} + +TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput) { - auto const strs = STR_COL{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; + auto const strs = strs_col{"abc", "xyz"}; + auto const strs_cv = cudf::strings_column_view(strs); + + // Sizes mismatched between strings 
column and repeat_times column. + { + auto const repeat_times = int32s_col{1, 2, 3, 4, 5, 6}; + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); + } + + // Sizes mismatched between strings column and output_strings_sizes column. + { + auto const repeat_times = int32s_col{1, 2}; + auto const sizes = int32s_col{1, 2, 3, 4, 5}; + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); + } + // output_strings_sizes column has nulls. { - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 2); - auto const expected = STR_COL{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé", "áááááá", "íííí"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const repeat_times = int32s_col{1, 2}; + auto const sizes = int32s_col{{null, 2}, null_at(0)}; + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times, sizes), cudf::logic_error); + } + + // Invalid data type for repeat_times column. + { + auto const repeat_times = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6}; + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); + } + + // Invalid data type for repeat_times column. 
+ { + auto const repeat_times = strs_col{"xxx", "xxx"}; + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); + } +} + +TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput) +{ + auto const strs = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"}; + auto const strs_cv = cudf::strings_column_view(strs); + + auto const half_max = std::numeric_limits::max() / 2; + auto const repeat_times = + int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max}; + + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); + (void)sizes; + auto const expected_bytes = static_cast(half_max) * int64_t{1 + 2 + 3 + 4 + 5 + 6 + 7}; + EXPECT_EQ(expected_bytes, total_bytes); +} + +TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes) +{ + auto const strs = strs_col{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; + auto const strs_cv = cudf::strings_column_view(strs); + + { + auto const results = cudf::strings::repeat_strings(strs_cv, 2); + auto const expected_strs = + strs_col{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé", "áááááá", "íííí"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Repeat once. { - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 1); + auto const results = cudf::strings::repeat_strings(strs_cv, 1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } // Non-positive repeat times. 
{ - auto const expected = STR_COL{"", "", "", "", ""}; + auto const expected_strs = strs_col{"", "", "", "", ""}; + + auto results = cudf::strings::repeat_strings(strs_cv, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + results = cudf::strings::repeat_strings(strs_cv, -100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; + auto const strs_cv = cudf::strings_column_view(strs); + + // Repeat once. + { + auto const repeat_times = ints_col{1, 1, 1, 1, 1}; + auto const results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); + } + + // repeat_times column has negative values. + { + auto const repeat_times = ints_col{1, 2, 3, -1, -2}; + auto const expected_strs = strs_col{"0a0b0c", "abcxyzabcxyz", "xyzéééxyzéééxyzééé", "", ""}; + + auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{6, 12, 27, 0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(45, total_bytes); + + results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // repeat_times column has nulls. 
+ { + auto const repeat_times = ints_col{{1, null, 3, 2, null}, nulls_at({1, 4})}; + auto const expected_strs = strs_col{ + {"0a0b0c", "" /*NULL*/, "xyzéééxyzéééxyzééé", "áááááá", "" /*NULL*/}, nulls_at({1, 4})}; + + auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); - auto results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const expected_sizes = int32s_col{6, 0, 27, 12, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(45, total_bytes); - results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), -100); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } +} + +TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithScalarRepeatTimes) +{ + auto const strs = strs_col{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; // Sliced the first half of the column. { auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. 
{ auto const sliced_strs = cudf::slice(strs, {1, 3})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{"abcxyzabcxyz", "xyzéééxyzééé"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{"abcxyzabcxyz", "xyzéééxyzééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column. { auto const sliced_strs = cudf::slice(strs, {2, 5})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{"xyzéééxyzééé", "áááááá", "íííí"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); - } -} - -TEST_F(RepeatJoinStringTest, StringsColumnWithNulls) -{ - auto const strs = STR_COL{{"0a0b0c", - "" /*NULL*/, - "abcxyz", - "" /*NULL*/, - "xyzééé", - "" /*NULL*/, - "ááá", - "íí", - "", - "Hello World"}, - nulls_at({1, 3, 5})}; - - { - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 2); - auto const expected = STR_COL{{"0a0b0c0a0b0c", - "" /*NULL*/, - "abcxyzabcxyz", - "" /*NULL*/, - "xyzéééxyzééé", - "" /*NULL*/, - "áááááá", - "íííí", - "", - "Hello WorldHello World"}, - nulls_at({1, 3, 5})}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{"xyzéééxyzééé", "áááááá", "íííí"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnNoNullWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; + auto const repeat_times = ints_col{1, 2, 3, 2, 3, 4, 5, 6, 7, 8, 
9, 10}; + + // Sliced the first half of the column. + { + auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {0, 3})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{"0a0b0c", "abcxyzabcxyz", "xyzéééxyzéééxyzééé"}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{6, 12, 27}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(45, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // Sliced the middle of the column. + { + auto const sliced_strs = cudf::slice(strs, {1, 3})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {1, 3})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{"abcxyzabcxyz", "xyzéééxyzéééxyzééé"}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{12, 27}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(39, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // Sliced the second half of the column. 
+ { + auto const sliced_strs = cudf::slice(strs, {2, 5})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {2, 5})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{"xyzéééxyzéééxyzééé", "áááááá", "íííííí"}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{27, 12, 12}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(51, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithScalarRepeatTimes) +{ + auto const strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "abcxyz", + "" /*NULL*/, + "xyzééé", + "" /*NULL*/, + "ááá", + "íí", + "", + "Hello World"}, + nulls_at({1, 3, 5})}; + auto const strs_cv = cudf::strings_column_view(strs); + + { + auto const results = cudf::strings::repeat_strings(strs_cv, 2); + auto const expected_strs = strs_col{{"0a0b0c0a0b0c", + "" /*NULL*/, + "abcxyzabcxyz", + "" /*NULL*/, + "xyzéééxyzééé", + "" /*NULL*/, + "áááááá", + "íííí", + "", + "Hello WorldHello World"}, + nulls_at({1, 3, 5})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Repeat once. { - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 1); + auto const results = cudf::strings::repeat_strings(strs_cv, 1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); } // Non-positive repeat times. 
{ - auto const expected = STR_COL{ + auto const expected_strs = strs_col{ {"", "" /*NULL*/, "", "" /*NULL*/, "", "" /*NULL*/, "", "", "", ""}, nulls_at({1, 3, 5})}; - auto results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto results = cudf::strings::repeat_strings(strs_cv, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + results = cudf::strings::repeat_strings(strs_cv, -100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, StringsColumnWithNullsWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "abcxyz", + "" /*NULL*/, + "xyzééé", + "" /*NULL*/, + "ááá", + "íí", + "", + "Hello World"}, + nulls_at({1, 3, 5})}; + auto const strs_cv = cudf::strings_column_view(strs); + + // Repeat once. + { + auto const repeat_times = ints_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, verbosity); + } + + // repeat_times column has negative values. 
+ { + auto const repeat_times = ints_col{1, 2, 3, -1, -2, 1, 2, 3, -5, 0}; + auto const expected_strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "abcxyzabcxyzabcxyz", + "" /*NULL*/, + "", + "" /*NULL*/, + "áááááá", + "íííííí", + "", + ""}, + nulls_at({1, 3, 5})}; + + auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{6, 0, 18, 0, 0, 0, 12, 12, 0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(48, total_bytes); + + results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } - results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), -100); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + // repeat_times column has nulls. 
+ { + auto const repeat_times = + ints_col{{1, 2, null, -1, null, 1, 2, null, -5, 0}, nulls_at({2, 4, 7})}; + auto const expected_strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "áááááá", + "" /*NULL*/, + "", + ""}, + nulls_at({1, 2, 3, 4, 5, 7})}; + + auto results = cudf::strings::repeat_strings(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{6, 0, 0, 0, 0, 0, 12, 0, 0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(strs_cv, repeat_times); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(18, total_bytes); + + results = cudf::strings::repeat_strings(strs_cv, repeat_times, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } +} + +TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithScalarRepeatTimes) +{ + auto const strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "abcxyz", + "" /*NULL*/, + "xyzééé", + "" /*NULL*/, + "ááá", + "íí", + "", + "Hello World"}, + nulls_at({1, 3, 5})}; // Sliced the first half of the column. { auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{{"0a0b0c0a0b0c", "" /*NULL*/, "abcxyzabcxyz"}, null_at(1)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{{"0a0b0c0a0b0c", "" /*NULL*/, "abcxyzabcxyz"}, null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the middle of the column. 
{ auto const sliced_strs = cudf::slice(strs, {2, 7})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{ + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{ {"abcxyzabcxyz", "" /*NULL*/, "xyzéééxyzééé", "" /*NULL*/, "áááááá"}, nulls_at({1, 3})}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); } // Sliced the second half of the column. { auto const sliced_strs = cudf::slice(strs, {6, 10})[0]; - auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); - auto const expected = STR_COL{"áááááá", "íííí", "", "Hello WorldHello World"}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected_strs = strs_col{"áááááá", "íííí", "", "Hello WorldHello World"}; // The results strings column may have a bitmask with all valid values. - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); + } +} + +TYPED_TEST(RepeatStringsTypedTest, SlicedStringsColumnWithNullsWithColumnRepeatTimes) +{ + using ints_col = cudf::test::fixed_width_column_wrapper; + + auto const strs = strs_col{{"0a0b0c", + "" /*NULL*/, + "abcxyz", + "" /*NULL*/, + "xyzééé", + "" /*NULL*/, + "ááá", + "íí", + "", + "Hello World"}, + nulls_at({1, 3, 5})}; + + auto const repeat_times = + ints_col{{1, 2, null, -1, null, 1, 2, null, -5, 0, 6, 7, 8, 9, 10}, nulls_at({2, 4, 7})}; + + // Sliced the first half of the column. 
+ { + auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {0, 3})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{{"0a0b0c", "" /*NULL*/, "" /*NULL*/}, nulls_at({1, 2})}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{6, 0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(6, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // Sliced the middle of the column. + { + auto const sliced_strs = cudf::slice(strs, {2, 7})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {2, 7})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{ + {"" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "áááááá"}, nulls_at({0, 1, 2, 3})}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{0, 0, 0, 0, 12}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(12, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // Sliced the second half of the column, output has nulls. 
+ { + auto const sliced_strs = cudf::slice(strs, {6, 10})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {6, 10})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{{"áááááá", "" /*NULL*/, "", ""}, null_at(1)}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{12, 0, 0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(12, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_strs, *results, verbosity); + } + + // Sliced the second half of the column, output does not have null. + // Since the input has nulls, the output column is nullable (but doesn't have nulls). 
+ { + auto const sliced_strs = cudf::slice(strs, {8, 10})[0]; + auto const sliced_rtimes = cudf::slice(repeat_times, {8, 10})[0]; + auto const sliced_strs_cv = cudf::strings_column_view(sliced_strs); + auto const expected_strs = strs_col{"", ""}; + + auto results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); + + auto const expected_sizes = int32s_col{0, 0}; + auto const [sizes, total_bytes] = + cudf::strings::repeat_strings_output_sizes(sliced_strs_cv, sliced_rtimes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *sizes, verbosity); + EXPECT_EQ(0, total_bytes); + + results = cudf::strings::repeat_strings(sliced_strs_cv, sliced_rtimes, *sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_strs, *results, verbosity); } } diff --git a/java/src/main/native/src/ScalarJni.cpp b/java/src/main/native/src/ScalarJni.cpp index 50e6a66ce4f..fb4f14fdb80 100644 --- a/java/src/main/native/src/ScalarJni.cpp +++ b/java/src/main/native/src/ScalarJni.cpp @@ -517,7 +517,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_repeatString(JNIEnv *env, jcl try { cudf::jni::auto_set_device(env); auto const str = *reinterpret_cast(handle); - return reinterpret_cast(cudf::strings::repeat_strings(str, repeat_times).release()); + return reinterpret_cast(cudf::strings::repeat_string(str, repeat_times).release()); } CATCH_STD(env, 0); }