From 6920f9be9237c77258972aab9bfebd1566ac11aa Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 21 May 2021 13:17:20 -0400 Subject: [PATCH 1/5] Update readme with correct CUDA versions (#8315) Replaces CUDA 10.1/10.2 with 11.0/11.2. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/8315 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 733d1c7897b..587f18d2603 100644 --- a/README.md +++ b/README.md @@ -67,13 +67,13 @@ cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), For `cudf version == 21.06` : ```bash -# for CUDA 10.1 +# for CUDA 11.0 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.1 + cudf=21.06 python=3.7 cudatoolkit=11.0 -# or, for CUDA 10.2 +# or, for CUDA 11.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.2 + cudf=21.06 python=3.7 cudatoolkit=11.2 ``` From 5c6b92a38c5a82ee259b6414a8bbc568d8e78389 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 21 May 2021 10:44:27 -0700 Subject: [PATCH 2/5] COLLECT_LIST support returning empty output columns. (#8279) Fixes the group-by portion of #7611. When `COLLECT_LIST()` or `COLLECT_SET()` aggregations are called on a grouped input, if the input column is empty, then one sees the following failure: ``` C++ exception with description "cuDF failure at: .../cpp/src/column/column_factories.cpp:67: make_empty_column is invalid to call on nested types" thrown in the test body. ``` The operation should have resulted in an empty `LIST` column. `make_empty_column()` does not support `LIST` types (in part because the `data_type` parameter does not capture the types of the child columns). This commit fixes this by constructing the output column from the specified `values` input, but only for `COLLECT_LIST()` and `COLLECT_SET()`; other aggregation types are unchanged. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/8279 --- cpp/src/groupby/groupby.cu | 41 +++++++++++++- cpp/tests/groupby/collect_list_tests.cpp | 70 ++++++++++++++++++++++++ cpp/tests/groupby/collect_set_tests.cpp | 3 +- cpp/tests/groupby/nth_element_tests.cpp | 40 ++++++++++++++ 4 files changed, 151 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a5fd6d6f9bb..f132d6b1511 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -79,6 +79,44 @@ std::pair, std::vector> groupby::disp groupby::~groupby() = default; namespace { + +/** + * @brief Factory to construct empty result columns. + * + * Adds special handling for COLLECT_LIST/COLLECT_SET, because: + * 1. `make_empty_column()` does not support construction of nested columns. + * 2. Empty lists need empty child columns, to persist type information. + */ +struct empty_column_constructor { + column_view values; + + template + std::unique_ptr operator()() const + { + using namespace cudf; + using namespace cudf::detail; + + if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { + return make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + } + + // If `values` is LIST typed, and the aggregation results match the type, + // construct empty results based on `values`. + // Most generally, this applies if input type matches output type. + // + // Note: `target_type_t` is not recursive, and `ValuesType` does not consider children. + // It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this + // point, because `COLLECT_LIST(LIST)` produces `LIST`, but `target_type_t` + // wouldn't know the difference. + if constexpr (std::is_same_v, ValuesType>) { + return empty_like(values); + } + + return make_empty_column(target_type(values.type(), k)); + } +}; + /// Make an empty table with appropriate types for requested aggs auto empty_results(host_span requests) { @@ -93,7 +131,8 @@ auto empty_results(host_span requests) request.aggregations.end(), std::back_inserter(results), [&request](auto const& agg) { - return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind)); + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), agg->kind, empty_column_constructor{request.values}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 7580c1c4e3b..9d2141c913c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper values{}; + + fixed_width_column_wrapper expect_keys{}; + lists_column_wrapper expect_vals{}; + + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + auto offsets = data_type{type_to_id()}; + + fixed_width_column_wrapper keys{}; + auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_child = + cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + auto expect_values = + cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + fixed_width_column_wrapper keys{}; + auto struct_child = LCW{}; + auto struct_column = structs_column_wrapper{{struct_child}}; + + auto values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), struct_column.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_struct_child = LCW{}; + auto expect_struct_column = structs_column_wrapper{{expect_struct_child}}; + + auto expect_child = + cudf::make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + expect_struct_column.release(), + 0, + {}); + auto expect_values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, dictionary) { using K = int32_t; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index ce3a9a49372..d5a881a1993 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); TYPED_TEST(CollectSetTypedTest, TrivialInput) { // Empty input - // TODO: Enable this test after issue#7611 has been fixed - // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set()); // Single key input { diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index ec0265a3023..5630cba09da 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -362,5 +362,45 @@ TEST_F(groupby_nth_element_string_test, dictionary) keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); } +template +struct groupby_nth_element_lists_test : BaseFixture { +}; + +TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_nth_element_lists_test, Basics) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{1, 1, 2, 2, 3, 3}; + auto values = lists{{1, 2}, {3, 4}, {5, 6, 7}, lists{}, {9, 10}, {11}}; + + auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; + auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); +} + +TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{}; + auto values = lists{}; + + auto expected_keys = fixed_width_column_wrapper{}; + auto expected_values = lists{}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); +} + } // namespace test } // namespace cudf From de579a59714f960fe33440811b4c49e5efeb3f3f Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Fri, 21 May 2021 16:05:18 -0400 Subject: [PATCH 3/5] Added decimal writing for CSV writer (#8296) Addresses #7110 column_to_strings_fn was specialized for fixed point type to enable support for csv writer. A test was added to validate output file created by csv writer for decimal type column. Authors: - Kumar Aatish (https://github.com/kaatish) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/8296 --- cpp/src/io/csv/writer_impl.cu | 12 +++- cpp/tests/io/csv_test.cpp | 104 ++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..13760381373 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -119,7 +119,8 @@ struct column_to_strings_fn { return not((std::is_same::value) || (std::is_integral::value) || (std::is_floating_point::value) || - (cudf::is_timestamp()) || (cudf::is_duration())); + (cudf::is_fixed_point()) || (cudf::is_timestamp()) || + (cudf::is_duration())); } explicit column_to_strings_fn( @@ -189,6 +190,15 @@ struct column_to_strings_fn { return cudf::strings::detail::from_floats(column, stream_, mr_); } + // fixed point: + // + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_fixed_point(column, stream_, mr_); + } + // timestamps: // template diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 6bc08cf24a6..e45b67505ba 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -22,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -61,6 +63,16 @@ using table_view = cudf::table_view; auto const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); +// Base test fixture for tests +struct CsvWriterTest : public cudf::test::BaseFixture { +}; + +template +struct CsvFixedPointWriterTest : public CsvWriterTest { +}; + +TYPED_TEST_CASE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); + // Base test fixture for tests struct CsvReaderTest : public cudf::test::BaseFixture { }; @@ -307,6 +319,98 @@ TYPED_TEST(CsvReaderNumericTypeTest, SingleColumn) expect_column_data_equal(std::vector(sequence, sequence + num_rows), view.column(0)); } +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) +{ + std::vector reference_strings = { + "1.23", "-8.76", "5.43", "-0.12", "0.25", "-0.23", "-0.27", "0.00", "0.00"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnNegativeScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) +{ + std::vector reference_strings = { + "123000", "-876000", "543000", "-12000", "25000", "-23000", "-27000", "0000", "0000"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnPositiveScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + TEST_F(CsvReaderTest, MultiColumn) { constexpr auto num_rows = 10; From 696902d236eb580f947a89ddd147d1c6b7fd1c89 Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Sun, 23 May 2021 06:33:51 -0500 Subject: [PATCH 4/5] Enable implicit casting when concatenating mixed types (#8276) This enables implicit casting when decimal columns are concatenated with numeric columns by casting the numeric columns to decimal columns. Closes #8264 Authors: - https://github.com/ChrisJar Approvers: - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8276 --- python/cudf/cudf/core/frame.py | 19 +- python/cudf/cudf/core/series.py | 9 +- python/cudf/cudf/tests/test_concat.py | 265 ++++++++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 26 ++- 4 files changed, 291 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..cda4e8cbd4c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,6 +32,7 @@ is_numerical_dtype, is_scalar, min_scalar_type, + find_common_type, ) T = TypeVar("T", bound="Frame") @@ -4029,8 +4030,11 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type([col.dtype for col in cols], []) + if all( + is_numerical_dtype(col.dtype) or is_decimal_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) for col in cols @@ -4045,17 +4049,6 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) for col in cols - ): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max([col.dtype.scale for col in cols]) - lhs = max([col.dtype.precision - col.dtype.scale for col in cols]) - # Combine to get the necessary precision and clip at the maximum - # precision - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtypes[idx] = cudf.Decimal64Dtype(p, s) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d812214caf8..a894baf8235 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -45,7 +45,6 @@ from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _decimal_normalize_types, can_convert_to_column, is_decimal_dtype, is_list_dtype, @@ -53,7 +52,7 @@ is_mixed_with_object_dtype, is_scalar, min_scalar_type, - numeric_normalize_types, + find_common_type, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2402,10 +2401,8 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if isinstance(objs[0]._column, cudf.core.column.DecimalColumn): - objs = _decimal_normalize_types(*objs) - else: - objs = numeric_normalize_types(*objs) + common_dtype = find_common_type([obj.dtype for obj in objs]) + objs = [obj.astype(common_dtype) for obj in objs] col = _concat_columns([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 31dc6012905..5c4c121db4d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from decimal import Decimal import cudf as gd from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -1262,3 +1263,267 @@ def test_concat_decimal_series(ltype, rtype): expected = pd.concat([ps1, ps2]) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1, df2, df3, expected", + [ + ( + gd.DataFrame( + {"val": [Decimal("42.5"), Decimal("8.7")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame( + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + dtype=Decimal64Dtype(6, 4), + ), + gd.DataFrame({"val": [8, -5]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + dtype=Decimal64Dtype(7, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("95.2"), Decimal("23.4")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame({"val": [54, 509]}, dtype="uint16"), + gd.DataFrame({"val": [24, -48]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("36.56"), + Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), + gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): + df = gd.concat([df1, df2, df3]) + assert_eq(df, expected) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "s1, s2, s3, expected", + [ + ( + gd.Series( + [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) + ), + gd.Series( + [Decimal("101.243"), Decimal("-92.449")], + dtype=Decimal64Dtype(9, 6), + ), + gd.Series([94, -22], dtype="int32"), + gd.Series( + [ + Decimal("32.8"), + Decimal("-87.7"), + Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + dtype=Decimal64Dtype(10, 6), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series([33, 984], dtype="uint32"), + gd.Series([593, -702], dtype="int32"), + gd.Series( + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("982.94"), Decimal("-493.626")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([847.98, 254.442], dtype="float32"), + gd.Series([5299.262, -2049.25], dtype="float64"), + gd.Series( + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("492.204"), Decimal("-72824.455")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([8438, -27462], dtype="int64"), + gd.Series([-40.292, 49202.953], dtype="float64"), + gd.Series( + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_series(s1, s2, s3, expected): + s = gd.concat([s1, s2, s3]) + assert_eq(s, expected) + + +@pytest.mark.parametrize( + "s1, s2, expected", + [ + ( + gd.Series( + [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64"), + gd.Series( + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("-52.44"), Decimal("365.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), + dtype="datetime64[s]", + ), + gd.Series( + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + index=[0, 1, 0, 1, 2], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + gd.Series( + ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), + gd.Series( + ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], + index=[0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_non_numeric(s1, s2, expected): + s = gd.concat([s1, s2]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 16c35bab4b1..0b59116f8e6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -290,13 +290,15 @@ def is_decimal_dtype(obj): ) -def _decimal_normalize_types(*args): - s = max([a.dtype.scale for a in args]) - lhs = max([a.dtype.precision - a.dtype.scale for a in args]) +def _find_common_type_decimal(dtypes): + # Find the largest scale and the largest difference between + # precision and scale of the columns to be concatenated + s = max([dtype.scale for dtype in dtypes]) + lhs = max([dtype.precision - dtype.scale for dtype in dtypes]) + # Combine to get the necessary precision and clip at the maximum + # precision p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtype = cudf.Decimal64Dtype(p, s) - - return [a.astype(dtype) for a in args] + return cudf.Decimal64Dtype(p, s) def cudf_dtype_from_pydata_dtype(dtype): @@ -690,9 +692,15 @@ def find_common_type(dtypes): dtypes = set(dtypes) if any(is_decimal_dtype(dtype) for dtype in dtypes): - raise NotImplementedError( - "DecimalDtype is not yet supported in find_common_type" - ) + if all( + is_decimal_dtype(dtype) or is_numerical_dtype(dtype) + for dtype in dtypes + ): + return _find_common_type_decimal( + [dtype for dtype in dtypes if is_decimal_dtype(dtype)] + ) + else: + return np.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately From ef20706d2f66ba6b32611f99c7b265c26d543d11 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 24 May 2021 07:22:37 -0400 Subject: [PATCH 5/5] Add separator-on-null parameter to strings concatenate APIs (#8282) Closes #4728 This PR adds a new parameter to the `cudf::strings::concatenate` APIs to specify if separators should be added between null entries when the null-replacement (narep) parameter is valid. If the narep scalar is invalid (i.e. null itself) then the entire output row becomes null. If not, separators are added between each element. Examples: ``` s1 = ['a', 'b', null, 'dd', null] s2 = ['A', null, 'CC', 'D', null] concatenate( {s1, s2}, sep='+', narep=invalid ) -> ['a+A', null, null, 'dd+D', null] concatenate( {s1, s2}, sep='+', narep='@' ) -> ['a+A', 'b+@', '@+CC', 'dd+D', '@+@'] concatenate( {s1, s2}, sep='+', narep='' ) -> ['a+A', 'b+', '+CC', 'dd+D', '+'] ``` The new parameter is an enum `separator_on_nulls` which has `YES` or `NO` settings. The default parameter value will be `YES` to keep the current behavior as expected by Python cudf and for consistency with Pandas behavior. Specifying `NO` here will suppress the separator with null elements (when narep is valid). ``` concatenate( {s1, s2}, sep='+', narep='', NO ) -> ['a+A', 'b', 'CC', 'dd+D', ''] ``` This PR also changes the name of the `cudf::strings::concatenate_list_elements` API to `cudf::strings::join_list_elements` instead. The API pattern and behavior more mimic the `cudf::strings::join_strings` then the concatenate functions. Also, these are called by the Python `join` functions so the rename makes it more consistent with cudf. This is a breaking change in order to make these APIs more consistent. Previously, the separators column version was returning nulls only for an all-null row. This has been changed to honor the `separator_on_null` parameter instead. Currently there was no Python cudf API calling this version. Only the rename required minor changes to the Cython layer. The gtests were updated to reflect the new behavior. None of the pytests required any changes since the default parameter value matches the original behavior for those APIs that cudf actually calls. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Keith Kraus (https://github.com/kkraus14) - Thomas Graves (https://github.com/tgravescs) - Christopher Harris (https://github.com/cwharris) URL: https://github.com/rapidsai/cudf/pull/8282 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/strings/combine.hpp | 134 ++++++++----- cpp/include/cudf/strings/detail/combine.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 15 +- cpp/src/strings/combine/concatenate.cu | 177 +++++++++--------- ...list_elements.cu => join_list_elements.cu} | 128 +++++++------ cpp/tests/CMakeLists.txt | 2 +- .../strings/combine/concatenate_tests.cpp | 125 ++++++++++--- ...tests.cpp => join_list_elements_tests.cpp} | 117 +++++++----- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- 11 files changed, 445 insertions(+), 269 deletions(-) rename cpp/src/strings/combine/{concatenate_list_elements.cu => join_list_elements.cu} (64%) rename cpp/tests/strings/combine/{concatenate_list_elements_tests.cpp => join_list_elements_tests.cpp} (82%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af6f60b031d..aa3b4406320 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,8 +333,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..360efe15303 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,21 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -70,11 +79,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. + * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +90,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +124,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -116,15 +134,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -136,20 +148,30 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified * * @param strings_columns List of string columns to concatenate. * @param separator String that should inserted between each string from each row. @@ -157,6 +179,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +187,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +195,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. + * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,14 +233,16 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,21 +250,27 @@ std::unique_ptr concatenate_list_elements( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null elenent will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -244,13 +282,15 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 13760381373..bc0e1243d4f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -414,11 +415,19 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = [&] { + if (str_table_view.num_columns() > 1) + return cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); + cudf::string_scalar narep{options_.get_na_rep()}; + return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + }(); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..1329ad3113f 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,67 +41,93 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. - */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_columns = strings_columns.num_columns(); - CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); + CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), strings_columns.end(), [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - if (num_columns == 1) // single strings column returns a copy - return std::make_unique(*(strings_columns.begin()), stream, mr); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -112,7 +138,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +146,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +171,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. */ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +234,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +267,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu similarity index 64% rename from cpp/src/strings/combine/concatenate_list_elements.cu rename to cpp/src/strings/combine/join_list_elements.cu index 1157b8f3fce..7a83097566c 100644 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct compute_size_and_concatenate_fn { offset_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; + separator_on_nulls const separate_nulls; offset_type* d_offsets{nullptr}; @@ -72,33 +74,38 @@ struct compute_size_and_concatenate_fn { return; } - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto const separator = func.separator(idx); + auto size_bytes = size_type{0}; + char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + bool write_separator = false; for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + bool null_element = strings_dv.is_null(str_idx); + + if (not d_chars and (null_element and not string_narep_dv.is_valid())) { d_offsets[idx] = 0; d_validities[idx] = false; return; // early termination: the entire list of strings will result in a null string } - auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + if (output_ptr) output_ptr = detail::copy_string(output_ptr, separator); + size_bytes += separator.size_bytes(); + write_separator = false; } + + auto const d_str = + null_element ? string_narep_dv.value() : strings_dv.element(str_idx); + if (output_ptr) output_ptr = detail::copy_string(output_ptr, d_str); + size_bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - // Separator is inserted only in between strings if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); + d_offsets[idx] = size_bytes; d_validities[idx] = true; } } @@ -123,11 +130,12 @@ struct scalar_separator_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -146,14 +154,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv = get_scalar_device_view(const_cast(separator)); auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -191,12 +199,13 @@ struct column_separators_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -217,14 +226,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -239,25 +248,32 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists } // namespace detail -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements(lists_strings_column, + separators, + separator_narep, + string_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f36ec70479b..bbcfd69a52b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..d91f669e42d 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", "ééé#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -107,6 +159,12 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } +TEST_F(StringsCombineTest, SingleColumnErrorCheck) +{ + cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); +} + struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; @@ -157,7 +215,6 @@ TEST_F(StringsConcatenateWithColSeparatorTest, SingleColumnEmptyAndNullStringsNo auto exp_results = cudf::test::strings_column_wrapper({"", "", "", ""}, {false, true, false, false}); - auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(sep_col)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); @@ -295,12 +352,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +380,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +410,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!éaff", - "éééf", - "éa", - "", - "éaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp similarity index 82% rename from cpp/tests/strings/combine/concatenate_list_elements_tests.cpp rename to cpp/tests/strings/combine/join_list_elements_tests.cpp index b6afd588dfb..e2f7c3e36a2 100644 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -58,7 +58,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) { auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv), cudf::logic_error); } // Invalid scalar separator @@ -66,9 +66,8 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) auto const string_lists = STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); } // Invalid column separators @@ -77,7 +76,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, separators->view()), cudf::logic_error); } } @@ -87,26 +86,26 @@ TEST_F(StringsListsConcatenateTest, EmptyInput) auto const string_lists = STR_LISTS{}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) { auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}, STR_LISTS{}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; + auto const expected = STR_COL{"", "", "", ""}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const separators = STR_COL{"", "", "", ""}.release(); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } @@ -120,29 +119,35 @@ TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } +auto null_at(std::initializer_list indices) +{ + return cudf::detail::make_counting_transform_iterator( + 0, [indices](auto i) { return std::find(indices.begin(), indices.end(), i) == indices.end(); }); +} + TEST_F(StringsListsConcatenateTest, ScalarSeparator) { auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, STR_LISTS{}, /*NULL*/ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, null_at({1, 2})}}, null_at(1)} .release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); // No null replacement { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx", nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -150,10 +155,22 @@ TEST_F(StringsListsConcatenateTest, ScalarSeparator) // With null replacement { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx", "v+++___+++___+++w"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{"a+++ccc", nullptr, "efgh+++ijk", "zzz+++xxxxx", "v+++w"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -181,8 +198,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, @@ -202,7 +218,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -223,8 +239,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -234,7 +249,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; @@ -246,8 +261,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; auto const expected = @@ -258,7 +272,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"abcdef+++012345+++___+++xxx000", "___+++11111+++00000", @@ -274,8 +288,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; auto const expected = @@ -286,7 +299,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"zzz+++xxxxx", nullptr, @@ -318,7 +331,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // No null replacement { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const results = cudf::strings::join_list_elements(string_lv, separators->view()); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -327,8 +340,8 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for separators { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); + auto const results = + cudf::strings::join_list_elements(string_lv, separators->view(), cudf::string_scalar("|||")); std::vector h_expected{ nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = @@ -338,7 +351,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); std::vector h_expected{ "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; @@ -349,7 +362,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for both separators and strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); std::vector h_expected{"a+++XXXXX+++ccc", nullptr, @@ -361,6 +374,20 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + separators->view(), + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{ + "a+++ccc", nullptr, "0a0b0c+++xyzééé", "efgh+++ijk", "ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } } TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) @@ -390,7 +417,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, @@ -411,7 +438,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -433,7 +460,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -444,7 +471,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; @@ -457,7 +484,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{ nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; auto const expected = @@ -469,7 +496,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", "___~!~11111~!~00000", @@ -486,7 +513,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -497,7 +524,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"zzz|||xxxxx", nullptr, diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0],