Skip to content

Commit

Permalink
Merge branch 'branch-21.06' into fix_struct_comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
ttnghia committed May 24, 2021
2 parents 5d4c24a + ef20706 commit 000f62b
Show file tree
Hide file tree
Showing 21 changed files with 1,006 additions and 305 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,13 @@ cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html),

For `cudf version == 21.06` :
```bash
# for CUDA 10.1
# for CUDA 11.0
conda install -c rapidsai -c nvidia -c numba -c conda-forge \
cudf=21.06 python=3.7 cudatoolkit=10.1
cudf=21.06 python=3.7 cudatoolkit=11.0

# or, for CUDA 10.2
# or, for CUDA 11.2
conda install -c rapidsai -c nvidia -c numba -c conda-forge \
cudf=21.06 python=3.7 cudatoolkit=10.2
cudf=21.06 python=3.7 cudatoolkit=11.2

```

Expand Down
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ add_library(cudf
src/strings/char_types/char_cases.cu
src/strings/char_types/char_types.cu
src/strings/combine/concatenate.cu
src/strings/combine/concatenate_list_elements.cu
src/strings/combine/join.cu
src/strings/combine/join_list_elements.cu
src/strings/contains.cu
src/strings/convert/convert_booleans.cu
src/strings/convert/convert_datetime.cu
Expand Down
134 changes: 87 additions & 47 deletions cpp/include/cudf/strings/combine.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,12 +30,21 @@ namespace strings {
* @brief Strings APIs for concatenate and join
*/

/**
* @brief Selects whether separators are still emitted when a string
* element is null.
*/
enum class separator_on_nulls {
  YES = 0,  ///< Always add separators between elements
  NO  = 1   ///< Do not add separators if an element is null
};

/**
* @brief Concatenates all strings in the column into one new string delimited
* by an optional separator string.
*
* This returns a column with one string. Any null entries are ignored unless
* the narep parameter specifies a replacement string.
* the @p narep parameter specifies a replacement string.
*
* @code{.pseudo}
* Example:
Expand Down Expand Up @@ -70,11 +79,9 @@ std::unique_ptr<column> join_strings(
*
* - If row separator for a given row is null, output column for that row is null, unless
* there is a valid @p separator_narep
* - If all column values for a given row is null, output column for that row is null, unless
* there is a valid @p col_narep
* - null column values for a given row are skipped, if the column replacement isn't valid
* - The separator is only applied between two valid column values
* - If valid @p separator_narep and @p col_narep are provided, the output column is always
* - The separator is applied between two output row values if the @p separate_nulls
* is `YES` or only between valid rows if @p separate_nulls is `NO`.
* - If @p separator_narep and @p col_narep are both valid, the output column is always
* non nullable
*
* @code{.pseudo}
Expand All @@ -83,16 +90,25 @@ std::unique_ptr<column> join_strings(
* c1 = [null, 'cc', 'dd', null, null, 'gg']
* c2 = ['bb', '', null, null, null, 'hh']
* sep = ['::', '%%', '^^', '!', '*', null]
* out0 = concatenate([c0, c1, c2], sep)
* out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null]
* out = concatenate({c0, c1, c2}, sep)
* // all rows have at least one null or sep[i]==null
* out is [null, null, null, null, null, null]
*
* sep_rep = '+'
* out1 = concatenate([c0, c1, c2], sep, sep_rep)
* out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh']
*
* col_rep = '-'
* out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep)
* out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null]
* out = concatenate({c0, c1, c2}, sep, sep_rep)
* // all rows with at least one null output as null
* out is [null, null, null, null, null, 'ff+gg+hh']
*
* col_narep = '-'
* sep_na = non-valid scalar
* out = concatenate({c0, c1, c2}, sep, sep_na, col_narep)
* // only the null entry in the sep column produces a null row
* out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null]
*
* col_narep = ''
* out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls::NO)
* // parameter suppresses separator for null rows
* out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh']
* @endcode
*
* @throw cudf::logic_error if no input columns are specified - table view is empty
Expand All @@ -108,6 +124,8 @@ std::unique_ptr<column> join_strings(
* @param col_narep String that should be used in place of any null strings
* found in any column. Default of invalid-scalar means no null column value replacements.
* Default is an invalid string.
* @param separate_nulls If YES, then the separator is included for null rows
* if `col_narep` is valid.
* @param mr Resource for allocating device memory.
* @return New column with concatenated results.
*/
Expand All @@ -116,15 +134,9 @@ std::unique_ptr<column> concatenate(
strings_column_view const& separators,
string_scalar const& separator_narep = string_scalar("", false),
string_scalar const& col_narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @addtogroup strings_combine
* @{
* @file strings/combine.hpp
* @brief Strings APIs for concatenate and join
*/

/**
* @brief Row-wise concatenates the given list of strings columns and
* returns a single strings column result.
Expand All @@ -136,59 +148,77 @@ std::unique_ptr<column> concatenate(
* row to be null entry unless a narep string is specified to be used
* in its place.
*
* The number of strings in the columns provided must be the same.
* If @p separate_nulls is set to `NO` and @p narep is valid then
* separators are not added to the output between null elements.
* Otherwise, separators are always added if @p narep is valid.
*
* More than one column must be specified in the input @p strings_columns
* table.
*
* @code{.pseudo}
* Example:
* s1 = ['aa', null, '', 'aa']
* s2 = ['', 'bb', 'bb', null]
* r1 = concatenate([s1,s2])
* r1 is ['aa', null, 'bb', null]
* r2 = concatenate([s1,s2],':','_')
* r2 is ['aa:', '_:bb', ':bb', 'aa:_']
* s1 = ['aa', null, '', 'dd']
* s2 = ['', 'bb', 'cc', null]
* out = concatenate({s1, s2})
* out is ['aa', null, 'cc', null]
*
* out = concatenate({s1, s2}, ':', '_')
* out is ['aa:', '_:bb', ':cc', 'dd:_']
*
* out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO)
* out is ['aa:', 'bb', ':cc', 'dd']
* @endcode
*
* @throw cudf::logic_error if input columns are not all strings columns.
* @throw cudf::logic_error if separator is not valid.
* @throw cudf::logic_error if only one column is specified
*
* @param strings_columns List of string columns to concatenate.
* @param separator String that should be inserted between each string from each row.
* Default is an empty string.
* @param narep String that should be used in place of any null strings
* found in any column. Default of invalid-scalar means any null entry in any column will
* produce a null result for that row.
* @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with concatenated results.
*/
std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the row separator provided in the `separators` strings column.
* delimited by the row separator provided in the @p separators strings column.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null element will result in the corresponding output row to be null unless a valid
* `string_narep` scalar is provided to be used in its place. Any null row in the `separators`
* column will also result in a null output row unless a valid `separator_narep` scalar is provided
* @p string_narep scalar is provided to be used in its place. Any null row in the @p separators
* column will also result in a null output row unless a valid @p separator_narep scalar is provided
* to be used in place of the null separators.
*
* If @p separate_nulls is set to `NO` and @p string_narep is valid then separators are not added
* to the output between null elements. Otherwise, separators are always added if
* @p string_narep is valid.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ]
* s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ]
* sep = ['::', '%%', '!', '*', null]
*
* r1 = strings::concatenate_list_elements(s, sep)
* r1 is ['aa::bb::cc', null, '!dd', null, null]
* out = join_list_elements(s, sep)
* out is ['aa::bb::cc', null, '!dd', null, null]
*
* out = join_list_elements(s, sep, ':', '_')
* out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg']
*
* r2 = strings::concatenate_list_elements(s, sep, ':', '_')
* r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg']
* out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO)
* out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
Expand All @@ -203,36 +233,44 @@ std::unique_ptr<column> concatenate(
* @param string_narep String that should be used to replace null strings in any non-null list row,
* default is an invalid-scalar denoting that list rows containing null strings will result
* in null string in the corresponding output rows.
* @param separate_nulls If YES, then the separator is included for null rows if `string_narep`
* is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with concatenated results.
*/
std::unique_ptr<column> concatenate_list_elements(
std::unique_ptr<column> join_list_elements(
const lists_column_view& lists_strings_column,
const strings_column_view& separators,
string_scalar const& separator_narep = string_scalar("", false),
string_scalar const& string_narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the separator provided.
* delimited by the @p separator provided.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null element will result in the corresponding output row to be null unless a narep
* string is specified to be used in its place.
* having a null element will result in the corresponding output row to be null unless a
* @p narep string is specified to be used in its place.
*
* If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the
* output between null elements. Otherwise, separators are always added if @p narep is valid.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ]
* s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ]
*
* out = join_list_elements(s)
* out is ['aabbcc', null, 'dd', null, 'ff']
*
* r1 = strings::concatenate_list_elements(s)
* r1 is ['aabbcc', null, 'dd', null, 'ff']
* out = join_list_elements(s, ':', '_')
* out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff']
*
* r2 = strings::concatenate_list_elements(s, ':', '_')
* r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff']
* out = join_list_elements(s, ':', '', separator_on_nulls::NO)
* out is ['aa:bb:cc', null, ':dd', 'ee', 'ff']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
Expand All @@ -244,13 +282,15 @@ std::unique_ptr<column> concatenate_list_elements(
* @param narep String that should be used to replace null strings in any non-null list row, default
* is an invalid-scalar denoting that list rows containing null strings will result in null
* string in the corresponding output rows.
* @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with concatenated results.
*/
std::unique_ptr<column> concatenate_list_elements(
std::unique_ptr<column> join_list_elements(
const lists_column_view& lists_strings_column,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/detail/combine.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>

Expand All @@ -36,6 +37,7 @@ std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator,
string_scalar const& narep,
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
41 changes: 40 additions & 1 deletion cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,44 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
groupby::~groupby() = default;

namespace {

/**
* @brief Factory to construct empty result columns.
*
* Adds special handling for COLLECT_LIST/COLLECT_SET, because:
* 1. `make_empty_column()` does not support construction of nested columns.
* 2. Empty lists need empty child columns, to persist type information.
*/
struct empty_column_constructor {
column_view values;

template <typename ValuesType, aggregation::Kind k>
std::unique_ptr<cudf::column> operator()() const
{
using namespace cudf;
using namespace cudf::detail;

if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) {
return make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), empty_like(values), 0, {});
}

// If `values` is LIST typed, and the aggregation results match the type,
// construct empty results based on `values`.
// Most generally, this applies if input type matches output type.
//
// Note: `target_type_t` is not recursive, and `ValuesType` does not consider children.
// It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this
// point, because `COLLECT_LIST(LIST)` produces `LIST<LIST>`, but `target_type_t`
// wouldn't know the difference.
if constexpr (std::is_same_v<target_type_t<ValuesType, k>, ValuesType>) {
return empty_like(values);
}

return make_empty_column(target_type(values.type(), k));
}
};

/// Make an empty table with appropriate types for requested aggs
auto empty_results(host_span<aggregation_request const> requests)
{
Expand All @@ -93,7 +131,8 @@ auto empty_results(host_span<aggregation_request const> requests)
request.aggregations.end(),
std::back_inserter(results),
[&request](auto const& agg) {
return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind));
return cudf::detail::dispatch_type_and_aggregation(
request.values.type(), agg->kind, empty_column_constructor{request.values});
});

return aggregation_result{std::move(results)};
Expand Down
Loading

0 comments on commit 000f62b

Please sign in to comment.