Skip to content

Commit

Permalink
Implement lists::stable_sort_lists for stable sorting of elements w…
Browse files Browse the repository at this point in the history
…ithin each row of lists column (#9425)

This PR adds `lists::stable_sort_lists`, which sorts the elements within each row of a lists column using a stable sort. This is necessary for implementing `lists::drop_list_duplicates` operating on keys-values column inputs, where we want to remove the values corresponding to duplicate keys with the `KEEP_FIRST` or `KEEP_LAST` option.

To implement `lists::stable_sort_lists`, stable versions of `segmented_sorted_order` and `segmented_sort_by_key` have also been implemented; they maintain the relative order of equally-compared elements within each segment.

This PR blocks #9345.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - MithunR (https://github.com/mythrocks)

URL: #9425
  • Loading branch information
ttnghia authored Oct 18, 2021
1 parent 399d5b5 commit 823958b
Show file tree
Hide file tree
Showing 7 changed files with 389 additions and 105 deletions.
27 changes: 27 additions & 0 deletions cpp/include/cudf/detail/sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,19 @@ std::unique_ptr<column> segmented_sorted_order(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::stable_segmented_sorted_order
*
* Stream-accepting counterpart of the public API: the public overload forwards its arguments
* to this function together with `rmm::cuda_stream_default`.
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> stable_segmented_sorted_order(
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::segmented_sort_by_key
*
Expand All @@ -90,6 +103,20 @@ std::unique_ptr<table> segmented_sort_by_key(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::stable_segmented_sort_by_key
*
* Stream-accepting counterpart of the public API: the public overload forwards its arguments
* to this function together with `rmm::cuda_stream_default`.
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> stable_segmented_sort_by_key(
table_view const& values,
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::sort
*
Expand Down
13 changes: 13 additions & 0 deletions cpp/include/cudf/lists/detail/sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ std::unique_ptr<column> sort_lists(
null_order null_precedence,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::stable_sort_lists
*
* Stream-accepting counterpart of the public API: the public overload forwards its arguments
* to this function together with `rmm::cuda_stream_default`. Note that `stream` has no default
* value here, so callers must always pass it explicitly.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> stable_sort_lists(
lists_column_view const& input,
order column_order,
null_order null_precedence,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace lists
} // namespace cudf
12 changes: 12 additions & 0 deletions cpp/include/cudf/lists/sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ std::unique_ptr<column> sort_lists(
null_order null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Segmented sort of the elements within a list in each row of a list column using stable
* sort.
*
* Because the sort is stable, elements that compare equal keep their original relative order
* within each list.
*
* @copydoc cudf::lists::sort_lists
*/
std::unique_ptr<column> stable_sort_lists(
lists_column_view const& source_column,
order column_order,
null_order null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace lists
} // namespace cudf
25 changes: 25 additions & 0 deletions cpp/include/cudf/sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,18 @@ std::unique_ptr<column> segmented_sorted_order(
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns sorted order after stably sorting each segment in the table.
*
* Because the sort is stable, rows that compare equal keep their original relative order
* within each segment.
*
* @copydoc cudf::segmented_sorted_order
*/
std::unique_ptr<column> stable_segmented_sorted_order(
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Performs a lexicographic segmented sort of a table
*
Expand Down Expand Up @@ -241,5 +253,18 @@ std::unique_ptr<table> segmented_sort_by_key(
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Performs a stable lexicographic segmented sort of a table.
*
* Because the sort is stable, rows whose keys compare equal keep their original relative order
* within each segment.
*
* @copydoc cudf::segmented_sort_by_key
*/
std::unique_ptr<table> stable_segmented_sort_by_key(
table_view const& values,
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
44 changes: 44 additions & 0 deletions cpp/src/lists/segmented_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,40 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
input.null_count(),
std::move(null_mask));
}

/**
 * @brief Stably sorts the elements within each row of a lists column.
 *
 * The result is assembled from three parts: the input offsets rebased so that the first offset
 * becomes zero, the sliced child column stably sorted per list segment, and a copy of the
 * input's null mask.
 *
 * @param input Lists column whose rows are to be sorted.
 * @param column_order Sort order forwarded to the segmented sort of the child elements.
 * @param null_precedence Null ordering forwarded to the segmented sort of the child elements.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return A new lists column with the elements of each row stably sorted, or an empty column
 * like `input` when `input` is empty.
 */
std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                          order column_order,
                                          null_order null_precedence,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) { return empty_like(input.parent()); }

  // Rebase every offset against the first one so the output offsets start at zero
  // (presumably to line up with the sliced child fetched below).
  auto output_offset = make_numeric_column(
    input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr);
  thrust::transform(rmm::exec_policy(stream),
                    input.offsets_begin(),
                    input.offsets_end(),
                    output_offset->mutable_view().begin<size_type>(),
                    [first = input.offsets_begin()] __device__(auto offset) {
                      return offset - *first;
                    });

  auto const child = input.get_sliced_child(stream);

  // The child serves as both the values and the keys: it is sorted by its own contents.
  // The stream-accepting overload lives in `cudf::detail`; qualify it explicitly instead of
  // relying on argument-dependent lookup to reach it from `cudf::lists::detail`.
  auto const sorted_child_table =
    cudf::detail::stable_segmented_sort_by_key(table_view{{child}},
                                               table_view{{child}},
                                               output_offset->view(),
                                               {column_order},
                                               {null_precedence},
                                               stream,
                                               mr);

  return make_lists_column(input.size(),
                           std::move(output_offset),
                           std::move(sorted_child_table->release().front()),
                           input.null_count(),
                           cudf::detail::copy_bitmask(input.parent(), stream, mr));
}
} // namespace detail

std::unique_ptr<column> sort_lists(lists_column_view const& input,
Expand All @@ -279,5 +313,15 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
return detail::sort_lists(input, column_order, null_precedence, rmm::cuda_stream_default, mr);
}

/**
 * @brief Public entry point for stable sorting of the elements within each list row.
 *
 * Opens a profiling range for the function and delegates to `detail::stable_sort_lists` on the
 * default CUDA stream.
 */
std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                          order column_order,
                                          null_order null_precedence,
                                          rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  rmm::cuda_stream_view const default_stream = rmm::cuda_stream_default;
  return detail::stable_sort_lists(input, column_order, null_precedence, default_stream, mr);
}

} // namespace lists
} // namespace cudf
168 changes: 132 additions & 36 deletions cpp/src/sort/segmented_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,26 @@
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sorting.hpp>
#include <cudf/lists/list_device_view.cuh>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table_device_view.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/logical.h>

#include <algorithm>
#include <iterator>
#include <memory>
#include <type_traits>

namespace cudf {
namespace detail {

namespace {
/**
* @brief The enum specifying which sorting method to use (stable or unstable).
*
* In the shared segmented-sort implementation of this file, `STABLE` selects
* `detail::stable_sorted_order` and `UNSTABLE` selects `detail::sorted_order`.
*/
enum class sort_method { STABLE, UNSTABLE };

// returns segment indices for each element for all segments.
// first segment begin index = 0, last segment end index = num_rows.
rmm::device_uvector<size_type> get_segment_indices(size_type num_rows,
Expand All @@ -65,12 +55,14 @@ rmm::device_uvector<size_type> get_segment_indices(size_type num_rows,
return segment_ids;
}

std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<column> segmented_sorted_order_common(
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
sort_method sorting,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(segment_offsets.type() == data_type(type_to_id<size_type>()),
"segment offsets should be size_type");
Expand All @@ -95,26 +87,39 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
};
auto child_column_order = prepend_default(column_order, order::ASCENDING);
auto child_null_precedence = prepend_default(null_precedence, null_order::AFTER);

// return sorted order of child columns
return detail::sorted_order(segid_keys, child_column_order, child_null_precedence, stream, mr);
return sorting == sort_method::STABLE
? detail::stable_sorted_order(
segid_keys, child_column_order, child_null_precedence, stream, mr)
: detail::sorted_order(
segid_keys, child_column_order, child_null_precedence, stream, mr);
}

std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<table> segmented_sort_by_key_common(table_view const& values,
table_view const& keys,
column_view const& segment_offsets,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
sort_method sorting,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(values.num_rows() == keys.num_rows(),
"Mismatch in number of rows for values and keys");
auto sorted_order = segmented_sorted_order(keys,
segment_offsets,
column_order,
null_precedence,
stream,
rmm::mr::get_current_device_resource());
auto sorted_order = sorting == sort_method::STABLE
? stable_segmented_sorted_order(keys,
segment_offsets,
column_order,
null_precedence,
stream,
rmm::mr::get_current_device_resource())
: segmented_sorted_order(keys,
segment_offsets,
column_order,
null_precedence,
stream,
rmm::mr::get_current_device_resource());

// Gather segmented sort of child value columns`
return detail::gather(values,
Expand All @@ -124,8 +129,87 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
stream,
mr);
}

} // namespace

/**
 * @brief Unstable segmented sorted order.
 *
 * Thin wrapper that runs the shared implementation with `sort_method::UNSTABLE`.
 */
std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                               column_view const& segment_offsets,
                                               std::vector<order> const& column_order,
                                               std::vector<null_order> const& null_precedence,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
{
  constexpr auto method = sort_method::UNSTABLE;
  return segmented_sorted_order_common(
    keys, segment_offsets, column_order, null_precedence, method, stream, mr);
}

/**
 * @brief Stable segmented sorted order.
 *
 * Thin wrapper that runs the shared implementation with `sort_method::STABLE`.
 */
std::unique_ptr<column> stable_segmented_sorted_order(
  table_view const& keys,
  column_view const& segment_offsets,
  std::vector<order> const& column_order,
  std::vector<null_order> const& null_precedence,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  constexpr auto method = sort_method::STABLE;
  return segmented_sorted_order_common(
    keys, segment_offsets, column_order, null_precedence, method, stream, mr);
}

/**
 * @brief Unstable segmented sort of `values` by `keys`.
 *
 * Thin wrapper that runs the shared implementation with `sort_method::UNSTABLE`.
 */
std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                             table_view const& keys,
                                             column_view const& segment_offsets,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
{
  constexpr auto method = sort_method::UNSTABLE;
  return segmented_sort_by_key_common(
    values, keys, segment_offsets, column_order, null_precedence, method, stream, mr);
}

/**
 * @brief Stable segmented sort of `values` by `keys`.
 *
 * Thin wrapper that runs the shared implementation with `sort_method::STABLE`.
 */
std::unique_ptr<table> stable_segmented_sort_by_key(
  table_view const& values,
  table_view const& keys,
  column_view const& segment_offsets,
  std::vector<order> const& column_order,
  std::vector<null_order> const& null_precedence,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  constexpr auto method = sort_method::STABLE;
  return segmented_sort_by_key_common(values,
                                      keys,
                                      segment_offsets,
                                      column_order,
                                      null_precedence,
                                      method,
                                      stream,
                                      mr);
}

} // namespace detail

/**
 * @brief Public entry point for the (unstable) segmented sorted order.
 *
 * Opens a profiling range for the function and delegates to `detail::segmented_sorted_order`
 * on the default CUDA stream.
 */
std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                               column_view const& segment_offsets,
                                               std::vector<order> const& column_order,
                                               std::vector<null_order> const& null_precedence,
                                               rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  rmm::cuda_stream_view const default_stream = rmm::cuda_stream_default;
  return detail::segmented_sorted_order(
    keys, segment_offsets, column_order, null_precedence, default_stream, mr);
}

/**
 * @brief Public entry point for the stable segmented sorted order.
 *
 * Opens a profiling range for the function and delegates to
 * `detail::stable_segmented_sorted_order` on the default CUDA stream.
 */
std::unique_ptr<column> stable_segmented_sorted_order(
  table_view const& keys,
  column_view const& segment_offsets,
  std::vector<order> const& column_order,
  std::vector<null_order> const& null_precedence,
  rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  rmm::cuda_stream_view const default_stream = rmm::cuda_stream_default;
  return detail::stable_segmented_sorted_order(
    keys, segment_offsets, column_order, null_precedence, default_stream, mr);
}

std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
table_view const& keys,
column_view const& segment_offsets,
Expand All @@ -138,4 +222,16 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
values, keys, segment_offsets, column_order, null_precedence, rmm::cuda_stream_default, mr);
}

/**
 * @brief Public entry point for the stable segmented sort of `values` by `keys`.
 *
 * Opens a profiling range for the function and delegates to
 * `detail::stable_segmented_sort_by_key` on the default CUDA stream.
 */
std::unique_ptr<table> stable_segmented_sort_by_key(
  table_view const& values,
  table_view const& keys,
  column_view const& segment_offsets,
  std::vector<order> const& column_order,
  std::vector<null_order> const& null_precedence,
  rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  rmm::cuda_stream_view const default_stream = rmm::cuda_stream_default;
  return detail::stable_segmented_sort_by_key(
    values, keys, segment_offsets, column_order, null_precedence, default_stream, mr);
}

} // namespace cudf
Loading

0 comments on commit 823958b

Please sign in to comment.