Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cudf::lists::distinct in Java binding #11233

Merged
merged 42 commits into from
Jul 19, 2022
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
1d7e8e0
Add new implementation and test files
ttnghia Jun 24, 2022
51b80db
Fix compile error
ttnghia Jun 24, 2022
08a76ad
Rename function
ttnghia Jun 27, 2022
16101f7
Implement `cudf::detail::stable_distinct` and `lists::distinct`
ttnghia Jun 27, 2022
5ec13d6
Rewrite doxygen
ttnghia Jun 27, 2022
6c5b738
Rename variable
ttnghia Jun 27, 2022
5b70eee
Rewrite comment
ttnghia Jun 27, 2022
238248d
Rename files
ttnghia Jun 27, 2022
ba6bf6b
Implement float tests
ttnghia Jun 27, 2022
3845c95
Implement string tests
ttnghia Jun 27, 2022
507c82d
Implement tests for `ListDistinctTypedTest`
ttnghia Jun 28, 2022
2cb8347
Complete the remaining tests
ttnghia Jun 28, 2022
7efdea0
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 28, 2022
4388637
Rewrite doxygen
ttnghia Jun 28, 2022
4dd5e74
Misc
ttnghia Jun 28, 2022
3b0760c
Misc
ttnghia Jun 28, 2022
9730b70
Rewrite test
ttnghia Jun 28, 2022
9bd9b6f
Fix doxygen
ttnghia Jun 28, 2022
790a482
Fix header
ttnghia Jun 28, 2022
1c58baa
Rewrite doxygen
ttnghia Jun 28, 2022
d493c4f
Rewrite doxygen and fix headers
ttnghia Jun 28, 2022
d090d2a
Fix iterator type
ttnghia Jun 30, 2022
ee51822
Rewrite doxygen
ttnghia Jun 30, 2022
ccdd6f0
Add empty lines
ttnghia Jun 30, 2022
034ee2a
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 30, 2022
b1231a2
Update default stream
ttnghia Jun 30, 2022
af91b80
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 5, 2022
86c9ba8
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 8, 2022
99d70b1
Handle empty input
ttnghia Jul 8, 2022
555eacc
Merge branch 'add_lists_distinct' into use_lists_distinct_in_java
ttnghia Jul 8, 2022
6270714
Use `lists::distinct` in `dropListDuplicates`
ttnghia Jul 8, 2022
6573d5a
Implement `lists_distinct_by_key` in JNI
ttnghia Jul 8, 2022
1a9207f
Merge branch 'branch-22.08' into use_lists_distinct_in_java
ttnghia Jul 12, 2022
010e455
Rewrite `lists_distinct_by_key`
ttnghia Jul 12, 2022
9423f88
Write doxygen
ttnghia Jul 12, 2022
c95af16
Fix bug
ttnghia Jul 12, 2022
0a173a0
Handle empty input
ttnghia Jul 12, 2022
0abf02c
Misc
ttnghia Jul 14, 2022
af44430
Merge branch 'branch-22.08' into use_lists_distinct_in_java
ttnghia Jul 14, 2022
c4f010a
Rewrite doxygen
ttnghia Jul 18, 2022
96eb628
Add sliced input tests
ttnghia Jul 18, 2022
d3dc961
Merge branch 'branch-22.08' into use_lists_distinct_in_java
ttnghia Jul 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 12 additions & 55 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include <cudf/lists/contains.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/detail/concatenate.hpp>
#include <cudf/lists/drop_list_duplicates.hpp>
#include <cudf/lists/extract.hpp>
#include <cudf/lists/gather.hpp>
#include <cudf/lists/lists_column_view.hpp>
Expand Down Expand Up @@ -463,9 +462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicates(JNIEnv
JNI_NULL_CHECK(env, column_view, "column is null", 0);
try {
cudf::jni::auto_set_device(env);
cudf::column_view const *cv = reinterpret_cast<cudf::column_view const *>(column_view);
cudf::lists_column_view lcv(*cv);
return release_as_jlong(cudf::lists::drop_list_duplicates(lcv));
auto const input_cv = reinterpret_cast<cudf::column_view const *>(column_view);
return release_as_jlong(cudf::lists::distinct(cudf::lists_column_view{*input_cv}));
}
CATCH_STD(env, 0);
}
Expand All @@ -476,59 +474,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicatesWithKey
try {
cudf::jni::auto_set_device(env);
auto const input_cv = reinterpret_cast<cudf::column_view const *>(keys_vals_handle);
CUDF_EXPECTS(input_cv->offset() == 0, "Input column has non-zero offset.");
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
CUDF_EXPECTS(input_cv->type().id() == cudf::type_id::LIST,
"Input column is not a lists column.");
JNI_ARG_CHECK(env, input_cv->type().id() == cudf::type_id::LIST,
"Input column is not a lists column.", 0);

// Extract list offsets and a column of struct<keys, values> from the input lists column.
auto const lists_keys_vals = cudf::lists_column_view(*input_cv);
auto const keys_vals = lists_keys_vals.get_sliced_child(cudf::default_stream_value);
CUDF_EXPECTS(keys_vals.type().id() == cudf::type_id::STRUCT,
"Input column has child that is not a structs column.");
CUDF_EXPECTS(keys_vals.num_children() == 2,
"Input column has child that does not have 2 children.");

auto const lists_offsets = lists_keys_vals.offsets();
auto const structs_keys_vals = cudf::structs_column_view(keys_vals);

// Assemble a lists_column_view from the existing data (offsets + child).
// This will not copy any data, just create a view, for performance reason.
auto const make_lists_view = [&input_cv](auto const &offsets, auto const &child) {
return cudf::lists_column_view(
cudf::column_view(cudf::data_type{input_cv->type()}, input_cv->size(), nullptr,
input_cv->null_mask(), input_cv->null_count(), 0, {offsets, child}));
};

// Extract keys and values lists columns from the input lists of structs column.
auto const keys = make_lists_view(lists_offsets, structs_keys_vals.child(0));
auto const vals = make_lists_view(lists_offsets, structs_keys_vals.child(1));

// Apache Spark desires to keep the last duplicate element.
auto [out_keys, out_vals] =
cudf::lists::drop_list_duplicates(keys, vals, cudf::duplicate_keep_option::KEEP_LAST);

// Release the contents of the outputs.
auto out_keys_content = out_keys->release();
auto out_vals_content = out_vals->release();

// Total number of elements in the child column.
// This should be the same for the out_vals column.
auto const out_child_size =
out_keys_content.children[cudf::lists_column_view::child_column_index]->size();

// Assemble a lists column of struct<out_keys, out_vals> for the final output.
auto out_structs_members = std::vector<std::unique_ptr<cudf::column>>();
out_structs_members.emplace_back(
std::move(out_keys_content.children[cudf::lists_column_view::child_column_index]));
out_structs_members.emplace_back(
std::move(out_vals_content.children[cudf::lists_column_view::child_column_index]));
auto &out_offsets = out_keys_content.children[cudf::lists_column_view::offsets_column_index];

auto out_structs =
cudf::make_structs_column(out_child_size, std::move(out_structs_members), 0, {});
return release_as_jlong(cudf::make_lists_column(input_cv->size(), std::move(out_offsets),
std::move(out_structs), input_cv->null_count(),
cudf::copy_bitmask(*input_cv)));
auto const keys_vals = lists_keys_vals.child();
JNI_ARG_CHECK(env, keys_vals.type().id() == cudf::type_id::STRUCT,
"Input column has child that is not a structs column.", 0);
JNI_ARG_CHECK(env, keys_vals.num_children() == 2,
"Input column has child that does not have 2 children.", 0);

return release_as_jlong(
cudf::jni::lists_distinct_by_key(lists_keys_vals, cudf::default_stream_value));
}
CATCH_STD(env, 0);
}
Expand Down
54 changes: 54 additions & 0 deletions java/src/main/native/src/ColumnViewJni.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,16 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/labeling/label_segments.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/scan.h>

Expand Down Expand Up @@ -72,4 +80,50 @@ std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const &lis

return offsets_column;
}

/**
 * @brief Builds a lists column of <key, value> structs in which the elements of each list are
 * distinct by key.
 *
 * The child of `input` is accessed as a structs column: its child 0 is used as the keys and its
 * child 1 as the values. This structure is not validated here; the caller must guarantee it.
 *
 * @param input Lists column whose elements are structs of <key, value> pairs.
 * @param stream CUDA stream passed to every allocation and device operation issued here.
 * @return A new lists column with the same number of rows, null count and null mask as `input`.
 */
std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const &input,
                                                    rmm::cuda_stream_view stream) {
  // Nothing to deduplicate: return an empty column of the same type.
  if (input.is_empty()) {
    return empty_like(input.parent());
  }

  // Use the sliced child so that only the elements referenced by this (possibly sliced) view
  // are processed.
  auto const child = input.get_sliced_child(stream);

  // Generate labels for the input list elements.
  // The labels column is later used as an extra key column, so that equal keys located in
  // different lists are never treated as duplicates of each other.
  auto labels = rmm::device_uvector<cudf::size_type>(child.size(), stream);
  cudf::detail::label_segments(input.offsets_begin(), input.offsets_end(), labels.begin(),
                               labels.end(), stream);

  // Use `cudf::duplicate_keep_option::KEEP_LAST` so this will produce the desired behavior when
  // being called in `create_map` in spark-rapids.
  // Other options comparing nulls and NaNs are set as all-equal.
  auto out_columns = cudf::detail::stable_distinct(
                         table_view{{column_view{cudf::device_span<cudf::size_type const>{labels}},
                                     child.child(0), child.child(1)}}, // input table
                         std::vector<size_type>{0, 1},                 // key columns
                         cudf::duplicate_keep_option::KEEP_LAST, cudf::null_equality::EQUAL,
                         cudf::nan_equality::ALL_EQUAL, stream)
                         ->release();

  // Column 0 of the result holds the labels of the surviving elements. It stays owned by
  // `out_columns` (only columns 1 and 2 are moved out below), so this view remains valid.
  auto const out_labels = out_columns.front()->view();

  // Assemble a structs column of <out_keys, out_vals>.
  auto out_structs_members = std::vector<std::unique_ptr<cudf::column>>();
  out_structs_members.emplace_back(std::move(out_columns[1]));
  out_structs_members.emplace_back(std::move(out_columns[2]));
  auto out_structs =
      cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {});

  // Assemble a lists column of structs<out_keys, out_vals>.
  // The output offsets (one more entry than the number of rows) are rebuilt from the labels of
  // the surviving elements.
  auto out_offsets = make_numeric_column(data_type{type_to_id<offset_type>()}, input.size() + 1,
                                         mask_state::UNALLOCATED, stream);
  auto const offsets_begin = out_offsets->mutable_view().template begin<offset_type>();
  auto const labels_begin = out_labels.template begin<offset_type>();
  cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin,
                                  offsets_begin + out_offsets->size(), stream);

  // Row-level nulls are carried over unchanged from the input.
  return cudf::make_lists_column(input.size(), std::move(out_offsets), std::move(out_structs),
                                 input.null_count(),
                                 cudf::detail::copy_bitmask(input.parent(), stream), stream);
}

} // namespace cudf::jni
17 changes: 17 additions & 0 deletions java/src/main/native/src/ColumnViewJni.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <rmm/cuda_stream_view.hpp>

Expand Down Expand Up @@ -52,4 +53,20 @@ std::unique_ptr<cudf::column>
generate_list_offsets(cudf::column_view const &list_length,
rmm::cuda_stream_view stream = cudf::default_stream_value);

/**
 * @brief Generates a lists column by copying, from each row of the input lists column, only the
 * list elements that are distinct by key.
 *
 * Each list element of the input must be a struct holding a <key, value> pair. In every output
 * list, the keys of all elements are distinct (i.e., any two keys always compare unequal).
 *
 * The input is not validated. The caller is responsible for making sure that the input lists
 * column has the right structure.
 *
 * @param input The lists column whose elements are structs of <key, value> pairs.
 * @param stream The CUDA stream view handed to the implementation.
 * @return A new lists column in which the elements of each list are distinct by key.
 */
std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const &input,
rmm::cuda_stream_view stream);

} // namespace cudf::jni