Skip to content

Commit

Permalink
Implement mixed equality/conditional semi/anti joins (#10037)
Browse files Browse the repository at this point in the history
This PR is a follow-up to #9917 and should be merged after that PR. This resolves #9695 and resolves #5401. The implementation here is only a first pass, but in the interest of prioritizing a working feature for the upcoming release I'm postponing making various additional changes (including some breaking ones).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - David Wendt (https://github.com/davidwendt)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #10037
  • Loading branch information
vyasr authored Jan 24, 2022
1 parent 6d11823 commit b8d2969
Show file tree
Hide file tree
Showing 15 changed files with 1,854 additions and 253 deletions.
7 changes: 6 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -324,11 +324,16 @@ add_library(
src/jit/parser.cpp
src/jit/type.cpp
src/join/conditional_join.cu
src/join/mixed_join.cu
src/join/cross_join.cu
src/join/hash_join.cu
src/join/join.cu
src/join/join_utils.cu
src/join/mixed_join.cu
src/join/mixed_join_kernels.cu
src/join/mixed_join_kernels_semi.cu
src/join/mixed_join_semi.cu
src/join/mixed_join_size_kernels.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
Expand Down
187 changes: 187 additions & 0 deletions cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,109 @@ mixed_full_join(
std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns an index vector corresponding to all rows in the left tables
* where the columns of the equality table are equal and the predicate
* evaluates to true on the conditional tables.
*
* If the provided predicate returns NULL for a pair of rows (left, right), the
* left row is not included in the output. It is the user's responsiblity to
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* If the provided output size or per-row counts are incorrect, behavior is undefined.
*
* @code{.pseudo}
* left_equality: {{0, 1, 2}}
* right_equality: {{1, 2, 3}}
* left_conditional: {{4, 4, 4}}
* right_conditional: {{3, 4, 5}}
* Expression: Left.Column_0 > Right.Column_0
* Result: {1}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
* match.
*
* @param left_equality The left table used for the equality join.
* @param right_equality The right table used for the equality join.
* @param left_conditional The left table used for the conditional join.
* @param right_conditional The right table used for the conditional join.
* @param binary_predicate The condition on which to join.
* @param compare_nulls Whether or not null values join to each other or not.
* @param output_size_data An optional pair of values indicating the exact output size and the
* number of matches for each row in the larger of the two input tables, left or right (may be
* precomputed using the corresponding mixed_full_join_size API).
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a mixed full join between the four input tables.
*/
std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
table_view const& left_equality,
table_view const& right_equality,
table_view const& left_conditional,
table_view const& right_conditional,
ast::expression const& binary_predicate,
null_equality compare_nulls = null_equality::EQUAL,
std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns an index vector corresponding to all rows in the left tables
* for which there is no row in the right tables where the columns of the
* equality table are equal and the predicate evaluates to true on the
* conditional tables.
*
* If the provided predicate returns NULL for a pair of rows (left, right), the
* left row is not included in the output. It is the user's responsiblity to
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* If the provided output size or per-row counts are incorrect, behavior is undefined.
*
* @code{.pseudo}
* left_equality: {{0, 1, 2}}
* right_equality: {{1, 2, 3}}
* left_conditional: {{4, 4, 4}}
* right_conditional: {{3, 4, 5}}
* Expression: Left.Column_0 > Right.Column_0
* Result: {0, 2}
* @endcode
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
* match.
*
* @param left_equality The left table used for the equality join.
* @param right_equality The right table used for the equality join.
* @param left_conditional The left table used for the conditional join.
* @param right_conditional The right table used for the conditional join.
* @param binary_predicate The condition on which to join.
* @param compare_nulls Whether or not null values join to each other or not.
* @param output_size_data An optional pair of values indicating the exact output size and the
* number of matches for each row in the larger of the two input tables, left or right (may be
* precomputed using the corresponding mixed_full_join_size API).
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a mixed full join between the four input tables.
*/
std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
table_view const& left_equality,
table_view const& right_equality,
table_view const& left_conditional,
table_view const& right_conditional,
ast::expression const& binary_predicate,
null_equality compare_nulls = null_equality::EQUAL,
std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the exact number of matches (rows) when performing a
* mixed inner join between the specified tables where the columns of the
Expand Down Expand Up @@ -1125,6 +1228,90 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
null_equality compare_nulls = null_equality::EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the exact number of matches (rows) when performing a mixed
* left semi join between the specified tables where the columns of the
* equality table are equal and the predicate evaluates to true on the
* conditional tables.
*
* If the provided predicate returns NULL for a pair of rows (left, right),
* that pair is not included in the output. It is the user's responsiblity to
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
* match.
*
* @param left_equality The left table used for the equality join.
* @param right_equality The right table used for the equality join.
* @param left_conditional The left table used for the conditional join.
* @param right_conditional The right table used for the conditional join.
* @param binary_predicate The condition on which to join.
* @param compare_nulls Whether or not null values join to each other or not.
* @param output_size An optional pair of values indicating the exact output size and the number of
* matches for each row in the larger of the two input tables, left or right (may be precomputed
* using the corresponding mixed_inner_join_size API).
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair containing the size that would result from performing the
* requested join and the number of matches for each row in one of the two
* tables. Which of the two tables is an implementation detail and should not
* be relied upon, simply passed to the corresponding `mixed_left_join` API as
* is.
*/
std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_semi_join_size(
table_view const& left_equality,
table_view const& right_equality,
table_view const& left_conditional,
table_view const& right_conditional,
ast::expression const& binary_predicate,
null_equality compare_nulls = null_equality::EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the exact number of matches (rows) when performing a mixed
* left anti join between the specified tables.
*
* If the provided predicate returns NULL for a pair of rows (left, right),
* that pair is not included in the output. It is the user's responsiblity to
* choose a suitable compare_nulls value AND use appropriate null-safe
* operators in the expression.
*
* @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
* match.
* @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
* match.
*
* @param left_equality The left table used for the equality join.
* @param right_equality The right table used for the equality join.
* @param left_conditional The left table used for the conditional join.
* @param right_conditional The right table used for the conditional join.
* @param binary_predicate The condition on which to join.
* @param compare_nulls Whether or not null values join to each other or not.
* @param output_size An optional pair of values indicating the exact output size and the number of
* matches for each row in the larger of the two input tables, left or right (may be precomputed
* using the corresponding mixed_inner_join_size API).
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A pair containing the size that would result from performing the
* requested join and the number of matches for each row in one of the two
* tables. Which of the two tables is an implementation detail and should not
* be relied upon, simply passed to the corresponding `mixed_left_join` API as
* is.
*/
std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_anti_join_size(
table_view const& left_equality,
table_view const& right_equality,
table_view const& left_conditional,
table_view const& right_conditional,
ast::expression const& binary_predicate,
null_equality compare_nulls = null_equality::EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the exact number of matches (rows) when performing a
* conditional inner join between the specified tables where the predicate
Expand Down
7 changes: 0 additions & 7 deletions cpp/src/join/conditional_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ conditional_join(table_view const& left,
} else {
// Allocate storage for the counter used to get the size of the join output
rmm::device_scalar<std::size_t> size(0, stream, mr);
CHECK_CUDA(stream.value());
if (has_nulls) {
compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
<<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
Expand All @@ -130,7 +129,6 @@ conditional_join(table_view const& left,
swap_tables,
size.data());
}
CHECK_CUDA(stream.value());
join_size = size.value(stream);
}

Expand Down Expand Up @@ -178,8 +176,6 @@ conditional_join(table_view const& left,
swap_tables);
}

CHECK_CUDA(stream.value());

auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices));

// For full joins, get the indices in the right table that were not joined to
Expand Down Expand Up @@ -260,7 +256,6 @@ std::size_t compute_conditional_join_output_size(table_view const& left,

// Allocate storage for the counter used to get the size of the join output
rmm::device_scalar<std::size_t> size(0, stream, mr);
CHECK_CUDA(stream.value());

// Determine number of output rows without actually building the output to simply
// find what the size of the output will be.
Expand All @@ -283,8 +278,6 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
swap_tables,
size.data());
}
CHECK_CUDA(stream.value());

return size.value(stream);
}

Expand Down
6 changes: 5 additions & 1 deletion cpp/src/join/join_common_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,7 @@

#include <rmm/mr/device/polymorphic_allocator.hpp>

#include <cuco/static_map.cuh>
#include <cuco/static_multimap.cuh>

#include <limits>
Expand Down Expand Up @@ -60,6 +61,9 @@ using mixed_multimap_type = cuco::static_multimap<hash_value_type,
hash_table_allocator_type,
cuco::double_hashing<1, hash_type, hash_type>>;

using semi_map_type = cuco::
static_map<hash_value_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;

using row_hash = cudf::row_hasher<default_hash, cudf::nullate::DYNAMIC>;

using row_equality = cudf::row_equality_comparator<cudf::nullate::DYNAMIC>;
Expand Down
Loading

0 comments on commit b8d2969

Please sign in to comment.