Optimize compaction operations #10030

Merged · 55 commits · Feb 2, 2022
Changes from 26 commits
2f33e04
Rename existing compaction APIs
PointKernel Jan 12, 2022
3212706
Merge remote-tracking branch 'upstream/branch-22.02' into optimize-co…
PointKernel Jan 12, 2022
7ce9549
Update cython code to accommodate renaming
PointKernel Jan 12, 2022
5c5a415
Update copyrights
PointKernel Jan 12, 2022
0a62ade
Refactor unordered_distinct_count with hash-based algorithms
PointKernel Jan 12, 2022
e6acd8f
Merge remote-tracking branch 'upstream/branch-22.02' into optimize-co…
PointKernel Jan 12, 2022
fba851c
Refactor unordered_drop_duplicates with hash-based algorithms
PointKernel Jan 13, 2022
05ee85f
Update cython code
PointKernel Jan 13, 2022
8ab22a4
Optimize distinct count: insert valid rows only if nulls are equal
PointKernel Jan 13, 2022
bba7b57
Fill column via mutable view + update comments
PointKernel Jan 13, 2022
6746f28
Minor corrections
PointKernel Jan 13, 2022
70292bc
Update benchmarks and unit tests
PointKernel Jan 13, 2022
46d83b9
Add reminder for further optimization in distinct count
PointKernel Jan 13, 2022
f07d3d0
Fix transform test failure
PointKernel Jan 13, 2022
8fcfbae
Fix dictionary test failures
PointKernel Jan 13, 2022
a8fe478
Merge remote-tracking branch 'upstream/branch-22.04' into optimize-co…
PointKernel Jan 13, 2022
f2ac25d
Add sort-based implementations back to the repo
PointKernel Jan 14, 2022
0ed5712
Update copyright
PointKernel Jan 14, 2022
e372144
Add consecutive distinct_count
PointKernel Jan 14, 2022
fc57b29
Remove nan control in distinct_count
PointKernel Jan 15, 2022
d810e0b
Update unit tests
PointKernel Jan 15, 2022
40cc410
Add nan handling to distinct_count + update unit tests
PointKernel Jan 17, 2022
dd91e64
Rename drop_duplicates as sort_and_drop_duplicates
PointKernel Jan 17, 2022
d80911c
Add consecutive drop_duplicates
PointKernel Jan 17, 2022
7ced995
Optimize unordered_distinct_count: insert non-null rows only to impro…
PointKernel Jan 17, 2022
2ea5d8e
Update cuco git tag
PointKernel Jan 17, 2022
4bb7b16
Silence unused argument warning via function prototyping
PointKernel Jan 18, 2022
012ca8b
Refactor compaction benchmark with nvbench
PointKernel Jan 18, 2022
d489e2e
Update copyright
PointKernel Jan 18, 2022
3e47ffd
Get rid of nvbench primitive types
PointKernel Jan 19, 2022
a0a10e5
Update docs & comments
PointKernel Jan 19, 2022
5fb92c7
Address review comments
PointKernel Jan 19, 2022
3af4fd0
Address more review comments
PointKernel Jan 19, 2022
a587511
Split tests
PointKernel Jan 19, 2022
b062eb5
Use null masks in tests
PointKernel Jan 19, 2022
a5f881f
Split benchmarks
PointKernel Jan 19, 2022
df36e77
Fix a bug + update tests
PointKernel Jan 21, 2022
20ed6ea
Update docs
PointKernel Jan 21, 2022
e401690
Merge remote-tracking branch 'upstream/branch-22.04' into optimize-co…
PointKernel Jan 21, 2022
ecc1d7e
Add should_check_nan predicate to avoid unnecessary type-dispatching
PointKernel Jan 21, 2022
a151443
Rename benchmark according to benchmarking guide
PointKernel Jan 21, 2022
024d7e0
Remove std::unique-like drop_duplicates
PointKernel Jan 24, 2022
b6c1634
Style fixing
PointKernel Jan 24, 2022
3ad0f76
Fix test failures: sort the output
PointKernel Jan 24, 2022
58f6cb6
Minor cleanups
PointKernel Jan 24, 2022
e381815
Minor cleanup
PointKernel Jan 24, 2022
fa796aa
Address review comments
PointKernel Jan 24, 2022
3915134
Merge remote-tracking branch 'upstream/branch-22.04' into optimize-co…
PointKernel Jan 25, 2022
118468e
Address review comments
PointKernel Jan 27, 2022
d1535d5
Simplify if logic
PointKernel Jan 27, 2022
0b0d015
Minor updates
PointKernel Jan 27, 2022
906f469
Add early exit
PointKernel Jan 27, 2022
c8a3e87
Fix cuco pair issues with the latest cuco tag
PointKernel Jan 28, 2022
070d5ce
Address review comments
PointKernel Feb 2, 2022
a60c128
Address review + update comments
PointKernel Feb 2, 2022
66 changes: 50 additions & 16 deletions cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -27,9 +27,14 @@

class Compaction : public cudf::benchmark {
};
class HashCompaction : public cudf::benchmark {
};

enum class algorithm { SORT_BASED, HASH_BASED };

template <typename Type>
void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep)
template <typename Type, algorithm Algo>
void BM_compaction(benchmark::State& state,
cudf::duplicate_keep_option keep = cudf::duplicate_keep_option::KEEP_FIRST)
{
auto const n_rows = static_cast<cudf::size_type>(state.range(0));

@@ -45,34 +50,63 @@ void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep)

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::drop_duplicates(input_table, {0}, keep);
auto const result = [&]() {
if constexpr (Algo == algorithm::HASH_BASED) {
return cudf::unordered_drop_duplicates(input_table, {0});
} else {
return cudf::sort_and_drop_duplicates(input_table, {0}, keep);
}
}();
}
}

#define concat(a, b, c) a##b##c
#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op

// TYPE, OP
#define RBM_BENCHMARK_DEFINE(name, type, keep) \
BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \
{ \
BM_compaction<type>(state, get_keep(keep)); \
} \
BENCHMARK_REGISTER_F(Compaction, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
#define SORT_BENCHMARK_DEFINE(name, type, keep) \
BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \
{ \
BM_compaction<type, algorithm::SORT_BASED>(state, get_keep(keep)); \
} \
BENCHMARK_REGISTER_F(Compaction, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */

#define COMPACTION_BENCHMARK_DEFINE(type, keep) \
RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep)
SORT_BENCHMARK_DEFINE(concat(type, _, keep), type, keep)

// TYPE
#define HASH_BENCHMARK_DEFINE(type) \
BENCHMARK_DEFINE_F(HashCompaction, type)(::benchmark::State & state) \
{ \
BM_compaction<type, algorithm::HASH_BASED>(state); \
} \
BENCHMARK_REGISTER_F(HashCompaction, type) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */

#define HASH_COMPACTION_BENCHMARK_DEFINE(type) HASH_BENCHMARK_DEFINE(type)

using cudf::timestamp_ms;

COMPACTION_BENCHMARK_DEFINE(bool, NONE);
COMPACTION_BENCHMARK_DEFINE(int8_t, NONE);
COMPACTION_BENCHMARK_DEFINE(int32_t, NONE);
COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST);
COMPACTION_BENCHMARK_DEFINE(int32_t, LAST);
using cudf::timestamp_ms;
COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE);
COMPACTION_BENCHMARK_DEFINE(float, NONE);

HASH_COMPACTION_BENCHMARK_DEFINE(bool);
HASH_COMPACTION_BENCHMARK_DEFINE(int8_t);
HASH_COMPACTION_BENCHMARK_DEFINE(int32_t);
HASH_COMPACTION_BENCHMARK_DEFINE(int64_t);
HASH_COMPACTION_BENCHMARK_DEFINE(timestamp_ms);
HASH_COMPACTION_BENCHMARK_DEFINE(float);
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_cucollections.cmake
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -21,7 +21,7 @@ function(find_and_configure_cucollections)
cuco 0.0
GLOBAL_TARGETS cuco::cuco
CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17
GIT_TAG 922a87856aac17742fb964eeaf1b9bbc5d7a916e
OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
)

46 changes: 45 additions & 1 deletion cpp/include/cudf/detail/stream_compaction.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -67,6 +67,19 @@ std::unique_ptr<table> apply_boolean_mask(
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::sort_and_drop_duplicates
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> sort_and_drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
@@ -75,6 +88,18 @@ std::unique_ptr<table> drop_duplicates(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::unordered_drop_duplicates
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> unordered_drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy)
*
@@ -94,5 +119,24 @@ cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type unordered_distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @copydoc cudf::unordered_distinct_count(table_view const&, null_equality)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type unordered_distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

} // namespace detail
} // namespace cudf
119 changes: 105 additions & 14 deletions cpp/include/cudf/stream_compaction.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -214,7 +214,38 @@ enum class duplicate_keep_option {
};

/**
* @brief Create a new table without duplicate rows
* @brief Eliminates all except the row specified by `keep` from every consecutive group of
* equivalent rows.
*
* Given an `input` table_view, one row from a group of equivalent elements is copied to
* output table depending on the value of @p keep:
* - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
* - KEEP_LAST: only the last of a sequence of duplicate rows is copied
* - KEEP_NONE: no duplicate rows are copied
*
* @throws cudf::logic_error if The `input` row size mismatches with `keys`.
*
* @param[in] input input table_view to copy only unique rows
* @param[in] keys vector of indices representing key columns from `input`
* @param[in] keep keep first entry, last entry, or no entries if duplicates found
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
* equal if null_equality::UNEQUAL
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @return Table with unique rows from each sequence of equivalent rows as per specified `keep`.
*/
std::unique_ptr<table> drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows.
*
* The output table is sorted according to the lexicographic ordering of the `keys` rows.
*
* Given an `input` table_view, each row is copied to output table if the corresponding
* row of `keys` columns is unique, where the definition of unique depends on the value of @p keep:
@@ -233,9 +264,9 @@ enum class duplicate_keep_option {
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @return Table with unique rows as per specified `keep`.
* @return Table with sorted unique rows as per specified `keep`.
*/
std::unique_ptr<table> drop_duplicates(
std::unique_ptr<table> sort_and_drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
@@ -244,37 +275,97 @@ std::unique_ptr<table> drop_duplicates(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Count the unique elements in the column_view
* @brief Create a new table without duplicate rows with hash-based algorithms.
*
* Given an input column_view, number of unique elements in this column_view is returned
* Given an `input` table_view, each row is copied to output table if the corresponding
* row of `keys` columns is unique. If duplicate rows are present, it is unspecified which
* row is copied.
*
* Elements in the output table are in a random order.
*
* @throws cudf::logic_error if The `input` row size mismatches with `keys`.
*
* @param[in] input input table_view to copy only unique rows
* @param[in] keys vector of indices representing key columns from `input`
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
* equal if null_equality::UNEQUAL
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @return Table with unique rows in an unspecified order.
*/
std::unique_ptr<table> unordered_drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
null_equality nulls_equal = null_equality::EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Count the number of consecutive groups of equivalent elements in a column.
*
* If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
* `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
* `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
* count.
* `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count.
*
* `null`s are handled as equal.
*
* @param[in] input The column_view whose unique elements will be counted.
* @param[in] input View of the input column
* @param[in] null_handling flag to include or ignore `null` while counting
* @param[in] nan_handling flag to consider `NaN==null` or not.
* @param[in] nan_handling flag to consider `NaN==null` or not
*
* @return number of unique elements
* @return number of consecutive groups in the column
*/
cudf::size_type distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);

/**
* @brief Count the unique rows in a table.
* @brief Count the number of consecutive groups of equivalent elements in a table.
*
*
* @param[in] input Table whose unique rows will be counted.
* @param[in] input Table whose number of consecutive groups will be counted
* @param[in] nulls_equal flag to denote if null elements should be considered equal
* nulls are not equal if null_equality::UNEQUAL
*
* @return number of unique rows in the table
* @return number of consecutive groups in the table
*/
cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);

/**
* @brief Count the unique elements in the column_view.
*
* Given an input column_view, number of unique elements in this column_view is returned.
*
* If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
* `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
* `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
* count.
*
* `null`s are handled as equal.
*
* @param[in] input The column_view whose unique elements will be counted
* @param[in] null_handling flag to include or ignore `null` while counting
* @param[in] nan_handling flag to consider `NaN==null` or not
*
* @return number of unique elements
*/
cudf::size_type unordered_distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);

/**
* @brief Count the unique rows in a table.
*
*
* @param[in] input Table whose unique rows will be counted
* @param[in] nulls_equal flag to denote if null elements should be considered equal
* nulls are not equal if null_equality::UNEQUAL
*
* @return number of unique rows in the table
*/
cudf::size_type unordered_distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);

/** @} */
} // namespace cudf