nv-legate · manopapad · Aug 1, 2023 · Jul 31, 2023 · Jul 31, 2023 · Aug 1, 2023
diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake
@@ -212,10 +212,11 @@ if(Legion_USE_OpenMP)
     src/cunumeric/search/argwhere_omp.cc
     src/cunumeric/search/nonzero_omp.cc
     src/cunumeric/set/unique_omp.cc
+    src/cunumeric/set/unique_reduce_omp.cc
     src/cunumeric/stat/bincount_omp.cc
     src/cunumeric/convolution/convolve_omp.cc
     src/cunumeric/transform/flip_omp.cc
-    src/cunumeric/stat/histogram_omp.cc    
+    src/cunumeric/stat/histogram_omp.cc
   )
 endif()
 

diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
@@ -108,7 +108,8 @@ std::vector<StoreMapping> CuNumericMapper::store_mappings(
         return {};
     }
     case CUNUMERIC_MATMUL:
-    case CUNUMERIC_MATVECMUL: {
+    case CUNUMERIC_MATVECMUL:
+    case CUNUMERIC_UNIQUE_REDUCE: {
       // TODO: Our actual requirements are a little less strict than this; we require each array or
       // vector to have a stride of 1 on at least one dimension.
       std::vector<StoreMapping> mappings;

diff --git a/src/cunumeric/random/bitgenerator.h b/src/cunumeric/random/bitgenerator.h
@@ -84,6 +84,11 @@ class BitGeneratorTask : public CuNumericTask<BitGeneratorTask> {
 
  public:
   static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  // TODO: Fully parallelized OpenMP implementation for BitGenerator
+  // Doing it this way is safe, but only one thread is being used out of the OpenMP pool.
+  static void omp_variant(legate::TaskContext& context) { BitGeneratorTask::cpu_variant(context); }
+#endif
 #ifdef LEGATE_USE_CUDA
   static void gpu_variant(legate::TaskContext& context);
 #endif

diff --git a/src/cunumeric/set/unique_reduce.cc b/src/cunumeric/set/unique_reduce.cc
@@ -19,33 +19,9 @@
 
 namespace cunumeric {
 
-using namespace legate;
-
-template <Type::Code CODE>
-struct UniqueReduceImplBody<VariantKind::CPU, CODE> {
-  using VAL = legate_type_of<CODE>;
-
-  void operator()(Array& output, const std::vector<std::pair<AccessorRO<VAL, 1>, Rect<1>>>& inputs)
-  {
-    std::set<VAL> dedup_set;
-
-    for (auto& pair : inputs) {
-      auto& input = pair.first;
-      auto& shape = pair.second;
-      for (coord_t idx = shape.lo[0]; idx <= shape.hi[0]; ++idx) dedup_set.insert(input[idx]);
-    }
-
-    size_t size = dedup_set.size();
-    size_t pos  = 0;
-    auto result = output.create_output_buffer<VAL, 1>(Point<1>(size), true);
-
-    for (auto e : dedup_set) result[pos++] = e;
-  }
-};
-
 /*static*/ void UniqueReduceTask::cpu_variant(TaskContext& context)
 {
-  unique_reduce_template<VariantKind::CPU>(context);
+  unique_reduce_template(context, thrust::host);
 }
 
 namespace  // unnamed

diff --git a/src/cunumeric/set/unique_reduce.h b/src/cunumeric/set/unique_reduce.h
@@ -26,6 +26,9 @@ class UniqueReduceTask : public CuNumericTask<UniqueReduceTask> {
 
  public:
   static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context);
+#endif
 };
 
 }  // namespace cunumeric
diff --git a/src/cunumeric/set/unique_reduce_omp.cc b/src/cunumeric/set/unique_reduce_omp.cc
@@ -0,0 +1,29 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/set/unique_reduce.h"
+#include "cunumeric/set/unique_reduce_template.inl"
+
+#include <thrust/system/omp/execution_policy.h>
+
+namespace cunumeric {
+
+/*static*/ void UniqueReduceTask::omp_variant(TaskContext& context)
+{
+  unique_reduce_template(context, thrust::omp::par);
+}
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/set/unique_reduce_template.inl b/src/cunumeric/set/unique_reduce_template.inl
@@ -20,38 +20,54 @@
 #include "cunumeric/set/unique_reduce.h"
 #include "cunumeric/pitches.h"
 
+#include <thrust/copy.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/execution_policy.h>
+
 namespace cunumeric {
 
 using namespace legate;
 
-template <VariantKind KIND, Type::Code CODE>
-struct UniqueReduceImplBody;
-
-template <VariantKind KIND>
+template <typename exe_pol_t>
 struct UniqueReduceImpl {
   template <Type::Code CODE>
-  void operator()(Array& output, std::vector<Array>& input_arrs)
+  void operator()(Array& output, std::vector<Array>& input_arrs, const exe_pol_t& exe_pol)
   {
     using VAL = legate_type_of<CODE>;
 
-    std::vector<std::pair<AccessorRO<VAL, 1>, Rect<1>>> inputs;
-
+    size_t res_size = 0;
     for (auto& input_arr : input_arrs) {
       auto shape = input_arr.shape<1>();
-      auto acc   = input_arr.read_accessor<VAL, 1>(shape);
-      inputs.push_back(std::make_pair(acc, shape));
+      res_size += shape.hi[0] - shape.lo[0] + 1;
+    }
+    auto result  = output.create_output_buffer<VAL, 1>(Point<1>(res_size));
+    VAL* res_ptr = result.ptr(0);
+
+    size_t offset = 0;
+    for (auto& input_arr : input_arrs) {
+      size_t strides[1];
+      Rect<1> shape     = input_arr.shape<1>();
+      size_t volume     = shape.volume();
+      const VAL* in_ptr = input_arr.read_accessor<VAL, 1>(shape).ptr(shape, strides);
+      assert(shape.volume() <= 1 || strides[0] == 1);
+      thrust::copy(exe_pol, in_ptr, in_ptr + volume, res_ptr + offset);
+      offset += volume;
     }
+    assert(offset == res_size);
 
-    UniqueReduceImplBody<KIND, CODE>()(output, inputs);
+    thrust::sort(exe_pol, res_ptr, res_ptr + res_size);
+    VAL* actual_end = thrust::unique(exe_pol, res_ptr, res_ptr + res_size);
+    output.bind_data(result, Point<1>(actual_end - res_ptr));
   }
 };
 
-template <VariantKind KIND>
-static void unique_reduce_template(TaskContext& context)
+template <typename exe_pol_t>
+static void unique_reduce_template(TaskContext& context, const exe_pol_t& exe_pol)
 {
   auto& inputs = context.inputs();
   auto& output = context.outputs()[0];
-  type_dispatch(output.code(), UniqueReduceImpl<KIND>{}, output, inputs);
+  type_dispatch(output.code(), UniqueReduceImpl<exe_pol_t>{}, output, inputs, exe_pol);
 }
 
 }  // namespace cunumeric