From 9594f1903de158fc7422ce907836535da1dc58ef Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 13:49:42 +0100 Subject: [PATCH 01/49] update OpenBLAS version to support new architectures --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 49b1ecce8..ae624ad58 100755 --- a/install.py +++ b/install.py @@ -160,7 +160,7 @@ def install_openblas(openblas_dir, thread_count, verbose): git_clone( temp_dir, url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.15", + tag="v0.3.19", verbose=verbose, ) # We can just build this directly From 69fbf7dc3115622357ae545b4de90bd78b3de053 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 14:10:38 +0100 Subject: [PATCH 02/49] initial draft for sort, 1D, key sort --- cunumeric/array.py | 36 ++++++- cunumeric/config.py | 1 + cunumeric/deferred.py | 8 ++ cunumeric/eager.py | 7 ++ cunumeric/lazy.py | 2 +- cunumeric/module.py | 31 +++++++ src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/sort/sort.cc | 134 +++++++++++++++++++++++++++ src/cunumeric/sort/sort.cu | 57 ++++++++++++ src/cunumeric/sort/sort.h | 68 ++++++++++++++ src/cunumeric/sort/sort_omp.cc | 49 ++++++++++ src/cunumeric/sort/sort_template.inl | 59 ++++++++++++ tests/sort.py | 44 +++++++++ 14 files changed, 498 insertions(+), 2 deletions(-) create mode 100644 src/cunumeric/sort/sort.cc create mode 100644 src/cunumeric/sort/sort.cu create mode 100644 src/cunumeric/sort/sort.h create mode 100644 src/cunumeric/sort/sort_omp.cc create mode 100644 src/cunumeric/sort/sort_template.inl create mode 100644 tests/sort.py diff --git a/cunumeric/array.py b/cunumeric/array.py index d7cbe985b..ead21a975 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1474,7 +1474,41 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__().setflags(write=write, align=align, uic=uic) + self.__array__(stacklevel=2).setflags( + write=write, align=align, uic=uic + ) + + def sort(self, axis=-1, kind="stable", order=None): + if kind != "stable": + runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis >= self.ndim or axis < -self.ndim: + raise ValueError("invalid axis") + + if self._thunk.scalar: + # nothing to do + return + elif self.ndim == 1: + # this is the default -- sorting of 1D array + self._thunk.sort(axis=axis) + return + else: + raise NotImplementedError( + "cuNumeric only supports sorting 1D arrays at the moment" + ) + + # no return value + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/config.py b/cunumeric/config.py index 1bd6fd198..76e1b97a7 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -100,6 +100,7 @@ class CuNumericOpCode(IntEnum): RAND = _cunumeric.CUNUMERIC_RAND READ = _cunumeric.CUNUMERIC_READ SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED + SORT = _cunumeric.CUNUMERIC_SORT SYRK = _cunumeric.CUNUMERIC_SYRK TILE = _cunumeric.CUNUMERIC_TILE TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index efb451175..7625eea6f 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1517,3 +1517,11 @@ def cholesky(self, src, no_tril=False): cholesky(self, src) 
if not no_tril: self.trilu(self, 0, True) + + def sort(self, axis=-1, kind="stable", order=None): + # TODO support axis parameter + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index fc297b085..e629c5d93 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,6 +502,13 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result + def sort(self, axis=-1, kind="stable", order=None): + self.check_eager_args(axis, kind, order) + if self.deferred is not None: + self.deferred.sort(axis, kind, order) + else: + self.array.sort(axis, kind, order) + def random_uniform(self): if self.deferred is not None: self.deferred.random_uniform() diff --git a/cunumeric/lazy.py b/cunumeric/lazy.py index 44eb1b0ea..90b63a842 100644 --- a/cunumeric/lazy.py +++ b/cunumeric/lazy.py @@ -128,7 +128,7 @@ def bincount(self, rhs, stacklevel, weights=None): def nonzero(self, stacklevel): raise NotImplementedError("Implement in derived classes") - def sort(self, rhs, stacklevel): + def sort(self, axis, kind, order): raise NotImplementedError("Implement in derived classes") def random_uniform(self, stacklevel): diff --git a/cunumeric/module.py b/cunumeric/module.py index db229f9bd..87455716f 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1802,6 +1802,37 @@ def where(a, x=None, y=None): return ndarray.perform_where(a, x, y) +# Sorting + + +def argsort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + return array.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +def sort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + out = array.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + +# Counting + + @add_boilerplate("a") def count_nonzero(a, axis=None): if a.size == 0: diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 2c7bb80ca..08662c5ee 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -43,6 +43,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/matrix/util.cc \ cunumeric/random/rand.cc \ cunumeric/search/nonzero.cc \ + cunumeric/sort/sort.cc \ cunumeric/stat/bincount.cc \ cunumeric/convolution/convolve.cc \ cunumeric/transform/flip.cc \ @@ -76,6 +77,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/matrix/util_omp.cc \ cunumeric/random/rand_omp.cc \ cunumeric/search/nonzero_omp.cc \ + cunumeric/sort/sort_omp.cc \ cunumeric/stat/bincount_omp.cc \ cunumeric/convolution/convolve_omp.cc \ cunumeric/transform/flip_omp.cc @@ -112,6 +114,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/matrix/trsm.cu \ cunumeric/random/rand.cu \ cunumeric/search/nonzero.cu \ + cunumeric/sort/sort.cu \ cunumeric/stat/bincount.cu \ cunumeric/convolution/convolve.cu \ cunumeric/transform/flip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index e6cd513be..574587731 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -45,6 +45,7 @@ enum CuNumericOpCode { CUNUMERIC_RAND, CUNUMERIC_READ, CUNUMERIC_SCALAR_UNARY_RED, + CUNUMERIC_SORT, CUNUMERIC_SYRK, 
CUNUMERIC_TILE, CUNUMERIC_TRANSPOSE_COPY_2D, diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc new file mode 100644 index 000000000..e42535cfc --- /dev/null +++ b/src/cunumeric/sort/sort.cc @@ -0,0 +1,134 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +// general routine +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // std::cout << "local size = " << volume << ", dist. = " << is_index_space << ", index_point = + // " + // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << + // std::endl; + + std::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // create (starting) sample of (at most) domain.get_volume() equidistant values + // also enrich values with additional indexes rank & local position in order to handle + // duplicate values + size_t num_local_samples = std::min(domain.get_volume(), volume); + size_t local_rank = index_point[0]; + auto local_samples = std::make_unique[]>(num_local_samples); + for (int i = 0; i < num_local_samples; ++i) { + const size_t index = (i + 1) * volume / num_local_samples - 1; + local_samples[i].value = inptr[index]; + local_samples[i].rank = local_rank; + local_samples[i].local_id = index; + } + + // std::cout << "local samples: size = " << num_local_samples << std::endl; + // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< + // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << + // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank + // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; + + // all2all those samples + // TODO broadcast package size + // TODO allocate targets + // TODO broadcast samples + size_t num_global_samples = 15; + std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); + + // sort all samples (utilize 2nd and 3rd sort criteria as well) + std::sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); + + // define splitters + auto splitters = std::make_unique[]>(domain.get_volume() - 1); + for (int i = 0; i < domain.get_volume() - 1; ++i) { + const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; + splitters[i] = global_samples[index]; + } + + do { + // compute local package sizes for every process based on splitters + std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); + { + size_t range_start = 0; + size_t local_position = 0; + for (int p_index = 0; p_index < 
domain.get_volume(); ++p) { + while (local_position < volume && still smaller or equal) { local_position++; } + + local_partition_size[partition_index++] = local_position - range_start; + range_start = local_position; + } + } + + // communicate local package-sizes all2all + // TODO + + // evaluate distribution result?? + // TODO + + // if (good enough) break; + // TODO + break; + // else iterate/improve splitters + // TODO + + } while (true); + + // all2all accepted distribution + // package sizes should already be known + // all2all communication + // TODO + + // final merge sort of received packages + // TODO + } + } +}; + +/*static*/ void SortTask::cpu_variant(TaskContext& context) +{ + sort_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { SortTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu new file mode 100644 index 000000000..f76b2871c --- /dev/null +++ b/src/cunumeric/sort/sort.cu @@ -0,0 +1,57 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include +#include +#include + +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + thrust::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // not implemented yet + assert(false); + } + } +}; + +/*static*/ void SortTask::gpu_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h new file mode 100644 index 000000000..8c3f5a0df --- /dev/null +++ b/src/cunumeric/sort/sort.h @@ -0,0 +1,68 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct SortArgs { + Array& output; + bool is_index_space; + Legion::DomainPoint index_point; + Legion::Domain domain; +}; + +template +struct SampleEntry { + VAL value; + size_t rank; + size_t local_id; +}; + +template +struct SampleEntryComparator { + bool operator()(const SampleEntry& a, const SampleEntry& b) const + { + if (a.value < b.value) { + return true; + } else if (a.value == b.value) { + if (a.rank < b.rank) { + return true; + } else if (a.rank == b.rank) { + return a.local_id < b.local_id; + } + } + return false; + } +}; + +class SortTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_SORT; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc new file mode 100644 index 000000000..d8ffadbd0 --- /dev/null +++ b/src/cunumeric/sort/sort_omp.cc @@ -0,0 +1,49 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // not implemented yet + assert(false); + } +}; + +/*static*/ void SortTask::omp_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl new file mode 100644 index 000000000..5355bdbbe --- /dev/null +++ b/src/cunumeric/sort/sort_template.inl @@ -0,0 +1,59 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND, LegateTypeCode CODE, int32_t DIM>
+struct SortImplBody;
+
+template <VariantKind KIND>
+struct SortImpl {
+  template <LegateTypeCode CODE, int32_t DIM>
+  void operator()(SortArgs& args) const
+  {
+    using VAL = legate_type_of<CODE>;
+
+    auto rect = args.output.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(rect);
+
+    // TODO -- we cannot stop! need to proceed as partition might be filled later
+    if (volume == 0) { return; }
+
+    auto inout = args.output.read_write_accessor<VAL, DIM>(rect);
+
+    SortImplBody<KIND, CODE, DIM>()(
+      inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain);
+  }
+};
+
+template <VariantKind KIND>
+static void sort_template(TaskContext& context)
+{
+  SortArgs args{context.outputs()[0],
+                context.task_->is_index_space,
+                context.task_->index_point,
+                context.task_->index_domain};
+  double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/sort.py b/tests/sort.py
new file mode 100644
index 000000000..ab5c91193
--- /dev/null
+++ b/tests/sort.py
@@ -0,0 +1,44 @@
+# Copyright 2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+
+import cunumeric as num
+
+
+def test():
+    np.random.seed(42)
+    A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)
+
+    A_num = num.array(A_np)
+    print("Sorting array   : " + str(A_np))
+
+    sortA_np = np.sort(A_np)
+    print("Result numpy    : " + str(sortA_np))
+
+    # pdb.set_trace()
+    sortA_num = num.sort(A_num)
+    print("Result cunumeric: " + str(sortA_num))
+
+    A_num.sort()
+    print("Result (inplace): " + str(A_num))
+
+    assert num.allclose(sortA_np, sortA_num)
+
+    return
+
+
+if __name__ == "__main__":
+    test()

From dfa7adbcca414828084a686288fa7f195aeff6b5 Mon Sep 17 00:00:00 2001
From: mfoerste4
Date: Tue, 8 Feb 2022 15:17:28 +0100
Subject: [PATCH 03/49] fixed compile error

---
 src/cunumeric/sort/sort.cc | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index e42535cfc..28cf83c23 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -84,15 +84,23 @@ struct SortImplBody {

       do {
         // compute local package sizes for every process based on splitters
-        std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
+        std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
         {
           size_t range_start = 0;
           size_t local_position = 0;
-          for (int p_index = 0; p_index < domain.get_volume(); ++p) {
-            while (local_position < volume && still smaller or equal) { local_position++; }
-
-            local_partition_size[partition_index++] = local_position - range_start;
-            range_start = local_position;
+          for (int p_index = 0; p_index < domain.get_volume(); ++p_index) {
+            // move as long as the current value is less than or equal to the current splitter
+            while (local_position < volume &&
+                   (inptr[local_position] < splitters[p_index].value ||
+                    (inptr[local_position] == splitters[p_index].value &&
+
(local_rank < splitters[p_index].rank || + (local_rank == splitters[p_index].rank && + local_position <= splitters[p_index].local_id))))) { + local_position++; + } + + local_partition_size[p_index++] = local_position - range_start; + range_start = local_position; } } From 4c7c3a23a13d5e06297cb82bb70430f718ee6910 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:03:50 +0100 Subject: [PATCH 04/49] OpenMP non-distributed implementation, some small fixes, benchmark tool --- examples/sort.py | 102 +++++++++++++++++++++++++++++++++ src/cunumeric/sort/sort.cc | 17 +++--- src/cunumeric/sort/sort.cu | 9 ++- src/cunumeric/sort/sort_omp.cc | 72 ++++++++++++++++++++++- tests/sort.py | 4 +- 5 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 examples/sort.py diff --git a/examples/sort.py b/examples/sort.py new file mode 100644 index 000000000..9142c8a12 --- /dev/null +++ b/examples/sort.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright 2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import datetime + +import numpy +from benchmark import run_benchmark + +import cunumeric + + +def check_sorted(a, a_numpy): + a_sorted = numpy.sort(a_numpy) + print("Checking result...") + if cunumeric.allclose(a_sorted, a): + print("PASS!") + else: + print("FAIL!") + print("NUMPY : " + str(a_sorted)) + print("CUNUMERIC: " + str(a)) + + +def run_sort(N, perform_check, timing): + + numpy.random.seed(42) + a_numpy = numpy.array( + numpy.random.randint(1000, size=N), dtype=numpy.int32 + ) + a = cunumeric.array(a_numpy) + + start = datetime.datetime.now() + a_sorted = cunumeric.sort(a) + stop = datetime.datetime.now() + + if perform_check: + check_sorted(a_sorted, a_numpy) + else: + # do we need to synchronize? + assert True + delta = stop - start + total = delta.total_seconds() * 1000.0 + if timing: + print("Elapsed Time: " + str(total) + " ms") + return total + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--check", + dest="check", + action="store_true", + help="check the result of the solve", + ) + parser.add_argument( + "-n", + "--num", + type=int, + default=1000000, + dest="N", + help="number of elements in one dimension", + ) + parser.add_argument( + "-t", + "--time", + dest="timing", + action="store_true", + help="perform timing", + ) + parser.add_argument( + "-b", + "--benchmark", + type=int, + default=1, + dest="benchmark", + help="number of times to benchmark this application (default 1 - " + "normal execution)", + ) + + args = parser.parse_args() + run_benchmark( + run_sort, + args.benchmark, + "Sort", + (args.N, args.check, args.timing), + ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 28cf83c23..af96acdba 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,12 +35,13 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // std::cout << "local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = - // " - // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << - // std::endl; +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif - std::sort(inptr, inptr + volume); + std::stable_sort(inptr, inptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { @@ -71,9 +72,9 @@ struct SortImplBody { std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); + std::stable_sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); // define splitters auto splitters = std::make_unique[]>(domain.get_volume() - 1); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f76b2871c..2ce1987ad 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -39,7 +39,14 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - thrust::sort(inptr, inptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif + + thrust::device_ptr dev_ptr(inptr); + thrust::stable_sort(dev_ptr, dev_ptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index d8ffadbd0..1f2d00262 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -24,6 +24,51 @@ namespace cunumeric { using namespace Legion; using namespace legate; +template +void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) +{ + const size_t mid = (end_idx + start_idx) / 2; + size_t left_idx = start_idx; + size_t right_idx = mid; + size_t target_idx = start_idx; + + while (left_idx < mid && right_idx < end_idx) { + if (inptr[left_idx] <= inptr[right_idx]) { + tmp[target_idx++] = inptr[left_idx++]; + } else { + tmp[target_idx++] = inptr[right_idx++]; + } + } + + while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } + while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } + + std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); +} + +// TODO tune +#define SEQUENTIAL_THRESHOLD 1024 +#define TASK_THRESHOLD 2048 + +template +void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) +{ + const size_t size = end_idx - start_idx + 1; + if (size > SEQUENTIAL_THRESHOLD) { + const size_t mid = (end_idx + start_idx) / 2; + +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, start_idx, mid, tmp); +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, mid, end_idx, tmp); + +#pragma omp taskwait + merge(inptr, start_idx, end_idx, tmp); + } else if (size > 1) { + std::stable_sort(inptr + start_idx, inptr + end_idx); + } +} + template struct SortImplBody { using VAL = legate_type_of; @@ -36,8 +81,31 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // not 
implemented yet - assert(false); +#ifdef DEBUG_CUNUMERIC + std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() + << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << std::endl; +#endif + + bool nested = omp_get_nested(); + if (!nested) omp_set_nested(1); + + // merge sort + auto tmp = std::make_unique(volume); + +#pragma omp parallel shared(inptr, tmp) + { +#pragma omp single + merge_sort(inptr, 0, volume, &(tmp[0])); + } + + if (is_index_space) { + // not implemented yet + assert(false); + } + + if (!nested) omp_set_nested(0); } }; diff --git a/tests/sort.py b/tests/sort.py index ab5c91193..b8945d19d 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,11 +31,11 @@ def test(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) + assert num.allclose(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - - assert num.allclose(sortA_np, sortA_num) + assert num.allclose(sortA_np, A_num) return From 710c084590f204f47038054f34c7d9abdf1c92f4 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:10:51 +0100 Subject: [PATCH 05/49] added missing include --- src/cunumeric/sort/sort.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ce1987ad..5530de63d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include "cunumeric/cuda_help.h" From 07bdb1664257eb976396acd1c93026af1db3cb6f Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:48:03 +0100 Subject: [PATCH 06/49] switch to parallel gcc sort --- src/cunumeric/sort/sort_omp.cc | 60 ++-------------------------------- 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1f2d00262..030d26cf0 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,6 +17,7 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include #include namespace cunumeric { @@ -24,51 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) -{ - const size_t mid = (end_idx + start_idx) / 2; - size_t left_idx = start_idx; - size_t right_idx = mid; - size_t target_idx = start_idx; - - while (left_idx < mid && right_idx < end_idx) { - if (inptr[left_idx] <= inptr[right_idx]) { - tmp[target_idx++] = inptr[left_idx++]; - } else { - tmp[target_idx++] = inptr[right_idx++]; - } - } - - while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } - while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } - - std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); -} - -// TODO tune -#define SEQUENTIAL_THRESHOLD 1024 -#define TASK_THRESHOLD 2048 - -template -void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) -{ - const size_t size = end_idx - start_idx + 1; - if (size > SEQUENTIAL_THRESHOLD) { - const size_t mid = (end_idx + start_idx) / 2; - -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, start_idx, mid, tmp); -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, mid, end_idx, tmp); - -#pragma omp taskwait - merge(inptr, start_idx, end_idx, 
tmp); - } else if (size > 1) { - std::stable_sort(inptr + start_idx, inptr + end_idx); - } -} - template struct SortImplBody { using VAL = legate_type_of; @@ -88,24 +44,12 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - bool nested = omp_get_nested(); - if (!nested) omp_set_nested(1); - - // merge sort - auto tmp = std::make_unique(volume); - -#pragma omp parallel shared(inptr, tmp) - { -#pragma omp single - merge_sort(inptr, 0, volume, &(tmp[0])); - } + __gnu_parallel::stable_sort(inptr, inptr + volume); if (is_index_space) { // not implemented yet assert(false); } - - if (!nested) omp_set_nested(0); } }; From 58b2bf44315dca6d526cd274de38d0254f44001a Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Feb 2022 09:16:31 -0800 Subject: [PATCH 07/49] Enable N-D non-distributed sort --- cunumeric/array.py | 11 +--- cunumeric/deferred.py | 37 ++++++++++--- cunumeric/module.py | 8 +-- src/cunumeric/sort/sort.cc | 15 ++++-- src/cunumeric/sort/sort.cu | 7 ++- src/cunumeric/sort/sort.h | 1 + src/cunumeric/sort/sort_omp.cc | 15 +++++- src/cunumeric/sort/sort_template.inl | 31 +++++++++-- tests/sort.py | 77 +++++++++++++++++++++++++++- 9 files changed, 170 insertions(+), 32 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index ead21a975..e801010d3 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1498,16 +1498,9 @@ def sort(self, axis=-1, kind="stable", order=None): if self._thunk.scalar: # nothing to do return - elif self.ndim == 1: - # this is the default -- sorting of 1D array - self._thunk.sort(axis=axis) - return else: - raise NotImplementedError( - "cuNumeric only supports sorting 1D arrays at the moment" - ) - - # no return value + # this is the default -- sorting of N-D array + self._thunk.sort(axis=axis) return def squeeze(self, axis=None): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 7625eea6f..7cc5499ba 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,9 +1519,34 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - # TODO support axis parameter - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + axis_normalized = axis + if axis_normalized < 0: + axis_normalized = self.ndim + axis + + if axis_normalized is not self.ndim - 1: + assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + + # swap axes + swapped = self.swapaxes(axis_normalized, self.ndim - 1) + + # FIXME: ensure *new* distribution does not split last axis (!) 
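+            # The block below mirrors this NumPy-level sketch (assumption:
+            # swapaxes returns a view, so a deep copy is needed to obtain a
+            # contiguous buffer whose last axis is the sort axis):
+            #   tmp = np.swapaxes(a, axis, a.ndim - 1).copy()
+            #   tmp.sort(axis=-1)
+            #   a[...] = np.swapaxes(tmp, axis, a.ndim - 1)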
+ swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + axis_normalized, self.ndim - 1 + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/module.py b/cunumeric/module.py index 87455716f..99db26846 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1805,9 +1805,9 @@ def where(a, x=None, y=None): # Sorting +@add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - return array.argsort(axis=axis, kind=kind, order=order) + return a.argsort(axis=axis, kind=kind, order=order) def lexsort(a, axis=-1): @@ -1818,9 +1818,9 @@ def msort(a): return sort(a) +@add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - out = array.copy() + out = a.copy() out_array = ndarray.convert_to_cunumeric_ndarray(out) out_array._thunk.sort(axis=axis, kind=kind, order=order) return out_array diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index af96acdba..4d26ce1ad 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,6 +31,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -39,12 +40,20 @@ struct SortImplBody { std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; + + if (volume <= 30) { + std::cout << "inptr = [ "; + for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } + std::cout << "]" << std::endl; + } #endif - std::stable_sort(inptr, inptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } - // in case of distributed data we need to switch to sample sort - if (is_index_space) { + // in case of distributed data (1D) we need to switch to sample sort + if (is_index_space && DIM == 1) { // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 5530de63d..f43445a8b 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -36,6 +36,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,10 +48,12 @@ struct SortImplBody { #endif thrust::device_ptr dev_ptr(inptr); - thrust::stable_sort(dev_ptr, dev_ptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + } // in case of distributed data we need to switch to sample sort - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 8c3f5a0df..febc1f57c 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,6 +22,7 @@ namespace cunumeric { struct SortArgs { Array& output; + size_t sort_dim_size; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 030d26cf0..6c26f07ae 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,6 +33,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,9 +45,19 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - __gnu_parallel::stable_sort(inptr, inptr + volume); + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5355bdbbe..b1922f014 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -36,13 +36,33 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - // TODO -- we cannot stop! need to proceed as partition might be filled later - if (volume == 0) { return; } - auto inout = args.output.read_write_accessor(rect); - SortImplBody()( - inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain); + /* + * Assumptions: + * 1. 
Sort is always requested for the 'last' dimension within rect
+ * 2. We have product_of_all_other_dimensions independent sort ranges
+ * 3. if we have more than one participant:
+ *    a) 1D-case: we need to perform parallel sort (e.g. via sampling)
+ *    b) ND-case: rect needs to be the full domain in that last dimension
+ */
+
+#ifdef DEBUG_CUNUMERIC
+    std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size
+              << std::endl;
+
+    assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) &&
+           "multi-dimensional array should not be distributed in last (sort) dimension");
+#endif
+
+    SortImplBody<KIND, CODE, DIM>()(inout.ptr(rect),
+                                    pitches,
+                                    rect,
+                                    volume,
+                                    args.sort_dim_size,
+                                    args.is_index_space,
+                                    args.index_point,
+                                    args.domain);
   }
 };

@@ -50,6 +70,7 @@ template <VariantKind KIND>
 static void sort_template(TaskContext& context)
 {
   SortArgs args{context.outputs()[0],
+                context.scalars()[0].value<size_t>(),
                 context.task_->is_index_space,
                 context.task_->index_point,
                 context.task_->index_domain};
   double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
 }
diff --git a/tests/sort.py b/tests/sort.py
index b8945d19d..bdc4c4b93 100644
--- a/tests/sort.py
+++ b/tests/sort.py
@@ -18,7 +18,17 @@ import cunumeric as num


-def test():
+def test_sort_axis(a_np, a_num, axis):
+    assert num.allclose(a_np, a_num)
+    print("Sorting axis " + str(axis) + ":")
+    sort_np = np.sort(a_np, axis)
+    sort_num = num.sort(a_num, axis, kind="merge")
+    # print(sort_np)
+    # print(sort_num)
+    assert num.allclose(sort_np, sort_num)
+
+
+def test_1D():
     np.random.seed(42)
     A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)

@@ -40,5 +50,70 @@ def test():
     return


+def test_2D():
+    np.random.seed(42)
+    x_dim = 5
+    y_dim = 3
+    A_np = np.array(
+        np.random.randint(10, size=x_dim * y_dim), dtype=np.int32
+    ).reshape(x_dim, y_dim)
+
+    A_num = num.array(A_np)
+    print("Sorting matrix:\n")
+    print(A_num)
+
+    test_sort_axis(A_np, A_num, 1)
+    test_sort_axis(A_np, A_num, 0)
+
+    return
+
+
+def test_3D():
+    np.random.seed(42)
+    x_dim = 5
+    y_dim = 3
+    z_dim = 7
+    A_np = np.array(
+        np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32
+    ).reshape(x_dim, y_dim, z_dim)
+
+    A_num = num.array(A_np)
+    print("Sorting 3d tensor:\n")
+    print(A_np)
+
+    test_sort_axis(A_np, A_num, 2)
+    test_sort_axis(A_np, A_num, 1)
+    test_sort_axis(A_np, A_num, 0)
+
+    return
+
+
+def test_custom():
+    a = np.arange(2 * 4).reshape(2, 4)
+    a_transpose = np.transpose(a)
+
+    a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]])
+    a_num = num.array(a)
+    a_num_transposed = a_num.swapaxes(0, 1)
+
+    test_sort_axis(a, a_num, 1)
+    test_sort_axis(a_transpose, a_transposed_num, 1)
+    test_sort_axis(a_transpose, a_num_transposed, 1)
+    test_sort_axis(a_transpose, a_num_transposed, 0)
+
+    return
+
+
+def test():
+    print("\n\n ----------- Custom test ---------------\n")
+    test_custom()
+    print("\n\n ----------- 2D test ---------------\n")
+    test_2D()
+    print("\n\n ----------- 3D test ---------------\n")
+    test_3D()
+    print("\n\n ----------- 1D test ---------------\n")
+    test_1D()
+
+
 if __name__ == "__main__":
     test()
url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.15", + tag="v0.3.19", verbose=verbose, ) # We can just build this directly From 3a08481c1f407b24e37841708d3e9b44c9c0ffd4 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 14:10:38 +0100 Subject: [PATCH 09/49] initial draft for sort, 1D, key sort --- cunumeric/array.py | 36 ++++++- cunumeric/config.py | 1 + cunumeric/deferred.py | 8 ++ cunumeric/eager.py | 7 ++ cunumeric/lazy.py | 2 +- cunumeric/module.py | 28 ++++++ src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/sort/sort.cc | 134 +++++++++++++++++++++++++++ src/cunumeric/sort/sort.cu | 57 ++++++++++++ src/cunumeric/sort/sort.h | 68 ++++++++++++++ src/cunumeric/sort/sort_omp.cc | 49 ++++++++++ src/cunumeric/sort/sort_template.inl | 59 ++++++++++++ tests/sort.py | 44 +++++++++ 14 files changed, 495 insertions(+), 2 deletions(-) create mode 100644 src/cunumeric/sort/sort.cc create mode 100644 src/cunumeric/sort/sort.cu create mode 100644 src/cunumeric/sort/sort.h create mode 100644 src/cunumeric/sort/sort_omp.cc create mode 100644 src/cunumeric/sort/sort_template.inl create mode 100644 tests/sort.py diff --git a/cunumeric/array.py b/cunumeric/array.py index c214db335..0becb1712 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1476,7 +1476,41 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__().setflags(write=write, align=align, uic=uic) + self.__array__(stacklevel=2).setflags( + write=write, align=align, uic=uic + ) + + def sort(self, axis=-1, kind="stable", order=None): + if kind != "stable": + runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis >= self.ndim or axis < -self.ndim: + raise ValueError("invalid axis") + + if self._thunk.scalar: + # nothing to do + return + elif self.ndim == 1: + # this is the default -- sorting of 1D array + self._thunk.sort(axis=axis) + return + else: + raise NotImplementedError( + "cuNumeric only supports sorting 1D arrays at the moment" + ) + + # no return value + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/config.py b/cunumeric/config.py index 2968c50ea..d4e942fda 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -100,6 +100,7 @@ class CuNumericOpCode(IntEnum): RAND = _cunumeric.CUNUMERIC_RAND READ = _cunumeric.CUNUMERIC_READ SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED + SORT = _cunumeric.CUNUMERIC_SORT SYRK = _cunumeric.CUNUMERIC_SYRK TILE = _cunumeric.CUNUMERIC_TILE TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 101dc184d..871f32c1a 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1517,3 +1517,11 @@ def cholesky(self, src, no_tril=False): cholesky(self, src) if not no_tril: self.trilu(self, 0, True) + + def sort(self, axis=-1, kind="stable", order=None): + # TODO support axis parameter + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index e1a19b1c2..0ebdd8959 100644 --- a/cunumeric/eager.py 
+++ b/cunumeric/eager.py @@ -502,6 +502,13 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result + def sort(self, axis=-1, kind="stable", order=None): + self.check_eager_args(axis, kind, order) + if self.deferred is not None: + self.deferred.sort(axis, kind, order) + else: + self.array.sort(axis, kind, order) + def random_uniform(self): if self.deferred is not None: self.deferred.random_uniform() diff --git a/cunumeric/lazy.py b/cunumeric/lazy.py index bca9b7103..5b861a749 100644 --- a/cunumeric/lazy.py +++ b/cunumeric/lazy.py @@ -128,7 +128,7 @@ def bincount(self, rhs, stacklevel, weights=None): def nonzero(self, stacklevel): raise NotImplementedError("Implement in derived classes") - def sort(self, rhs, stacklevel): + def sort(self, axis, kind, order): raise NotImplementedError("Implement in derived classes") def random_uniform(self, stacklevel): diff --git a/cunumeric/module.py b/cunumeric/module.py index 9237c7e6c..55998d448 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5846,3 +5846,31 @@ def bincount(a, weights=None, minlength=0): ) out._thunk.bincount(a._thunk, weights=weights._thunk) return out + +# Sorting + +def argsort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + return array.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +def sort(a, axis=-1, kind="stable", order=None): + array = ndarray.convert_to_cunumeric_ndarray(a) + out = array.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 20c74128a..f695ac97b 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -43,6 +43,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/matrix/util.cc \ cunumeric/random/rand.cc \ cunumeric/search/nonzero.cc \ + cunumeric/sort/sort.cc \ cunumeric/stat/bincount.cc \ cunumeric/convolution/convolve.cc \ cunumeric/transform/flip.cc \ @@ -76,6 +77,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/matrix/util_omp.cc \ cunumeric/random/rand_omp.cc \ cunumeric/search/nonzero_omp.cc \ + cunumeric/sort/sort_omp.cc \ cunumeric/stat/bincount_omp.cc \ cunumeric/convolution/convolve_omp.cc \ cunumeric/transform/flip_omp.cc @@ -112,6 +114,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/matrix/trsm.cu \ cunumeric/random/rand.cu \ cunumeric/search/nonzero.cu \ + cunumeric/sort/sort.cu \ cunumeric/stat/bincount.cu \ cunumeric/convolution/convolve.cu \ cunumeric/transform/flip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index e6612abe1..abbc13fbb 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -45,6 +45,7 @@ enum CuNumericOpCode { CUNUMERIC_RAND, CUNUMERIC_READ, CUNUMERIC_SCALAR_UNARY_RED, + CUNUMERIC_SORT, CUNUMERIC_SYRK, CUNUMERIC_TILE, CUNUMERIC_TRANSPOSE_COPY_2D, diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc new file mode 100644 index 000000000..e42535cfc --- /dev/null +++ b/src/cunumeric/sort/sort.cc @@ -0,0 +1,134 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +// general routine +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // std::cout << "local size = " << volume << ", dist. = " << is_index_space << ", index_point = + // " + // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << + // std::endl; + + std::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // create (starting) sample of (at most) domain.get_volume() equidistant values + // also enrich values with additional indexes rank & local position in order to handle + // duplicate values + size_t num_local_samples = std::min(domain.get_volume(), volume); + size_t local_rank = index_point[0]; + auto local_samples = std::make_unique[]>(num_local_samples); + for (int i = 0; i < num_local_samples; ++i) { + const size_t index = (i + 1) * volume / num_local_samples - 1; + local_samples[i].value = inptr[index]; + local_samples[i].rank = local_rank; + local_samples[i].local_id = index; + } + + // std::cout << "local samples: size = " << num_local_samples << std::endl; + // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< + // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << + // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank + // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; + + // all2all those samples + // TODO broadcast package size + // TODO allocate targets + // TODO broadcast samples + size_t num_global_samples = 15; + std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); + + // sort all samples (utilize 2nd and 3rd sort criteria as well) + std::sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); + + // define splitters + auto splitters = std::make_unique[]>(domain.get_volume() - 1); + for (int i = 0; i < domain.get_volume() - 1; ++i) { + const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; + splitters[i] = global_samples[index]; + } + + do { + // compute local package sizes for every process based on splitters + std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); + { + size_t range_start = 0; + size_t local_position = 0; + for (int p_index = 0; p_index < domain.get_volume(); ++p) { + while (local_position < volume && still smaller or equal) { local_position++; } + + local_partition_size[partition_index++] = local_position - range_start; + range_start = local_position; + } + } + + // communicate local package-sizes all2all + // TODO + + // evaluate distribution result?? 
+ // TODO + + // if (good enough) break; + // TODO + break; + // else iterate/improve splitters + // TODO + + } while (true); + + // all2all accepted distribution + // package sizes should already be known + // all2all communication + // TODO + + // final merge sort of received packages + // TODO + } + } +}; + +/*static*/ void SortTask::cpu_variant(TaskContext& context) +{ + sort_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { SortTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu new file mode 100644 index 000000000..f76b2871c --- /dev/null +++ b/src/cunumeric/sort/sort.cu @@ -0,0 +1,57 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include +#include +#include + +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + thrust::sort(inptr, inptr + volume); + + // in case of distributed data we need to switch to sample sort + if (is_index_space) { + // not implemented yet + assert(false); + } + } +}; + +/*static*/ void SortTask::gpu_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h new file mode 100644 index 000000000..8c3f5a0df --- /dev/null +++ b/src/cunumeric/sort/sort.h @@ -0,0 +1,68 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct SortArgs { + Array& output; + bool is_index_space; + Legion::DomainPoint index_point; + Legion::Domain domain; +}; + +template +struct SampleEntry { + VAL value; + size_t rank; + size_t local_id; +}; + +template +struct SampleEntryComparator { + bool operator()(const SampleEntry& a, const SampleEntry& b) const + { + if (a.value < b.value) { + return true; + } else if (a.value == b.value) { + if (a.rank < b.rank) { + return true; + } else if (a.rank == b.rank) { + return a.local_id < b.local_id; + } + } + return false; + } +}; + +class SortTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_SORT; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc new file mode 100644 index 000000000..d8ffadbd0 --- /dev/null +++ b/src/cunumeric/sort/sort_omp.cc @@ -0,0 +1,49 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/sort/sort.h" +#include "cunumeric/sort/sort_template.inl" + +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct SortImplBody { + using VAL = legate_type_of; + + void operator()(VAL* inptr, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + bool is_index_space, + Legion::DomainPoint index_point, + Legion::Domain domain) + { + // not implemented yet + assert(false); + } +}; + +/*static*/ void SortTask::omp_variant(TaskContext& context) +{ + sort_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl new file mode 100644 index 000000000..5355bdbbe --- /dev/null +++ b/src/cunumeric/sort/sort_template.inl @@ -0,0 +1,59 @@ +/* Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND, LegateTypeCode CODE, int32_t DIM>
+struct SortImplBody;
+
+template <VariantKind KIND>
+struct SortImpl {
+  template <LegateTypeCode CODE, int32_t DIM>
+  void operator()(SortArgs& args) const
+  {
+    using VAL = legate_type_of<CODE>;
+
+    auto rect = args.output.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(rect);
+
+    // TODO -- we cannot stop early! we must proceed, as this partition might be filled later
+    if (volume == 0) { return; }
+
+    auto inout = args.output.read_write_accessor<VAL, DIM>(rect);
+
+    SortImplBody<KIND, CODE, DIM>()(
+      inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain);
+  }
+};
+
+template <VariantKind KIND>
+static void sort_template(TaskContext& context)
+{
+  SortArgs args{context.outputs()[0],
+                context.task_->is_index_space,
+                context.task_->index_point,
+                context.task_->index_domain};
+  double_dispatch(args.output.dim(), args.output.code(), SortImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/sort.py b/tests/sort.py
new file mode 100644
index 000000000..ab5c91193
--- /dev/null
+++ b/tests/sort.py
@@ -0,0 +1,44 @@
+# Copyright 2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+
+import cunumeric as num
+
+
+def test():
+    np.random.seed(42)
+    A_np = np.array(np.random.randint(10, size=30), dtype=np.int32)
+
+    A_num = num.array(A_np)
+    print("Sorting array : " + str(A_np))
+
+    sortA_np = np.sort(A_np)
+    print("Result numpy : " + str(sortA_np))
+
+    # pdb.set_trace()
+    sortA_num = num.sort(A_num)
+    print("Result cunumeric: " + str(sortA_num))
+
+    A_num.sort()
+    print("Result (inplace): " + str(A_num))
+
+    assert num.allclose(sortA_np, sortA_num)
+
+    return
+
+
+if __name__ == "__main__":
+    test()

From 131fb6d3fd583888207ad401427ac79fb5072c36 Mon Sep 17 00:00:00 2001
From: mfoerste4
Date: Tue, 8 Feb 2022 15:17:28 +0100
Subject: [PATCH 10/49] fixed compile error

---
 src/cunumeric/sort/sort.cc | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index e42535cfc..28cf83c23 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -84,15 +84,23 @@ struct SortImplBody {

     do {
       // compute local package sizes for every process based on splitters
-      std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
+      std::unique_ptr local_partition_size(new size_t[domain.get_volume()]);
       {
         size_t range_start    = 0;
         size_t local_position = 0;
-        for (int p_index = 0; p_index < domain.get_volume(); ++p) {
-          while (local_position < volume && still smaller or equal) { local_position++; }
-
-          local_partition_size[partition_index++] = local_position - range_start;
-          range_start = local_position;
+        for (int p_index = 0; p_index < domain.get_volume(); ++p_index) {
+          // advance while the current value is less than or equal to the current splitter
+          while (local_position < volume &&
+                 (inptr[local_position] < splitters[p_index].value ||
+                  (inptr[local_position] == splitters[p_index].value &&
+                   
(local_rank < splitters[p_index].rank || + (local_rank == splitters[p_index].rank && + local_position <= splitters[p_index].local_id))))) { + local_position++; + } + + local_partition_size[p_index++] = local_position - range_start; + range_start = local_position; } } From b115835512ceba71a215c0338497b96473c81c30 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:03:50 +0100 Subject: [PATCH 11/49] OpenMP non-distributed implementation, some small fixes, benchmark tool --- examples/sort.py | 102 +++++++++++++++++++++++++++++++++ src/cunumeric/sort/sort.cc | 17 +++--- src/cunumeric/sort/sort.cu | 9 ++- src/cunumeric/sort/sort_omp.cc | 72 ++++++++++++++++++++++- tests/sort.py | 4 +- 5 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 examples/sort.py diff --git a/examples/sort.py b/examples/sort.py new file mode 100644 index 000000000..9142c8a12 --- /dev/null +++ b/examples/sort.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright 2021 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import datetime + +import numpy +from benchmark import run_benchmark + +import cunumeric + + +def check_sorted(a, a_numpy): + a_sorted = numpy.sort(a_numpy) + print("Checking result...") + if cunumeric.allclose(a_sorted, a): + print("PASS!") + else: + print("FAIL!") + print("NUMPY : " + str(a_sorted)) + print("CUNUMERIC: " + str(a)) + + +def run_sort(N, perform_check, timing): + + numpy.random.seed(42) + a_numpy = numpy.array( + numpy.random.randint(1000, size=N), dtype=numpy.int32 + ) + a = cunumeric.array(a_numpy) + + start = datetime.datetime.now() + a_sorted = cunumeric.sort(a) + stop = datetime.datetime.now() + + if perform_check: + check_sorted(a_sorted, a_numpy) + else: + # do we need to synchronize? + assert True + delta = stop - start + total = delta.total_seconds() * 1000.0 + if timing: + print("Elapsed Time: " + str(total) + " ms") + return total + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--check", + dest="check", + action="store_true", + help="check the result of the solve", + ) + parser.add_argument( + "-n", + "--num", + type=int, + default=1000000, + dest="N", + help="number of elements in one dimension", + ) + parser.add_argument( + "-t", + "--time", + dest="timing", + action="store_true", + help="perform timing", + ) + parser.add_argument( + "-b", + "--benchmark", + type=int, + default=1, + dest="benchmark", + help="number of times to benchmark this application (default 1 - " + "normal execution)", + ) + + args = parser.parse_args() + run_benchmark( + run_sort, + args.benchmark, + "Sort", + (args.N, args.check, args.timing), + ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 28cf83c23..af96acdba 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,12 +35,13 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // std::cout << "local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = - // " - // << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << - // std::endl; +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif - std::sort(inptr, inptr + volume); + std::stable_sort(inptr, inptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { @@ -71,9 +72,9 @@ struct SortImplBody { std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); + std::stable_sort(&(global_samples[0]), + &(global_samples[0]) + num_global_samples, + SampleEntryComparator()); // define splitters auto splitters = std::make_unique[]>(domain.get_volume() - 1); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f76b2871c..2ce1987ad 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -39,7 +39,14 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - thrust::sort(inptr, inptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; +#endif + + thrust::device_ptr dev_ptr(inptr); + thrust::stable_sort(dev_ptr, dev_ptr + volume); // in case of distributed data we need to switch to sample sort if (is_index_space) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index d8ffadbd0..1f2d00262 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -24,6 +24,51 @@ namespace cunumeric { using namespace Legion; using namespace legate; +template +void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) +{ + const size_t mid = (end_idx + start_idx) / 2; + size_t left_idx = start_idx; + size_t right_idx = mid; + size_t target_idx = start_idx; + + while (left_idx < mid && right_idx < end_idx) { + if (inptr[left_idx] <= inptr[right_idx]) { + tmp[target_idx++] = inptr[left_idx++]; + } else { + tmp[target_idx++] = inptr[right_idx++]; + } + } + + while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } + while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } + + std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); +} + +// TODO tune +#define SEQUENTIAL_THRESHOLD 1024 +#define TASK_THRESHOLD 2048 + +template +void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) +{ + const size_t size = end_idx - start_idx + 1; + if (size > SEQUENTIAL_THRESHOLD) { + const size_t mid = (end_idx + start_idx) / 2; + +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, start_idx, mid, tmp); +#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) + merge_sort(inptr, mid, end_idx, tmp); + +#pragma omp taskwait + merge(inptr, start_idx, end_idx, tmp); + } else if (size > 1) { + std::stable_sort(inptr + start_idx, inptr + end_idx); + } +} + template struct SortImplBody { using VAL = legate_type_of; @@ -36,8 +81,31 @@ struct SortImplBody { Legion::DomainPoint index_point, Legion::Domain domain) { - // not 
implemented yet - assert(false); +#ifdef DEBUG_CUNUMERIC + std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() + << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << std::endl; +#endif + + bool nested = omp_get_nested(); + if (!nested) omp_set_nested(1); + + // merge sort + auto tmp = std::make_unique(volume); + +#pragma omp parallel shared(inptr, tmp) + { +#pragma omp single + merge_sort(inptr, 0, volume, &(tmp[0])); + } + + if (is_index_space) { + // not implemented yet + assert(false); + } + + if (!nested) omp_set_nested(0); } }; diff --git a/tests/sort.py b/tests/sort.py index ab5c91193..b8945d19d 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,11 +31,11 @@ def test(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) + assert num.allclose(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - - assert num.allclose(sortA_np, sortA_num) + assert num.allclose(sortA_np, A_num) return From 03608cfaf7ca2903d41063115d0ebf01782412d3 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:10:51 +0100 Subject: [PATCH 12/49] added missing include --- src/cunumeric/sort/sort.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ce1987ad..5530de63d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include "cunumeric/cuda_help.h" From 85bc3a73c5ca59adf74a2c939724daadb70bcdd6 Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Tue, 8 Feb 2022 21:48:03 +0100 Subject: [PATCH 13/49] switch to parallel gcc sort --- src/cunumeric/sort/sort_omp.cc | 60 ++-------------------------------- 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1f2d00262..030d26cf0 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,6 +17,7 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include #include namespace cunumeric { @@ -24,51 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -void merge(VAL* inptr, size_t start_idx, size_t end_idx, VAL* tmp) -{ - const size_t mid = (end_idx + start_idx) / 2; - size_t left_idx = start_idx; - size_t right_idx = mid; - size_t target_idx = start_idx; - - while (left_idx < mid && right_idx < end_idx) { - if (inptr[left_idx] <= inptr[right_idx]) { - tmp[target_idx++] = inptr[left_idx++]; - } else { - tmp[target_idx++] = inptr[right_idx++]; - } - } - - while (left_idx < mid) { tmp[target_idx++] = inptr[left_idx++]; } - while (right_idx < end_idx) { tmp[target_idx++] = inptr[right_idx++]; } - - std::copy(tmp + start_idx, tmp + end_idx, inptr + start_idx); -} - -// TODO tune -#define SEQUENTIAL_THRESHOLD 1024 -#define TASK_THRESHOLD 2048 - -template -void merge_sort(VAL* inptr, const size_t start_idx, const size_t end_idx, VAL* tmp) -{ - const size_t size = end_idx - start_idx + 1; - if (size > SEQUENTIAL_THRESHOLD) { - const size_t mid = (end_idx + start_idx) / 2; - -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, start_idx, mid, tmp); -#pragma omp task shared(inptr, tmp) if (size > TASK_THRESHOLD) - merge_sort(inptr, mid, end_idx, tmp); - -#pragma omp taskwait - merge(inptr, start_idx, end_idx, 
tmp); - } else if (size > 1) { - std::stable_sort(inptr + start_idx, inptr + end_idx); - } -} - template struct SortImplBody { using VAL = legate_type_of; @@ -88,24 +44,12 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - bool nested = omp_get_nested(); - if (!nested) omp_set_nested(1); - - // merge sort - auto tmp = std::make_unique(volume); - -#pragma omp parallel shared(inptr, tmp) - { -#pragma omp single - merge_sort(inptr, 0, volume, &(tmp[0])); - } + __gnu_parallel::stable_sort(inptr, inptr + volume); if (is_index_space) { // not implemented yet assert(false); } - - if (!nested) omp_set_nested(0); } }; From 188077bf010de4c8b4b002b2bca79d01540de375 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Feb 2022 09:16:31 -0800 Subject: [PATCH 14/49] Enable N-D non-distributed sort --- cunumeric/array.py | 11 +--- cunumeric/deferred.py | 37 ++++++++++--- cunumeric/module.py | 9 ++-- src/cunumeric/sort/sort.cc | 15 ++++-- src/cunumeric/sort/sort.cu | 7 ++- src/cunumeric/sort/sort.h | 1 + src/cunumeric/sort/sort_omp.cc | 15 +++++- src/cunumeric/sort/sort_template.inl | 31 +++++++++-- tests/sort.py | 77 +++++++++++++++++++++++++++- 9 files changed, 171 insertions(+), 32 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0becb1712..85a9f4664 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1500,16 +1500,9 @@ def sort(self, axis=-1, kind="stable", order=None): if self._thunk.scalar: # nothing to do return - elif self.ndim == 1: - # this is the default -- sorting of 1D array - self._thunk.sort(axis=axis) - return else: - raise NotImplementedError( - "cuNumeric only supports sorting 1D arrays at the moment" - ) - - # no return value + # this is the default -- sorting of N-D array + self._thunk.sort(axis=axis) return def squeeze(self, axis=None): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 871f32c1a..3edf8c75c 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,9 +1519,34 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - # TODO support axis parameter - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + axis_normalized = axis + if axis_normalized < 0: + axis_normalized = self.ndim + axis + + if axis_normalized is not self.ndim - 1: + assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + + # swap axes + swapped = self.swapaxes(axis_normalized, self.ndim - 1) + + # FIXME: ensure *new* distribution does not split last axis (!) 
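
For context, the swap-copy-sort pattern this hunk introduces mirrors plain NumPy semantics. A minimal reference sketch (the helper name is hypothetical, not part of the patch):

    import numpy as np

    def sort_along_axis(a, axis):
        # move the requested axis to the end, sort the now-contiguous last axis,
        # then move the axis back to its original position
        swapped = a.swapaxes(axis, a.ndim - 1)
        return np.sort(swapped, axis=-1).swapaxes(axis, a.ndim - 1)

The deep copy created just below materializes the transposed view, so the sort task always sees its sort dimension contiguous.
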
+ swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + axis_normalized, self.ndim - 1 + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/cunumeric/module.py b/cunumeric/module.py index 55998d448..fce223979 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5849,9 +5849,10 @@ def bincount(a, weights=None, minlength=0): # Sorting + +@add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - return array.argsort(axis=axis, kind=kind, order=order) + return a.argsort(axis=axis, kind=kind, order=order) def lexsort(a, axis=-1): @@ -5862,9 +5863,9 @@ def msort(a): return sort(a) +@add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - array = ndarray.convert_to_cunumeric_ndarray(a) - out = array.copy() + out = a.copy() out_array = ndarray.convert_to_cunumeric_ndarray(out) out_array._thunk.sort(axis=axis, kind=kind, order=order) return out_array diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index af96acdba..4d26ce1ad 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,6 +31,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -39,12 +40,20 @@ struct SortImplBody { std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; + + if (volume <= 30) { + std::cout << "inptr = [ "; + for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } + std::cout << "]" << std::endl; + } #endif - std::stable_sort(inptr, inptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } - // in case of distributed data we need to switch to sample sort - if (is_index_space) { + // in case of distributed data (1D) we need to switch to sample sort + if (is_index_space && DIM == 1) { // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 5530de63d..f43445a8b 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -36,6 +36,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,10 +48,12 @@ struct SortImplBody { #endif thrust::device_ptr dev_ptr(inptr); - thrust::stable_sort(dev_ptr, dev_ptr + volume); + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + } // in case of distributed data we need to switch to sample sort - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 8c3f5a0df..febc1f57c 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,6 +22,7 @@ namespace cunumeric { struct SortArgs { Array& output; + size_t sort_dim_size; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 030d26cf0..6c26f07ae 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,6 +33,7 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,9 +45,19 @@ struct SortImplBody { << domain.get_volume() << std::endl; #endif - __gnu_parallel::stable_sort(inptr, inptr + volume); + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } - if (is_index_space) { + if (is_index_space && DIM == 1) { // not implemented yet assert(false); } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5355bdbbe..b1922f014 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -36,13 +36,33 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - // TODO -- we cannot stop! need to proceed as partition might be filled later - if (volume == 0) { return; } - auto inout = args.output.read_write_accessor(rect); - SortImplBody()( - inout.ptr(rect), pitches, rect, volume, args.is_index_space, args.index_point, args.domain); + /* + * Assumptions: + * 1. 
Sort is always requested for the 'last' dimension within rect + * 2. We have product_of_all_other_dimensions independent sort ranges + * 3. if we have more than one participants: + * a) 1D-case: we need to perform parallel sort (e.g. via sampling) + * b) ND-case: rect needs to be the full domain in that last dimension + */ + +#ifdef DEBUG_CUNUMERIC + std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size + << std::endl; + + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in last (sort) dimension"); +#endif + + SortImplBody()(inout.ptr(rect), + pitches, + rect, + volume, + args.sort_dim_size, + args.is_index_space, + args.index_point, + args.domain); } }; @@ -50,6 +70,7 @@ template static void sort_template(TaskContext& context) { SortArgs args{context.outputs()[0], + context.scalars()[0].value(), context.task_->is_index_space, context.task_->index_point, context.task_->index_domain}; diff --git a/tests/sort.py b/tests/sort.py index b8945d19d..bdc4c4b93 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -18,7 +18,17 @@ import cunumeric as num -def test(): +def test_sort_axis(a_np, a_num, axis): + assert num.allclose(a_np, a_num) + print("Sorting axis " + str(axis) + ":") + sort_np = np.sort(a_np, axis) + sort_num = num.sort(a_num, axis, kind="merge") + # print(sort_np) + # print(sort_num) + assert num.allclose(sort_np, sort_num) + + +def test_1D(): np.random.seed(42) A_np = np.array(np.random.randint(10, size=30), dtype=np.int32) @@ -40,5 +50,70 @@ def test(): return +def test_2D(): + np.random.seed(42) + x_dim = 5 + y_dim = 3 + A_np = np.array( + np.random.randint(10, size=x_dim * y_dim), dtype=np.int32 + ).reshape(x_dim, y_dim) + + A_num = num.array(A_np) + print("Sorting matrix:\n") + print(A_num) + + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + + return + + +def test_3D(): + np.random.seed(42) + x_dim = 5 + y_dim = 3 + z_dim = 7 + A_np = np.array( + np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32 + ).reshape(x_dim, y_dim, z_dim) + + A_num = num.array(A_np) + print("Sorting 3d tensor:\n") + print(A_np) + + test_sort_axis(A_np, A_num, 2) + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + + return + + +def test_custom(): + a = np.arange(2 * 4).reshape(2, 4) + a_transpose = np.transpose(a) + + a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]]) + a_num = num.array(a) + a_num_transposed = a_num.swapaxes(0, 1) + + test_sort_axis(a, a_num, 1) + test_sort_axis(a_transpose, a_transposed_num, 1) + test_sort_axis(a_transpose, a_num_transposed, 1) + test_sort_axis(a_transpose, a_num_transposed, 0) + + return + + +def test(): + print("\n\n ----------- Custom test ---------------\n") + test_custom() + print("\n\n ----------- 2D test ---------------\n") + test_2D() + print("\n\n ----------- 3D test ---------------\n") + test_3D() + print("\n\n ----------- 1D test ---------------\n") + test_1D() + + if __name__ == "__main__": test() From 5cd0956763105a2aaf9cde47006f12f19f3f799c Mon Sep 17 00:00:00 2001 From: mfoerste4 Date: Thu, 10 Feb 2022 21:48:48 +0100 Subject: [PATCH 15/49] merge after rebase to 22.03 --- cunumeric/module.py | 88 +++++++++------------------- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 2 +- src/cunumeric/sort/sort.h | 2 +- src/cunumeric/sort/sort_omp.cc | 2 +- src/cunumeric/sort/sort_template.inl | 2 +- 6 files changed, 33 insertions(+), 65 deletions(-) diff --git 
a/cunumeric/module.py b/cunumeric/module.py index 54d03137b..ffb5eaff5 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5571,6 +5571,34 @@ def sign(a, out=None, where=True, dtype=None, **kwargs): # Sorting, searching, and counting ################################## +# Sorting + + +@add_boilerplate("a") +def argsort(a, axis=-1, kind="stable", order=None): + return a.argsort(axis=axis, kind=kind, order=order) + + +def lexsort(a, axis=-1): + raise NotImplementedError("Not yet implemented") + + +def msort(a): + return sort(a) + + +@add_boilerplate("a") +def sort(a, axis=-1, kind="stable", order=None): + out = a.copy() + out_array = ndarray.convert_to_cunumeric_ndarray(out) + out_array._thunk.sort(axis=axis, kind=kind, order=order) + return out_array + + +def sort_complex(a): + return sort(a) + + # Searching @@ -5846,63 +5874,3 @@ def bincount(a, weights=None, minlength=0): ) out._thunk.bincount(a._thunk, weights=weights._thunk) return out - -# Sorting - - -# Sorting - - -@add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) - - -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") - - -def msort(a): - return sort(a) - - -@add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array - - -def sort_complex(a): - return sort(a) - - -# Counting - - -@add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) - - -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") - - -def msort(a): - return sort(a) - - -@add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array - - -def sort_complex(a): - return sort(a) - - diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 4d26ce1ad..1f4407b7f 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f43445a8b..ff70e26d9 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index febc1f57c..6afba9230 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
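
With the duplicated definitions in cunumeric/module.py collapsed earlier in this patch, the module-level sorting API reads as a single block again. A minimal usage sketch of the consolidated entry points (values are illustrative only):

    import numpy as np
    import cunumeric as num

    a_np = np.array([[3, 1, 2], [9, 7, 8]], dtype=np.int32)
    a = num.array(a_np)

    assert num.allclose(num.sort(a), np.sort(a_np))                  # last axis (default)
    assert num.allclose(num.sort(a, axis=0), np.sort(a_np, axis=0))  # across rows
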
diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 6c26f07ae..c7265f447 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index b1922f014..2b4886471 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -1,4 +1,4 @@ -/* Copyright 2021 NVIDIA Corporation +/* Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 9063d77b04d530d59c6f102454ad18c984f0a7f2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Feb 2022 02:40:33 -0800 Subject: [PATCH 16/49] added cupy-style sort kernel, support axis=None, improved benchmark --- cunumeric/array.py | 2 +- cunumeric/deferred.py | 70 +++++++++++++++--------- examples/sort.py | 70 ++++++++++++++++++++---- src/cunumeric/sort/sort.cc | 8 ++- src/cunumeric/sort/sort.cu | 60 +++++++++++++++++++-- src/cunumeric/sort/sort.h | 3 +- src/cunumeric/sort/sort_omp.cc | 5 +- src/cunumeric/sort/sort_template.inl | 81 +++++++++++++++++++++++----- tests/sort.py | 24 +++++---- 9 files changed, 255 insertions(+), 68 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 85a9f4664..f128b96e8 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1494,7 +1494,7 @@ def sort(self, axis=-1, kind="stable", order=None): "cuNumeric does not support sorting with 'order' as " "ndarray only supports numeric values" ) - if axis >= self.ndim or axis < -self.ndim: + if axis is not None and (axis >= self.ndim or axis < -self.ndim): raise ValueError("invalid axis") if self._thunk.scalar: diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3edf8c75c..aaa8f6c26 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,34 +1519,52 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) def sort(self, axis=-1, kind="stable", order=None): - axis_normalized = axis - if axis_normalized < 0: - axis_normalized = self.ndim + axis + if axis is None and self.ndim > 1: + flattened = self.reshape((self.size,), order="C") + flattened_copy = self.runtime.create_empty_thunk( + flattened.shape, dtype=self.dtype, inputs=[self, flattened] + ) + flattened_copy.copy(flattened, deep=True) - if axis_normalized is not self.ndim - 1: - assert axis_normalized < self.ndim - 1 and axis_normalized >= 0 + # run sort on last axis -- return 1D solution + flattened_copy.sort() + self.base = flattened_copy.base + self.numpy_array = None + else: + if axis is None: + sort_axis = 0 + elif axis < 0: + sort_axis = self.ndim + axis + else: + sort_axis = axis - # swap axes - swapped = self.swapaxes(axis_normalized, self.ndim - 1) + if sort_axis is not self.ndim - 1: + assert sort_axis < self.ndim - 1 and sort_axis >= 0 - # FIXME: ensure *new* distribution does not split last axis (!) 
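
A note on the FIXME above, which this patch retires: the guarantee it asks for is now provided at task-launch time further down, where the last dimension of the store is broadcast. Conceptually (pseudo-Python, assuming legate partitioning semantics):

    # task.add_broadcast(store, axis) => every point task sees the full extent of `axis`,
    # so a last-axis sort never has to communicate across partitions
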
- swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=self.dtype, inputs=[self, swapped] - ) - swapped_copy.copy(swapped, deep=True) + # swap axes + swapped = self.swapaxes(sort_axis, self.ndim - 1) - # run sort on last axis - swapped_copy.sort(self.ndim - 1) + swapped_copy = self.runtime.create_empty_thunk( + swapped.shape, dtype=self.dtype, inputs=[self, swapped] + ) + swapped_copy.copy(swapped, deep=True) - self.base = swapped_copy.swapaxes( - axis_normalized, self.ndim - 1 - ).base - self.numpy_array = None - else: - # run actual sort task - self.runtime.legate_runtime.issue_execution_fence(block=True) - task = self.context.create_task(CuNumericOpCode.SORT) - task.add_output(self.base) - task.add_scalar_arg(self.base.shape[self.ndim - 1], ty.uint64) - task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + # run sort on last axis + swapped_copy.sort(self.ndim - 1) + + self.base = swapped_copy.swapaxes( + self.ndim - 1, sort_axis + ).base + self.numpy_array = None + else: + # run actual sort task + self.runtime.legate_runtime.issue_execution_fence(block=True) + task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) + task.add_input(self.base) + if self.ndim > 1: + task.add_broadcast(self.base, self.ndim - 1) + task.add_scalar_arg(self.ndim - 1, ty.int32) + task.add_scalar_arg(self.base.shape, (ty.int32,)) + task.execute() + self.runtime.legate_runtime.issue_execution_fence(block=True) diff --git a/examples/sort.py b/examples/sort.py index 9142c8a12..21b503708 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -24,8 +24,8 @@ import cunumeric -def check_sorted(a, a_numpy): - a_sorted = numpy.sort(a_numpy) +def check_sorted(a, a_numpy, axis=-1): + a_sorted = numpy.sort(a_numpy, axis) print("Checking result...") if cunumeric.allclose(a_sorted, a): print("PASS!") @@ -35,20 +35,40 @@ def check_sorted(a, a_numpy): print("CUNUMERIC: " + str(a)) -def run_sort(N, perform_check, timing): +def run_sort(N, shape, axis, datatype, perform_check, timing): numpy.random.seed(42) - a_numpy = numpy.array( - numpy.random.randint(1000, size=N), dtype=numpy.int32 - ) + newtype = numpy.dtype(datatype).type + + if numpy.issubdtype(newtype, numpy.integer): + a_numpy = numpy.array( + numpy.random.randint( + numpy.iinfo(newtype).min, numpy.iinfo(newtype).max, size=N + ), + dtype=newtype, + ) + elif numpy.issubdtype(newtype, numpy.floating): + a_numpy = numpy.array(numpy.random.random(size=N), dtype=newtype) + elif numpy.issubdtype(newtype, numpy.complexfloating): + a_numpy = numpy.array( + numpy.random.random(size=N) + numpy.random.random(size=N) * 1j, + dtype=newtype, + ) + else: + print("UNKNOWN type " + str(newtype)) + assert False + + if shape is not None: + a_numpy = a_numpy.reshape(tuple(shape)) + a = cunumeric.array(a_numpy) start = datetime.datetime.now() - a_sorted = cunumeric.sort(a) + a_sorted = cunumeric.sort(a, axis) stop = datetime.datetime.now() if perform_check: - check_sorted(a_sorted, a_numpy) + check_sorted(a_sorted, a_numpy, axis) else: # do we need to synchronize? 
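
One hedged answer to the synchronization question raised here: cuNumeric issues tasks asynchronously, so the stop timestamp may be taken before the sort has actually finished. Lacking a dedicated sync API, a common workaround is to force a blocking scalar read on the result inside the timed region, e.g.:

    _ = float(a_sorted[0])  # blocks until the deferred sort has completed (illustrative only)
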
assert True @@ -83,6 +103,31 @@ def run_sort(N, perform_check, timing): action="store_true", help="perform timing", ) + parser.add_argument( + "-s", + "--shape", + type=int, + nargs="+", + default=None, + dest="shape", + help="array reshape (default 'None')", + ) + parser.add_argument( + "-d", + "--datatype", + type=str, + default="uint32", + dest="datatype", + help="data type (default numpy.int32)", + ) + parser.add_argument( + "-a", + "--axis", + type=int, + default=-1, + dest="axis", + help="sort axis (default -1)", + ) parser.add_argument( "-b", "--benchmark", @@ -98,5 +143,12 @@ def run_sort(N, perform_check, timing): run_sort, args.benchmark, "Sort", - (args.N, args.check, args.timing), + ( + args.N, + args.shape, + args.axis, + args.datatype, + args.check, + args.timing, + ), ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 1f4407b7f..33bd9e83b 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -31,7 +31,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,13 +48,16 @@ struct SortImplBody { std::cout << "]" << std::endl; } #endif - + const size_t sort_dim_size = global_shape[sort_axis]; for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); } // in case of distributed data (1D) we need to switch to sample sort if (is_index_space && DIM == 1) { + // not implemented yet + assert(false); + // create (starting) sample of (at most) domain.get_volume() equidistant values // also enrich values with additional indexes rank & local position in order to handle // duplicate values diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index ff70e26d9..f48f9a079 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "cunumeric/cuda_help.h" @@ -36,7 +38,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -47,9 +50,60 @@ struct SortImplBody { << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; #endif + const size_t sort_dim_size = global_shape[sort_axis]; thrust::device_ptr dev_ptr(inptr); - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(dev_ptr + start_idx, dev_ptr + start_idx + sort_dim_size); + + // same approach as cupy implemntation --> combine multiple individual sorts into single + // kernel with data tuples - (id_sub-sort, actual_data) + if (DIM == 1) { + thrust::stable_sort(dev_ptr, dev_ptr + volume); + } else { + // in this case we know we are sorting for the *last* index + const uint64_t max_elements_per_kernel = + 1 << 22; // TODO check amount of available GPU memory from config + const uint64_t number_sorts_per_kernel = + std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); + const uint64_t number_sorts = volume / sort_dim_size; + + // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; + + if (number_sorts_per_kernel >= + 32) // key-tuple sort has quite some overhead -- 
only utilize if beneficial + { + // allocate memory for keys (iterating +=1 for each individual sort dimension) + // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! + // TODO!!!! + auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; + sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_ptr + offset)); + thrust::stable_sort( + combined, combined + num_elements, thrust::less>()); + } + } else { + // number_sorts_per_kernel == 1 ----> we don't need keys + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort(dev_ptr + offset, dev_ptr + offset + sort_dim_size); + } + } } // in case of distributed data we need to switch to sample sort diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 6afba9230..cc90ff21d 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -22,7 +22,8 @@ namespace cunumeric { struct SortArgs { Array& output; - size_t sort_dim_size; + uint32_t sort_axis; + Legion::DomainPoint global_shape; bool is_index_space; Legion::DomainPoint index_point; Legion::Domain domain; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index c7265f447..659e5c138 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,7 +33,8 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, - const size_t sort_dim_size, + const uint32_t sort_axis, + Legion::DomainPoint global_shape, bool is_index_space, Legion::DomainPoint index_point, Legion::Domain domain) @@ -44,7 +45,7 @@ struct SortImplBody { << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; #endif - + const size_t sort_dim_size = global_shape[sort_axis]; if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune { #pragma omp do schedule(dynamic) diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 2b4886471..81c020912 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -33,6 +33,10 @@ struct SortImpl { auto rect = args.output.shape(); + // we shall not return on empty rectangle in case of distributed data + // as the process might still participate in the parallel sort + if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; + Pitches pitches; size_t volume = pitches.flatten(rect); @@ -45,32 +49,83 @@ struct SortImpl { * 3. if we have more than one participants: * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) * b) ND-case: rect needs to be the full domain in that last dimension + * + * FIXME: understand legion-dim != ndarray-dim case + * + * */ #ifdef DEBUG_CUNUMERIC - std::cout << "DIM=" << DIM << ", rect=" << rect << ", sort_dim_size=" << args.sort_dim_size - << std::endl; + std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape + << ", axis=" << args.sort_axis + << ", sort_dim_size=" << args.global_shape[args.sort_axis] << std::endl; + + assert((DIM == 1 || (rect.hi[args.sort_axis] - rect.lo[args.sort_axis] + 1 == + args.global_shape[args.sort_axis])) && + "multi-dimensional array should not be distributed in (sort) dimension"); +#endif - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && - "multi-dimensional array should not be distributed in last (sort) dimension"); +#ifndef LEGION_BOUNDS_CHECKS + bool dense = inout.accessor.is_dense_row_major(rect); +#else + bool dense = false; #endif - SortImplBody()(inout.ptr(rect), - pitches, - rect, - volume, - args.sort_dim_size, - args.is_index_space, - args.index_point, - args.domain); + if (dense) { + SortImplBody()(inout.ptr(rect), + pitches, + rect, + volume, + args.sort_axis, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); + } else { + // NOTE: we might want to place this loop logic in the different KIND-implementations in + // norder to re-use buffers + + assert(!args.is_index_space || DIM > 1); + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < args.global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + // RUN based on current start point + SortImplBody()(&(inout[start_point]), + pitches, + rect, + contiguous_elements, + args.sort_axis, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); + elements_processed += contiguous_elements; + } + } } }; template static void sort_template(TaskContext& context) { + DomainPoint global_shape; + { + auto shape_span = context.scalars()[1].values(); + global_shape.dim = shape_span.size(); + for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } + } + SortArgs args{context.outputs()[0], - context.scalars()[0].value(), + context.scalars()[0].value(), + global_shape, context.task_->is_index_space, context.task_->index_point, context.task_->index_domain}; diff --git a/tests/sort.py b/tests/sort.py index bdc4c4b93..85f17ed3b 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -22,10 +22,11 @@ def test_sort_axis(a_np, a_num, axis): assert num.allclose(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis) - sort_num = num.sort(a_num, axis, kind="merge") - # print(sort_np) - # print(sort_num) - assert num.allclose(sort_np, sort_num) + sort_num = num.sort(a_num, axis) + if not num.allclose(sort_np, sort_num): + print(sort_np) + print(sort_num) + assert False def test_1D(): @@ -64,6 +65,7 @@ def test_2D(): test_sort_axis(A_np, A_num, 1) test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) return @@ -84,22 +86,22 @@ def test_3D(): test_sort_axis(A_np, A_num, 2) test_sort_axis(A_np, A_num, 1) test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) return def test_custom(): - a = np.arange(2 * 4).reshape(2, 4) - 
a_transpose = np.transpose(a) + a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( + 4, 4, 5, 2, 3, 2, 2, 2, 4 + ) - a_transposed_num = num.array([[0, 4], [1, 5], [2, 6], [3, 7]]) a_num = num.array(a) - a_num_transposed = a_num.swapaxes(0, 1) test_sort_axis(a, a_num, 1) - test_sort_axis(a_transpose, a_transposed_num, 1) - test_sort_axis(a_transpose, a_num_transposed, 1) - test_sort_axis(a_transpose, a_num_transposed, 0) + test_sort_axis(a, a_num, 2) + test_sort_axis(a, a_num, 7) + test_sort_axis(a, a_num, 4) return From 5e982c2e53e9b09a5fd430d8e71b4ef70174cec9 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 17 Feb 2022 12:07:08 -0800 Subject: [PATCH 17/49] refactoring and documentation --- cunumeric/array.py | 25 +---- cunumeric/deferred.py | 78 +++++++++++----- cunumeric/eager.py | 8 +- cunumeric/module.py | 126 +++++++++++++++++++++++-- src/cunumeric/mapper.cc | 8 ++ src/cunumeric/sort/sort.cc | 135 +++++++-------------------- src/cunumeric/sort/sort.cu | 72 ++++++++------ src/cunumeric/sort/sort.h | 4 +- src/cunumeric/sort/sort_omp.cc | 63 ++++++++----- src/cunumeric/sort/sort_template.inl | 79 ++++++---------- tests/sort.py | 76 +++++++++++++-- 11 files changed, 405 insertions(+), 269 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index f128b96e8..6c5a598f4 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1481,29 +1481,8 @@ def setflags(self, write=None, align=None, uic=None): ) def sort(self, axis=-1, kind="stable", order=None): - if kind != "stable": - runtime.warn( - "cuNumeric uses a different (stable) algorithm than " - + str(kind) - + " for sorting", - category=RuntimeWarning, - stacklevel=2, - ) - if order is not None: - raise NotImplementedError( - "cuNumeric does not support sorting with 'order' as " - "ndarray only supports numeric values" - ) - if axis is not None and (axis >= self.ndim or axis < -self.ndim): - raise ValueError("invalid axis") - - if self._thunk.scalar: - # nothing to do - return - else: - # this is the default -- sorting of N-D array - self._thunk.sort(axis=axis) - return + self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) + return def squeeze(self, axis=None): if axis is not None: diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index aaa8f6c26..311753138 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1518,53 +1518,89 @@ def cholesky(self, src, no_tril=False): if not no_tril: self.trilu(self, 0, True) - def sort(self, axis=-1, kind="stable", order=None): - if axis is None and self.ndim > 1: - flattened = self.reshape((self.size,), order="C") + @auto_convert([1]) + def sort(self, rhs, axis=-1, kind="stable", order=None): + + if kind != "stable": + self.runtime.warn( + "cuNumeric uses a different (stable) algorithm than " + + str(kind) + + " for sorting", + category=RuntimeWarning, + stacklevel=2, + ) + if order is not None: + raise NotImplementedError( + "cuNumeric does not support sorting with 'order' as " + "ndarray only supports numeric values" + ) + if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): + raise ValueError("invalid axis") + + if axis is None and rhs.ndim > 1: + flattened = rhs.reshape((rhs.size,), order="C") flattened_copy = self.runtime.create_empty_thunk( - flattened.shape, dtype=self.dtype, inputs=[self, flattened] + flattened.shape, dtype=rhs.dtype, inputs=[rhs, flattened] ) flattened_copy.copy(flattened, deep=True) - # run sort on last axis -- return 1D solution - flattened_copy.sort() + # run sort flattened -- return 1D solution 
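
The axis=None branch below follows NumPy's flatten-then-sort semantics, which can be stated as a small reference check:

    import numpy as np

    a = np.array([[9, 1], [4, 3]])
    # np.sort(a, axis=None) returns a sorted, flattened copy
    assert np.array_equal(np.sort(a, axis=None), np.sort(a.reshape(-1)))
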
+ flattened_copy.sort(flattened_copy) self.base = flattened_copy.base self.numpy_array = None + else: if axis is None: sort_axis = 0 elif axis < 0: - sort_axis = self.ndim + axis + sort_axis = rhs.ndim + axis else: sort_axis = axis - if sort_axis is not self.ndim - 1: - assert sort_axis < self.ndim - 1 and sort_axis >= 0 + if sort_axis is not rhs.ndim - 1: + assert sort_axis < rhs.ndim - 1 and sort_axis >= 0 # swap axes - swapped = self.swapaxes(sort_axis, self.ndim - 1) + swapped = rhs.swapaxes(sort_axis, rhs.ndim - 1) swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=self.dtype, inputs=[self, swapped] + swapped.shape, dtype=rhs.dtype, inputs=[rhs, swapped] ) swapped_copy.copy(swapped, deep=True) # run sort on last axis - swapped_copy.sort(self.ndim - 1) + swapped_copy.sort(swapped_copy) - self.base = swapped_copy.swapaxes( - self.ndim - 1, sort_axis - ).base + self.base = swapped_copy.swapaxes(rhs.ndim - 1, sort_axis).base self.numpy_array = None + else: # run actual sort task - self.runtime.legate_runtime.issue_execution_fence(block=True) + needs_communication = self.runtime.num_gpus > 1 or ( + self.runtime.num_gpus == 0 and self.runtime.num_procs > 1 + ) + + if needs_communication: + self.runtime.legate_runtime.issue_execution_fence( + block=True + ) + task = self.context.create_task(CuNumericOpCode.SORT) task.add_output(self.base) - task.add_input(self.base) + task.add_input(rhs.base) + task.add_alignment(self.base, rhs.base) if self.ndim > 1: - task.add_broadcast(self.base, self.ndim - 1) - task.add_scalar_arg(self.ndim - 1, ty.int32) - task.add_scalar_arg(self.base.shape, (ty.int32,)) + task.add_broadcast(rhs.base, rhs.ndim - 1) + elif needs_communication: + # print("Distributed 1D sort --> broadcast") + task.add_broadcast(rhs.base) + + task.add_scalar_arg(False, bool) # descending flag + task.add_scalar_arg(False, bool) # return indices flag + task.add_scalar_arg(rhs.base.shape, (ty.int32,)) task.execute() - self.runtime.legate_runtime.issue_execution_fence(block=True) + + if needs_communication: + self.runtime.legate_runtime.issue_execution_fence( + block=True + ) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 0ebdd8959..520287a6f 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,12 +502,12 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, axis=-1, kind="stable", order=None): - self.check_eager_args(axis, kind, order) + def sort(self, rhs, axis=-1, kind="stable", order=None): + self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: - self.deferred.sort(axis, kind, order) + self.deferred.sort(rhs, axis, kind, order) else: - self.array.sort(axis, kind, order) + self.array = np.sort(rhs.array, axis, kind, order) def random_uniform(self): if self.deferred is not None: diff --git a/cunumeric/module.py b/cunumeric/module.py index ffb5eaff5..17728b9b0 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5576,27 +5576,135 @@ def sign(a, out=None, where=True, dtype=None, **kwargs): @add_boilerplate("a") def argsort(a, axis=-1, kind="stable", order=None): - return a.argsort(axis=axis, kind=kind, order=order) + """ + Returns the indices that would sort an array. -def lexsort(a, axis=-1): - raise NotImplementedError("Not yet implemented") + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + Axis to sort. By default, the index -1 (the last axis) is used. If + None, the flattened array is used. 
+ kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Currently only 'stable' sort is supported + order : str or list of str, optional + Currently not supported + + Returns + ------- + index_array : ndarray of ints + Array of indices that sort a along the specified axis. It has the + same shape as `a.shape` or is flattened in case of `axis` is None. + + See Also + -------- + numpy.argsort + + Availability + -------- + GPU, CPU + """ + return a.argsort(axis=axis, kind=kind, order=order) def msort(a): - return sort(a) + """ + + Returns a sorted copy of an array sorted along the first axis. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + out : ndarray + Sorted array with same dtype and shape as `a`. + + See Also + -------- + numpy.msort + + Availability + -------- + GPU, CPU + """ + return sort(a, axis=0) @add_boilerplate("a") def sort(a, axis=-1, kind="stable", order=None): - out = a.copy() - out_array = ndarray.convert_to_cunumeric_ndarray(out) - out_array._thunk.sort(axis=axis, kind=kind, order=order) - return out_array + """ + Returns a sorted copy of an array. + Parameters + ---------- + a : array_like + Input array. + axis : int or None, optional + Axis to sort. By default, the index -1 (the last axis) is used. If + None, the flattened array is used. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Currently only 'stable' sort is supported + order : str or list of str, optional + Currently not supported + + Returns + ------- + out : ndarray + Sorted array with same dtype and shape as `a`. In case `axis` is + None the result is flattened. + + See Also + -------- + numpy.sort + + Availability + -------- + GPU, CPU + """ + result = ndarray(a.shape, a.dtype) + result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) + return result + + +@add_boilerplate("a") def sort_complex(a): - return sort(a) + """ + + Returns a sorted copy of an array sorted along the last axis. Sorts the + real part first, the imaginary part second. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + out : ndarray, complex + Sorted array with same shape as `a`. 
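
A small behavioral sketch of the promotion documented above (the concrete complex dtype is an implementation detail of this version):

    import cunumeric as num

    out = num.sort_complex(num.array([3, 1, 2]))
    # real input comes back complex-valued; the values sort to [1, 2, 3]
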
+ + See Also + -------- + numpy.sort_complex + + Availability + -------- + GPU, CPU + """ + + # force complex result + if np.issubdtype(a.dtype, np.complexfloating): + out = a + else: + out = a.astype(np.complex64, copy=True) + + return sort(out) # Searching diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 0be299982..3452bc78d 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -115,6 +115,14 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } + case CUNUMERIC_SORT: { + std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } default: { return {}; } diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 33bd9e83b..44a6ba3e6 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -27,118 +27,51 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, + void std_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + { + std::copy(inptr, inptr + volume, outptr); + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + } + } + + void operator()(AccessorRO input, + AccessorWO output, const Pitches& pitches, const Rect& rect, + const bool dense, const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC std::cout << "CPU(" << index_point[0] << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; - - if (volume <= 30) { - std::cout << "inptr = [ "; - for (size_t i = 0; i < volume; ++i) { std::cout << (i > 0 ? 
", " : " ") << inptr[i]; } - std::cout << "]" << std::endl; - } + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; #endif - const size_t sort_dim_size = global_shape[sort_axis]; - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); - } - - // in case of distributed data (1D) we need to switch to sample sort - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); - - // create (starting) sample of (at most) domain.get_volume() equidistant values - // also enrich values with additional indexes rank & local position in order to handle - // duplicate values - size_t num_local_samples = std::min(domain.get_volume(), volume); - size_t local_rank = index_point[0]; - auto local_samples = std::make_unique[]>(num_local_samples); - for (int i = 0; i < num_local_samples; ++i) { - const size_t index = (i + 1) * volume / num_local_samples - 1; - local_samples[i].value = inptr[index]; - local_samples[i].rank = local_rank; - local_samples[i].local_id = index; + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } } - // std::cout << "local samples: size = " << num_local_samples << std::endl; - // std::cout << "first = (" << local_samples[0].value << "," << local_samples[0].rank << ","<< - // local_samples[0].local_id << ")" << std::endl; std::cout << "last = (" << - // local_samples[num_local_samples-1].value << "," << local_samples[num_local_samples-1].rank - // << ","<< local_samples[num_local_samples-1].local_id << ")" << std::endl; - - // all2all those samples - // TODO broadcast package size - // TODO allocate targets - // TODO broadcast samples - size_t num_global_samples = 15; - std::unique_ptr[]> global_samples(new SampleEntry[num_global_samples]); - - // sort all samples (utilize 2nd and 3rd sort criteria as well) - std::stable_sort(&(global_samples[0]), - &(global_samples[0]) + num_global_samples, - SampleEntryComparator()); - - // define splitters - auto splitters = std::make_unique[]>(domain.get_volume() - 1); - for (int i = 0; i < domain.get_volume() - 1; ++i) { - const size_t index = (i + 1) * num_global_samples / domain.get_volume() - 1; - splitters[i] = global_samples[index]; + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_sort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; } - - do { - // compute local package sizes for every process based on splitters - std::unique_ptr local_partition_size(new size_t[domain.get_volume()]); - { - size_t range_start = 0; - size_t local_position = 0; - for (int p_index = 0; p_index < domain.get_volume(); ++p_index) { - // move as long current value is lesser or equaöl to current splitter - while (local_position < volume && - (inptr[local_position] < splitters[p_index].value || - (inptr[local_position] == splitters[p_index].value && - (local_rank < splitters[p_index].rank || - (local_rank == 
splitters[p_index].rank && - local_position <= splitters[p_index].local_id))))) { - local_position++; - } - - local_partition_size[p_index++] = local_position - range_start; - range_start = local_position; - } - } - - // communicate local package-sizes all2all - // TODO - - // evaluate distribution result?? - // TODO - - // if (good enough) break; - // TODO - break; - // else iterate/improve splitters - // TODO - - } while (true); - - // all2all accepted distribution - // package sizes should already be known - // all2all communication - // TODO - - // final merge sort of received packages - // TODO } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index f48f9a079..fd864ca78 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -34,29 +34,15 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) { -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << index_point[0] << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() << std::endl; -#endif - - const size_t sort_dim_size = global_shape[sort_axis]; - thrust::device_ptr dev_ptr(inptr); - + thrust::device_ptr dev_input_ptr(inptr); + thrust::device_ptr dev_output_ptr(outptr); + thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); // same approach as cupy implemntation --> combine multiple individual sorts into single // kernel with data tuples - (id_sub-sort, actual_data) if (DIM == 1) { - thrust::stable_sort(dev_ptr, dev_ptr + volume); + thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); } else { // in this case we know we are sorting for the *last* index const uint64_t max_elements_per_kernel = @@ -93,23 +79,57 @@ struct SortImplBody { // sort auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_ptr + offset)); + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); thrust::stable_sort( combined, combined + num_elements, thrust::less>()); } } else { - // number_sorts_per_kernel == 1 ----> we don't need keys + // number_sorts_per_kernel too small ----> we sort one after another for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_ptr + offset, dev_ptr + offset + sort_dim_size); + thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); } } } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << index_point[0] << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + thrust_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } - // in case of distributed data we need to switch to sample sort - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + thrust_sort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } } } }; diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index cc90ff21d..837027086 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -21,8 +21,10 @@ namespace cunumeric { struct SortArgs { + const Array& input; Array& output; - uint32_t sort_axis; + bool descending; + bool argsort; Legion::DomainPoint global_shape; bool is_index_space; Legion::DomainPoint index_point; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 659e5c138..c5b3ccd54 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -29,38 +29,59 @@ template struct SortImplBody { using VAL = legate_type_of; - void operator()(VAL* inptr, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const uint32_t sort_axis, - Legion::DomainPoint global_shape, - bool is_index_space, - Legion::DomainPoint index_point, - Legion::Domain domain) + void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) { -#ifdef DEBUG_CUNUMERIC - std::cout << "OMP(" << index_point[0] << ":" << omp_get_max_threads() << ":" << omp_get_nested() - << "): local size = " << volume << ", dist. = " << is_index_space - << ", index_point = " << index_point << ", domain/volume = " << domain << "/" - << domain.get_volume() << std::endl; -#endif - const size_t sort_dim_size = global_shape[sort_axis]; + std::copy(inptr, inptr + volume, outptr); if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune { #pragma omp do schedule(dynamic) for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); } } else { for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - __gnu_parallel::stable_sort(inptr + start_idx, inptr + start_idx + sort_dim_size); + __gnu_parallel::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); } } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << index_point[0] << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_sort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } - if (is_index_space && DIM == 1) { - // not implemented yet - assert(false); + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_sort_omp( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 81c020912..5488330a5 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -31,7 +31,7 @@ struct SortImpl { { using VAL = legate_type_of; - auto rect = args.output.shape(); + auto rect = args.input.shape(); // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort @@ -40,76 +40,47 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - auto inout = args.output.read_write_accessor(rect); + auto input = args.input.read_accessor(rect); + auto output = args.output.write_accessor(rect); /* * Assumptions: * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) + * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) -- not implemented yet * b) ND-case: rect needs to be the full domain in that last dimension * - * FIXME: understand legion-dim != ndarray-dim case - * - * */ #ifdef DEBUG_CUNUMERIC std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", axis=" << args.sort_axis - << ", sort_dim_size=" << args.global_shape[args.sort_axis] << std::endl; + << ", descending=" << args.descending << ", argsort=" << args.argsort + << ", sort_dim_size=" << args.global_shape[DIM - 1] << std::endl; - assert((DIM == 1 || (rect.hi[args.sort_axis] - rect.lo[args.sort_axis] + 1 == - args.global_shape[args.sort_axis])) && + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && "multi-dimensional array should not be distributed in (sort) dimension"); #endif #ifndef LEGION_BOUNDS_CHECKS - bool dense = inout.accessor.is_dense_row_major(rect); + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); #else bool dense = false; #endif - if (dense) { - SortImplBody()(inout.ptr(rect), - pitches, - rect, - volume, - args.sort_axis, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); - } else { - // NOTE: we might want to place this loop logic in the different KIND-implementations in - // norder to re-use buffers - - assert(!args.is_index_space || DIM > 1); - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < args.global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - // RUN based on current start point - SortImplBody()(&(inout[start_point]), - pitches, - rect, - contiguous_elements, - args.sort_axis, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); - elements_processed += contiguous_elements; - } - } + assert(dense || !args.is_index_space || DIM > 1); + + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.global_shape, + args.is_index_space, + args.index_point, + args.domain); } }; @@ -118,13 +89,15 @@ static void sort_template(TaskContext& context) { DomainPoint global_shape; { - auto shape_span = context.scalars()[1].values(); + auto shape_span = context.scalars()[2].values(); global_shape.dim = shape_span.size(); for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } } - SortArgs args{context.outputs()[0], - context.scalars()[0].value(), + SortArgs args{context.inputs()[0], + context.outputs()[0], + context.scalars()[0].value(), + context.scalars()[1].value(), global_shape, context.task_->is_index_space, context.task_->index_point, diff --git a/tests/sort.py b/tests/sort.py index 85f17ed3b..ef78d0447 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -70,11 +70,8 @@ def test_2D(): return -def test_3D(): +def test_3D(x_dim, y_dim, z_dim): np.random.seed(42) - x_dim = 5 - y_dim = 3 - z_dim = 7 A_np = np.array( np.random.randint(10, size=x_dim * y_dim * z_dim), dtype=np.int32 ).reshape(x_dim, y_dim, z_dim) @@ -91,7 +88,33 @@ def test_3D(): return +def test_3D_complex(x_dim, y_dim, z_dim): + np.random.seed(42) + A_np = np.array( + np.random.random(size=x_dim * y_dim * z_dim), dtype=np.complex64 + ).reshape(x_dim, y_dim, z_dim) + + A_num = num.array(A_np) + print("Sorting 3d tensor:\n") + print(A_np) + + 
test_sort_axis(A_np, A_num, 2) + test_sort_axis(A_np, A_num, 1) + test_sort_axis(A_np, A_num, 0) + test_sort_axis(A_np, A_num, axis=None) + + return + + def test_custom(): + # 4D still works, >=5D always falls back to numpy + a = np.arange(4 * 2 * 2 * 4).reshape(4, 2, 2, 4) + a_num = num.array(a) + + test_sort_axis(a, a_num, 1) + test_sort_axis(a, a_num, 2) + test_sort_axis(a, a_num, a.ndim - 1) + a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( 4, 4, 5, 2, 3, 2, 2, 2, 4 ) @@ -106,15 +129,48 @@ def test_custom(): return +def test_other_api(): + a = np.arange(4 * 2 * 3).reshape(4, 2, 3) + a_num = num.array(a) + + # msort + assert num.allclose(np.msort(a), num.msort(a_num)) + + # sort_complex + assert num.allclose(np.sort_complex(a), num.sort_complex(a_num)) + + # reverse order sort + # TODO + + # in-place sort + copy_a = a.copy() + copy_a_num = a_num.copy() + copy_a.sort() + copy_a_num.sort() + assert num.allclose(copy_a, copy_a_num) + + # reverse order sort (in place) + # TODO + + # argsort + # TODO + + return + + def test(): - print("\n\n ----------- Custom test ---------------\n") - test_custom() - print("\n\n ----------- 2D test ---------------\n") - test_2D() - print("\n\n ----------- 3D test ---------------\n") - test_3D() print("\n\n ----------- 1D test ---------------\n") test_1D() + print("\n\n ----------- 2D test ---------------\n") + test_2D() + print("\n\n ----------- 3D test (int32) -------\n") + test_3D(51, 23, 17) + print("\n\n ----------- 3D test (complex) -----\n") + test_3D_complex(27, 30, 45) + print("\n\n ----------- 4D/5D test-------------\n") + test_custom() + print("\n\n ----------- API test --------------\n") + test_other_api() if __name__ == "__main__": From c9e4407901b6e81e86fa6a2f5770dfc964f040de Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 14:51:26 -0800 Subject: [PATCH 18/49] added argsort support and test coverage --- cunumeric/array.py | 6 + cunumeric/deferred.py | 20 +- cunumeric/eager.py | 9 +- cunumeric/module.py | 7 +- src/cunumeric/sort/sort.cc | 72 ++++- src/cunumeric/sort/sort.cu | 404 +++++++++++++++++++++++---- src/cunumeric/sort/sort.h | 5 +- src/cunumeric/sort/sort_omp.cc | 97 ++++++- src/cunumeric/sort/sort_template.inl | 88 ++++-- tests/sort.py | 106 ++++++- 10 files changed, 696 insertions(+), 118 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 6c5a598f4..924bd9edd 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1484,6 +1484,12 @@ def sort(self, axis=-1, kind="stable", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) return + def argsort(self, axis=-1, kind="stable", order=None): + self._thunk.sort( + rhs=self._thunk, argsort=True, axis=axis, kind=kind, order=order + ) + return + def squeeze(self, axis=None): if axis is not None: if isinstance(axis, int): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 311753138..5d43f8bfa 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1519,7 +1519,7 @@ def cholesky(self, src, no_tril=False): self.trilu(self, 0, True) @auto_convert([1]) - def sort(self, rhs, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if kind != "stable": self.runtime.warn( @@ -1545,8 +1545,11 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): flattened_copy.copy(flattened, deep=True) # run sort flattened -- return 1D solution - flattened_copy.sort(flattened_copy) - self.base = flattened_copy.base + sort_result = 
self.runtime.create_empty_thunk( + flattened_copy.shape, dtype=self.dtype, inputs=[flattened_copy] + ) + sort_result.sort(rhs=flattened_copy, argsort=argsort) + self.base = sort_result.base self.numpy_array = None else: @@ -1569,9 +1572,12 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): swapped_copy.copy(swapped, deep=True) # run sort on last axis - swapped_copy.sort(swapped_copy) + sort_result = self.runtime.create_empty_thunk( + swapped_copy.shape, dtype=self.dtype, inputs=[swapped_copy] + ) + sort_result.sort(rhs=swapped_copy, argsort=argsort) - self.base = swapped_copy.swapaxes(rhs.ndim - 1, sort_axis).base + self.base = sort_result.swapaxes(rhs.ndim - 1, sort_axis).base self.numpy_array = None else: @@ -1586,6 +1592,7 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): ) task = self.context.create_task(CuNumericOpCode.SORT) + task.add_output(self.base) task.add_input(rhs.base) task.add_alignment(self.base, rhs.base) @@ -1595,8 +1602,7 @@ def sort(self, rhs, axis=-1, kind="stable", order=None): # print("Distributed 1D sort --> broadcast") task.add_broadcast(rhs.base) - task.add_scalar_arg(False, bool) # descending flag - task.add_scalar_arg(False, bool) # return indices flag + task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(rhs.base.shape, (ty.int32,)) task.execute() diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 520287a6f..f5bc10a96 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,12 +502,15 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, rhs, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: - self.deferred.sort(rhs, axis, kind, order) + self.deferred.sort(rhs, argsort, axis, kind, order) else: - self.array = np.sort(rhs.array, axis, kind, order) + if argsort: + self.array = np.argsort(rhs.array, axis, kind, order) + else: + self.array = np.sort(rhs.array, axis, kind, order) def random_uniform(self): if self.deferred is not None: diff --git a/cunumeric/module.py b/cunumeric/module.py index 596d61b11..d99b4e9c2 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5606,7 +5606,12 @@ def argsort(a, axis=-1, kind="stable", order=None): -------- GPU, CPU """ - return a.argsort(axis=axis, kind=kind, order=order) + + result = ndarray(a.shape, np.int32) + result._thunk.sort( + rhs=a._thunk, argsort=True, axis=axis, kind=kind, order=order + ) + return result def msort(a): diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 44a6ba3e6..8b209827a 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -17,14 +17,16 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include + namespace cunumeric { using namespace Legion; using namespace legate; -// general routine +// general routine SORT template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; void std_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) @@ -41,13 +43,14 @@ struct SortImplBody { const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << index_point[0] << "): local size = " << volume + std::cout << "CPU(" << getRank(domain, 
index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; @@ -76,6 +79,69 @@ struct SortImplBody { } }; +// general routine ARGSORT +template +struct SortImplBody { + using VAL = legate_type_of; + + void std_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_argsort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_argsort( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } + } + } +}; + /*static*/ void SortTask::cpu_variant(TaskContext& context) { sort_template(context); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index fd864ca78..a170a5069 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include "cunumeric/cuda_help.h" @@ -30,90 +32,376 @@ namespace cunumeric { using namespace Legion; +struct multiply : public thrust::unary_function { + const int constant; + + multiply(int _constant) : constant(_constant) {} + + __host__ __device__ int operator()(int& input) const { return input * constant; } +}; + +template +void cub_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) +{ + if (volume == sort_dim_size) { + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortKeys(NULL, temp_storage_bytes, inptr, outptr, volume); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, inptr, outptr, volume); + } else { + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + 
thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size));
+
+    size_t temp_storage_bytes = 0;
+    cub::DeviceSegmentedRadixSort::SortKeys(NULL,
+                                            temp_storage_bytes,
+                                            inptr,
+                                            outptr,
+                                            volume,
+                                            volume / sort_dim_size,
+                                            off_start_it,
+                                            off_end_it);
+    auto temp_storage =
+      create_buffer<unsigned char>(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM);
+
+    cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0),
+                                            temp_storage_bytes,
+                                            inptr,
+                                            outptr,
+                                            volume,
+                                            volume / sort_dim_size,
+                                            off_start_it,
+                                            off_end_it);
+  }
+}
+
+template <typename VAL>
+void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size)
+{
+  thrust::device_ptr<const VAL> dev_input_ptr(inptr);
+  thrust::device_ptr<VAL> dev_output_ptr(outptr);
+  thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr);
+  // same approach as the cupy implementation --> combine multiple individual sorts into a
+  // single kernel with data tuples - (id_sub-sort, actual_data)
+  if (volume == sort_dim_size) {
+    thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume);
+  } else {
+    // in this case we know we are sorting for the *last* index
+    const uint64_t max_elements_per_kernel =
+      1 << 22;  // TODO check amount of available GPU memory from config
+    const uint64_t number_sorts_per_kernel =
+      std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size);
+    const uint64_t number_sorts = volume / sort_dim_size;
+
+    // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl;
+
+    if (number_sorts_per_kernel >=
+        32)  // key-tuple sort has quite some overhead -- only utilize if beneficial
+    {
+      // allocate memory for keys (iterating +=1 for each individual sort dimension)
+      // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)!
+      // TODO!!!!
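+      //
+      // how the key-tuple trick works (illustrative note, values made up): every
+      // element is tagged with the id of its sub-sort segment, and ONE stable sort
+      // runs over the (segment_id, value) tuples. The tuple comparison is
+      // lexicographic: segment id first, value second, so a single sort orders
+      // every segment at once without elements crossing segment boundaries.
+      // E.g. for sort_dim_size = 3:
+      //   keys   : 0 0 0 1 1 1
+      //   values : 7 1 4 9 2 5
+      // sorts to (0,1)(0,4)(0,7)(1,2)(1,5)(1,9), i.e. 1 4 7 | 2 5 9.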
+ auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); + thrust::stable_sort( + combined, combined + num_elements, thrust::less>()); + } + } else { + // number_sorts_per_kernel too small ----> we sort one after another + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); + } + } + } +} + +template +void cub_argsort(const VAL* inptr, int32_t* outptr, const size_t volume, const size_t sort_dim_size) +{ + auto keys_out = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_out_ptr(keys_out.ptr(0)); + + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_idx_in_ptr(idx_in.ptr(0)); + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + dev_idx_in_ptr, + thrust::modulus()); + + if (volume == sort_dim_size) { + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, inptr, keys_out.ptr(0), idx_in.ptr(0), outptr, volume); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume); + } else { + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + inptr, + keys_out.ptr(0), + idx_in.ptr(0), + outptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it); + } +} + +template +void thrust_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + thrust::device_ptr dev_input_ptr(inptr); + + auto keys_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_keys_copy_ptr(keys_copy.ptr(0)); + thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_keys_copy_ptr); + + thrust::device_ptr dev_output_ptr(outptr); + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + 
dev_output_ptr, + thrust::modulus()); + + // same approach as cupy implemntation --> combine multiple individual sorts into single + // kernel with data tuples - (id_sub-sort, actual_data) + if (volume == sort_dim_size) { + thrust::stable_sort_by_key(dev_keys_copy_ptr, dev_keys_copy_ptr + volume, dev_output_ptr); + } else { + // in this case we know we are sorting for the *last* index + const uint64_t max_elements_per_kernel = + 1 << 22; // TODO check amount of available GPU memory from config + const uint64_t number_sorts_per_kernel = + std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); + const uint64_t number_sorts = volume / sort_dim_size; + + // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; + + if (number_sorts_per_kernel >= + 32) // key-tuple sort has quite some overhead -- only utilize if beneficial + { + // allocate memory for keys (iterating +=1 for each individual sort dimension) + // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! + // TODO!!!! + auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, + Legion::Memory::Kind::GPU_FB_MEM); + thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); + + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { + // compute size of batch (might be smaller for the last call) + const uint64_t num_elements = + std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; + const uint64_t offset = sort_part * sort_dim_size; + + // reinit keys + thrust::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_elements), + thrust::make_constant_iterator(sort_dim_size), + dev_key_ptr, + thrust::divides()); + + // sort + auto combined = + thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_keys_copy_ptr + offset)); + thrust::stable_sort_by_key(combined, + combined + num_elements, + dev_output_ptr + offset, + thrust::less>()); + } + } else { + // number_sorts_per_kernel too small ----> we sort one after another + for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { + const uint64_t offset = sort_part * sort_dim_size; + thrust::stable_sort_by_key(dev_keys_copy_ptr + offset, + dev_keys_copy_ptr + offset + sort_dim_size, + dev_output_ptr + offset); + } + } + } +} + +template +struct support_cub : std::true_type { +}; +template <> +struct support_cub : std::false_type { +}; +template <> +struct support_cub : std::false_type { +}; + +template ::value>* = nullptr> +void sort_stable(const legate_type_of* inptr, + legate_type_of* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + cub_sort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void sort_stable(const legate_type_of* inptr, + legate_type_of* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + thrust_sort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void argsort_stable(const legate_type_of* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + cub_argsort(inptr, outptr, volume, sort_dim_size); +} + +template ::value>* = nullptr> +void argsort_stable(const legate_type_of* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) +{ + using VAL = legate_type_of; + thrust_argsort(inptr, outptr, volume, sort_dim_size); +} + template -struct SortImplBody { +struct 
SortImplBody { using VAL = legate_type_of; - void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) { - thrust::device_ptr dev_input_ptr(inptr); - thrust::device_ptr dev_output_ptr(outptr); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (DIM == 1) { - thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); +#ifdef DEBUG_CUNUMERIC + std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. = " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + assert(!argsort); + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + sort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! 
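The ARGSORT variant that follows seeds the output with per-segment indices (a modulus transform over a counting iterator) and then stable-sorts those indices by their values. A minimal numpy sketch of the same computation (array contents made up; numpy stands in for the thrust/cub calls):

    import numpy as np

    sort_dim_size = 3
    values = np.array([7, 1, 4, 9, 2, 5])         # two segments of length 3

    # initial per-segment indices, as the modulus transform produces them
    idx = np.arange(values.size) % sort_dim_size  # [0, 1, 2, 0, 1, 2]

    # stable-sorting each segment's indices by its values gives the argsort result
    result = np.argsort(values.reshape(-1, sort_dim_size), axis=1, kind="stable").ravel()
    # result -> [1, 2, 0, 1, 2, 0]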
- auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; - sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); - thrust::stable_sort( - combined, combined + num_elements, thrust::less>()); - } - } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); - } + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + sort_stable( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; } } } +}; + +template +struct SortImplBody { + using VAL = legate_type_of; void operator()(AccessorRO input, - AccessorWO output, + AccessorWO output, const Pitches& pitches, const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << index_point[0] << "): local size = " << volume + std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; #endif + assert(argsort); const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now if (dense) { - thrust_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + argsort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); } else { // compute contiguous memory block int contiguous_elements = 1; @@ -126,7 +414,7 @@ struct SortImplBody { uint64_t elements_processed = 0; while (elements_processed < volume) { Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - thrust_sort( + argsort_stable( input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); elements_processed += contiguous_elements; } diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index 837027086..b915df838 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -23,12 +23,11 @@ namespace cunumeric { struct SortArgs { const Array& input; Array& output; - bool descending; bool argsort; Legion::DomainPoint global_shape; bool is_index_space; - Legion::DomainPoint index_point; - Legion::Domain domain; + Legion::DomainPoint task_index; + Legion::Domain launch_domain; }; template diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index c5b3ccd54..1fc560617 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -26,7 +26,7 @@ using namespace Legion; using namespace legate; template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) @@ -51,13 +51,14 @@ struct SortImplBody { const Rect& rect, const bool dense, const size_t volume, + const bool argsort, const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, const Legion::Domain domain) { #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << index_point[0] << "): local size = " << volume + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. 
= " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() << ", dense = " << dense << std::endl; @@ -86,6 +87,98 @@ struct SortImplBody { } }; +template +struct SortImplBody { + using VAL = legate_type_of; + + void std_argsort(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(outptr + start_idx, outptr + start_idx + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + + void std_argsort_omp(const VAL* inptr, + int32_t* outptr, + const size_t volume, + const size_t sort_dim_size) + { + if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune + { +#pragma omp do schedule(dynamic) + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + std::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } else { + for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentKeys = outptr + start_idx; + const VAL* segmentValues = inptr + start_idx; + std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); + __gnu_parallel::stable_sort( + segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { + return segmentValues[i1] < segmentValues[i2]; + }); + } + } + } + + void operator()(AccessorRO input, + AccessorWO output, + const Pitches& pitches, + const Rect& rect, + const bool dense, + const size_t volume, + const bool argsort, + const Legion::DomainPoint global_shape, + const bool is_index_space, + const Legion::DomainPoint index_point, + const Legion::Domain domain) + { +#ifdef DEBUG_CUNUMERIC + std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + << ", dist. 
= " << is_index_space << ", index_point = " << index_point + << ", domain/volume = " << domain << "/" << domain.get_volume() + << ", dense = " << dense << std::endl; +#endif + const size_t sort_dim_size = global_shape[DIM - 1]; + assert(!is_index_space || DIM > 1); // not implemented for now + if (dense) { + std_argsort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + } else { + // compute contiguous memory block + int contiguous_elements = 1; + for (int i = DIM - 1; i >= 0; i--) { + auto diff = 1 + rect.hi[i] - rect.lo[i]; + contiguous_elements *= diff; + if (diff < global_shape[i]) { break; } + } + + uint64_t elements_processed = 0; + while (elements_processed < volume) { + Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); + std_argsort_omp( + input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); + elements_processed += contiguous_elements; + } + } + } +}; + /*static*/ void SortTask::omp_variant(TaskContext& context) { sort_template(context); diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 5488330a5..c00b4a7e8 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -21,9 +21,19 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct SortImplBody; +static int getRank(Domain domain, DomainPoint index_point) +{ + int domain_index = 0; + for (int i = 0; i < domain.get_dim(); ++i) { + if (i > 0) domain_index *= domain.hi()[i] - domain.lo()[i] + 1; + domain_index += index_point[i]; + } + return domain_index; +} + template struct SortImpl { template @@ -40,9 +50,6 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); - auto input = args.input.read_accessor(rect); - auto output = args.output.write_accessor(rect); - /* * Assumptions: * 1. 
Sort is always requested for the 'last' dimension within rect @@ -55,32 +62,60 @@ struct SortImpl { #ifdef DEBUG_CUNUMERIC std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", descending=" << args.descending << ", argsort=" << args.argsort - << ", sort_dim_size=" << args.global_shape[DIM - 1] << std::endl; + << ", argsort=" << args.argsort << ", sort_dim_size=" << args.global_shape[DIM - 1] + << std::endl; assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && "multi-dimensional array should not be distributed in (sort) dimension"); #endif + auto input = args.input.read_accessor(rect); + + if (args.argsort) { + auto output = args.output.write_accessor(rect); + #ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); #else - bool dense = false; + bool dense = false; #endif + assert(dense || !args.is_index_space || DIM > 1); + + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); + + } else { + auto output = args.output.write_accessor(rect); - assert(dense || !args.is_index_space || DIM > 1); - - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.global_shape, - args.is_index_space, - args.index_point, - args.domain); +#ifndef LEGION_BOUNDS_CHECKS + bool dense = + input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); +#else + bool dense = false; +#endif + assert(dense || !args.is_index_space || DIM > 1); + SortImplBody()(input, + output, + pitches, + rect, + dense, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); + } } }; @@ -89,7 +124,7 @@ static void sort_template(TaskContext& context) { DomainPoint global_shape; { - auto shape_span = context.scalars()[2].values(); + auto shape_span = context.scalars()[1].values(); global_shape.dim = shape_span.size(); for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } } @@ -97,12 +132,11 @@ static void sort_template(TaskContext& context) SortArgs args{context.inputs()[0], context.outputs()[0], context.scalars()[0].value(), - context.scalars()[1].value(), global_shape, - context.task_->is_index_space, - context.task_->index_point, - context.task_->index_domain}; - double_dispatch(args.output.dim(), args.output.code(), SortImpl{}, args); + !context.is_single_task(), + context.get_task_index(), + context.get_launch_domain()}; + double_dispatch(args.input.dim(), args.input.code(), SortImpl{}, args); } } // namespace cunumeric diff --git a/tests/sort.py b/tests/sort.py index ef78d0447..24385b06a 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -18,15 +18,24 @@ import cunumeric as num +def compare_assert(a_np, a_num): + if not num.allclose(a_np, a_num): + print("numpy:") + print(a_np) + print("cuNumeric:") + print(a_num) + assert False + + def test_sort_axis(a_np, a_num, axis): - assert num.allclose(a_np, a_num) + compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis) sort_num = num.sort(a_num, axis) - if not num.allclose(sort_np, sort_num): - print(sort_np) - print(sort_num) - assert False + compare_assert(sort_np, sort_num) + argsort_np = np.sort(a_np, axis) + argsort_num = 
num.sort(a_num, axis) + compare_assert(argsort_np, argsort_num) def test_1D(): @@ -42,11 +51,11 @@ def test_1D(): # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) - assert num.allclose(sortA_np, sortA_num) + compare_assert(sortA_np, sortA_num) A_num.sort() print("Result (inplace): " + str(A_num)) - assert num.allclose(sortA_np, A_num) + compare_assert(sortA_np, A_num) return @@ -129,15 +138,25 @@ def test_custom(): return -def test_other_api(): - a = np.arange(4 * 2 * 3).reshape(4, 2, 3) +def test_api(a=None): + if a is None: + a = np.arange(4 * 2 * 3).reshape(4, 2, 3) a_num = num.array(a) + # sort axes + for i in range(a.ndim): + compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) + + # flatten + compare_assert( + np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) + ) + # msort - assert num.allclose(np.msort(a), num.msort(a_num)) + compare_assert(np.msort(a), num.msort(a_num)) # sort_complex - assert num.allclose(np.sort_complex(a), num.sort_complex(a_num)) + compare_assert(np.sort_complex(a), num.sort_complex(a_num)) # reverse order sort # TODO @@ -147,13 +166,70 @@ def test_other_api(): copy_a_num = a_num.copy() copy_a.sort() copy_a_num.sort() - assert num.allclose(copy_a, copy_a_num) + compare_assert(copy_a, copy_a_num) # reverse order sort (in place) # TODO # argsort - # TODO + for i in range(a.ndim): + compare_assert(a, a_num) + compare_assert( + np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) + ) + + # flatten + compare_assert( + np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) + ) + + return + + +def generate_random(shape, datatype): + print("Generate random for " + str(datatype)) + a_np = None + volume = 1 + for i in shape: + volume *= i + + if np.issubdtype(datatype, np.integer): + a_np = np.array( + np.random.randint( + np.iinfo(datatype).min, np.iinfo(datatype).max, size=volume + ), + dtype=datatype, + ) + elif np.issubdtype(datatype, np.floating): + a_np = np.array(np.random.random(size=volume), dtype=datatype) + elif np.issubdtype(datatype, np.complexfloating): + a_np = np.array( + np.random.random(size=volume) + np.random.random(size=volume) * 1j, + dtype=datatype, + ) + else: + print("UNKNOWN type " + str(datatype)) + assert False + return a_np + + +def test_dtypes(): + np.random.seed(42) + test_api(generate_random((2, 5, 7), np.uint8)) + test_api(generate_random((8, 5), np.uint16)) + test_api(generate_random((22, 5, 7), np.uint32)) + + test_api(generate_random((2, 5, 7), np.int8)) + test_api(generate_random((8, 5), np.int16)) + test_api(generate_random((22, 5, 7), np.int32)) + test_api(generate_random((2, 5, 7), np.int64)) + + test_api(generate_random((8, 5), np.float32)) + test_api(generate_random((8, 5), np.float64)) + test_api(generate_random((22, 5, 7), np.double)) + + test_api(generate_random((2, 5, 7), np.complex64)) + test_api(generate_random((2, 5, 7), np.complex128)) return @@ -170,7 +246,9 @@ def test(): print("\n\n ----------- 4D/5D test-------------\n") test_custom() print("\n\n ----------- API test --------------\n") - test_other_api() + test_api() + print("\n\n ----------- dtype test ------------\n") + test_dtypes() if __name__ == "__main__": From fd0d3f8fb7ec368d7bd07520bba41ecf4c51ab36 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 15:12:45 -0800 Subject: [PATCH 19/49] adjusted docstring --- cunumeric/module.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/cunumeric/module.py 
b/cunumeric/module.py index 88e78164e..7b938f078 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5610,13 +5610,18 @@ def argsort(a, axis=-1, kind="stable", order=None): Array of indices that sort a along the specified axis. It has the same shape as `a.shape` or is flattened in case of `axis` is None. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D or flattened data will be broadcasted. + See Also -------- numpy.argsort Availability -------- - GPU, CPU + Single GPU, Single CPU """ result = ndarray(a.shape, np.int32) @@ -5641,13 +5646,18 @@ def msort(a): out : ndarray Sorted array with same dtype and shape as `a`. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D data will be broadcasted. + See Also -------- numpy.msort Availability -------- - GPU, CPU + Single GPU, Single CPU """ return sort(a, axis=0) @@ -5676,13 +5686,18 @@ def sort(a, axis=-1, kind="stable", order=None): Sorted array with same dtype and shape as `a`. In case `axis` is None the result is flattened. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D or flattened data will be broadcasted. + See Also -------- numpy.sort Availability -------- - GPU, CPU + Single GPU, Single CPU """ result = ndarray(a.shape, a.dtype) result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) @@ -5706,13 +5721,18 @@ def sort_complex(a): out : ndarray, complex Sorted array with same shape as `a`. + Notes + ----- + The current implementation has only limited support for distributed data. + Distributed 1-D data will be broadcasted. + See Also -------- numpy.sort_complex Availability -------- - GPU, CPU + Single GPU, Single CPU """ # force complex result From 6c385dd2dfdaba135357768da02f7a446341acac Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Feb 2022 15:38:44 -0800 Subject: [PATCH 20/49] extract messy code from deferred --- cunumeric/deferred.py | 75 +------------------------ cunumeric/sorting/__init__.py | 16 ++++++ cunumeric/sorting/sorting.py | 102 ++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 73 deletions(-) create mode 100644 cunumeric/sorting/__init__.py create mode 100644 cunumeric/sorting/sorting.py diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5d43f8bfa..15fdbd6ac 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,6 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky +from .sorting.sorting import sorting from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1537,76 +1538,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - if axis is None and rhs.ndim > 1: - flattened = rhs.reshape((rhs.size,), order="C") - flattened_copy = self.runtime.create_empty_thunk( - flattened.shape, dtype=rhs.dtype, inputs=[rhs, flattened] - ) - flattened_copy.copy(flattened, deep=True) - - # run sort flattened -- return 1D solution - sort_result = self.runtime.create_empty_thunk( - flattened_copy.shape, dtype=self.dtype, inputs=[flattened_copy] - ) - sort_result.sort(rhs=flattened_copy, argsort=argsort) - self.base = sort_result.base - self.numpy_array = None - - else: - if axis is None: - sort_axis = 0 - elif axis < 0: - sort_axis = rhs.ndim + axis - else: - sort_axis = axis - - if sort_axis is not rhs.ndim - 1: - assert sort_axis < 
rhs.ndim - 1 and sort_axis >= 0 - - # swap axes - swapped = rhs.swapaxes(sort_axis, rhs.ndim - 1) - - swapped_copy = self.runtime.create_empty_thunk( - swapped.shape, dtype=rhs.dtype, inputs=[rhs, swapped] - ) - swapped_copy.copy(swapped, deep=True) - - # run sort on last axis - sort_result = self.runtime.create_empty_thunk( - swapped_copy.shape, dtype=self.dtype, inputs=[swapped_copy] - ) - sort_result.sort(rhs=swapped_copy, argsort=argsort) - - self.base = sort_result.swapaxes(rhs.ndim - 1, sort_axis).base - self.numpy_array = None - - else: - # run actual sort task - needs_communication = self.runtime.num_gpus > 1 or ( - self.runtime.num_gpus == 0 and self.runtime.num_procs > 1 - ) - - if needs_communication: - self.runtime.legate_runtime.issue_execution_fence( - block=True - ) - - task = self.context.create_task(CuNumericOpCode.SORT) - - task.add_output(self.base) - task.add_input(rhs.base) - task.add_alignment(self.base, rhs.base) - if self.ndim > 1: - task.add_broadcast(rhs.base, rhs.ndim - 1) - elif needs_communication: - # print("Distributed 1D sort --> broadcast") - task.add_broadcast(rhs.base) - - task.add_scalar_arg(argsort, bool) # return indices flag - task.add_scalar_arg(rhs.base.shape, (ty.int32,)) - task.execute() - - if needs_communication: - self.runtime.legate_runtime.issue_execution_fence( - block=True - ) + sorting(self, rhs, argsort, axis) diff --git a/cunumeric/sorting/__init__.py b/cunumeric/sorting/__init__.py new file mode 100644 index 000000000..8988b3353 --- /dev/null +++ b/cunumeric/sorting/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys as _sys diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting/sorting.py new file mode 100644 index 000000000..246b3abe3 --- /dev/null +++ b/cunumeric/sorting/sorting.py @@ -0,0 +1,102 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
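The helpers that follow reduce every case to a sort along the last axis: an axis=None request sorts a flattened copy, and any other axis is swapped to the end, sorted, and swapped back. A minimal NumPy sketch of that reduction (illustrative only, not part of the patch):

    import numpy as np

    def sort_along_axis(a, axis):
        # normalize a negative axis, as sorting() below does
        axis = axis % a.ndim
        if axis == a.ndim - 1:
            return np.sort(a, axis=-1)
        # swap the target axis to the end, sort, swap back -- the same
        # strategy sort_swapped() applies to deferred thunks
        swapped = a.swapaxes(axis, a.ndim - 1)
        return np.sort(swapped, axis=-1).swapaxes(a.ndim - 1, axis)

    a = np.random.rand(3, 4, 5)
    assert np.array_equal(sort_along_axis(a, 0), np.sort(a, axis=0))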
+# + + +from cunumeric.config import CuNumericOpCode + +from legate.core import types as ty + + +def sort_flattened(output, input, argsort): + flattened = input.reshape((input.size,), order="C") + flattened_copy = output.runtime.create_empty_thunk( + flattened.shape, dtype=input.dtype, inputs=[input, flattened] + ) + flattened_copy.copy(flattened, deep=True) + + # run sort flattened -- return 1D solution + sort_result = output.runtime.create_empty_thunk( + flattened_copy.shape, dtype=output.dtype, inputs=[flattened_copy] + ) + sorting(sort_result, flattened_copy, argsort) + output.base = sort_result.base + output.numpy_array = None + + +def sort_swapped(output, input, argsort, sort_axis): + assert sort_axis < input.ndim - 1 and sort_axis >= 0 + + # swap axes + swapped = input.swapaxes(sort_axis, input.ndim - 1) + + swapped_copy = output.runtime.create_empty_thunk( + swapped.shape, dtype=input.dtype, inputs=[input, swapped] + ) + swapped_copy.copy(swapped, deep=True) + + # run sort on last axis + sort_result = output.runtime.create_empty_thunk( + swapped_copy.shape, dtype=output.dtype, inputs=[swapped_copy] + ) + sorting(sort_result, swapped_copy, argsort) + + output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base + output.numpy_array = None + + +def sort_task(output, input, argsort): + needs_communication = output.runtime.num_gpus > 1 or ( + output.runtime.num_gpus == 0 and output.runtime.num_procs > 1 + ) + + if needs_communication: + output.runtime.legate_runtime.issue_execution_fence(block=True) + + task = output.context.create_task(CuNumericOpCode.SORT) + + task.add_output(output.base) + task.add_input(input.base) + task.add_alignment(output.base, input.base) + if output.ndim > 1: + task.add_broadcast(input.base, input.ndim - 1) + elif needs_communication: + # print("Distributed 1D sort --> broadcast") + task.add_broadcast(input.base) + + task.add_scalar_arg(argsort, bool) # return indices flag + task.add_scalar_arg(input.base.shape, (ty.int32,)) + task.execute() + + if needs_communication: + output.runtime.legate_runtime.issue_execution_fence(block=True) + + +def sorting(output, input, argsort, axis=-1): + if axis is None and input.ndim > 1: + sort_flattened(output, input, argsort) + else: + if axis is None: + sort_axis = 0 + elif axis < 0: + sort_axis = input.ndim + axis + else: + sort_axis = axis + + if sort_axis is not input.ndim - 1: + sort_swapped(output, input, argsort, sort_axis) + + else: + # run actual sort task + sort_task(output, input, argsort) From 49c3f3bd99abab23a0b9b9643bc3ecba8efc8d4e Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 09:08:53 -0800 Subject: [PATCH 21/49] refactor sort c-code, simplify, reduce duplicated code --- src/cunumeric/sort/sort.cc | 148 +++---- src/cunumeric/sort/sort.cu | 583 ++++++++++++--------------- src/cunumeric/sort/sort_omp.cc | 184 ++++----- src/cunumeric/sort/sort_template.inl | 59 +-- tests/sort.py | 3 + 5 files changed, 417 insertions(+), 560 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 8b209827a..30bcd4592 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -17,6 +17,9 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" +#include +#include + #include namespace cunumeric { @@ -24,24 +27,37 @@ namespace cunumeric { using namespace Legion; using namespace legate; -// general routine SORT template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void std_sort(const VAL* inptr, VAL* 
outptr, const size_t volume, const size_t sort_dim_size) + // sorts inptr in-place, if argptr not nullptr it returns sort indices + void thrust_local_sort_inplace(VAL* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size) { - std::copy(inptr, inptr + volume, outptr); - for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + if (argptr == nullptr) { + // sort (in place) + for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + } + } else { + // argsort + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentValues = argptr + start_idx; + VAL* segmentKeys = inptr + start_idx; + std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -49,94 +65,64 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. = " << argsort << std::endl; #endif + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume); if (dense) { - std_sort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_sort( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + auto* target = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + target[offset] = input[rect.lo + point]; } } - } -}; -// general routine ARGSORT -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = create_buffer(argsort ? 
volume : 0); - void std_argsort(const VAL* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) - { - for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentKeys = outptr + start_idx; - const VAL* segmentValues = inptr + start_idx; - std::iota(segmentKeys, segmentKeys + sort_dim_size, 0); - std::stable_sort( - segmentKeys, segmentKeys + sort_dim_size, [segmentValues](int32_t i1, int32_t i2) { - return segmentValues[i1] < segmentValues[i2]; - }); - } - } - void operator()(AccessorRO input, - AccessorWO output, - const Pitches& pitches, - const Rect& rect, - const bool dense, - const size_t volume, - const bool argsort, - const Legion::DomainPoint global_shape, - const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain) - { -#ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; -#endif - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + // sort data + thrust_local_sort_inplace( + dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + + // copy back data (we assume output partition to be aligned to input!) if (dense) { - std_argsort(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_argsort( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + auto* source = indices_buffer.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } + } else { + AccessorWO output = output_array.write_accessor(rect); + auto* source = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } } } } } diff --git a/src/cunumeric/sort/sort.cu index a170a5069..49edec9e1 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -32,6 +32,34 @@ namespace cunumeric { using namespace Legion; +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + copy_into_buffer(VAL* out, + const AccessorRO accessor, + const Point lo, + const Pitches pitches, + const size_t volume) +{ + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset >= volume) return; + auto point = pitches.unflatten(offset, lo); + 
out[offset] = accessor[lo + point]; +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + copy_into_output(AccessorWO accessor, + const VAL* data, + const Point lo, + const Pitches pitches, + const size_t volume) +{ + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset >= volume) return; + auto point = pitches.unflatten(offset, lo); + accessor[lo + point] = data[offset]; +} + struct multiply : public thrust::unary_function { const int constant; @@ -41,239 +69,187 @@ struct multiply : public thrust::unary_function { }; template -void cub_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) -{ - if (volume == sort_dim_size) { - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortKeys(NULL, temp_storage_bytes, inptr, outptr, volume); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, inptr, outptr, volume); - } else { - auto off_start_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortKeys(NULL, - temp_storage_bytes, - inptr, - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - } -} - -template -void thrust_sort(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) +void cub_local_sort_inplace( + VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { - thrust::device_ptr dev_input_ptr(inptr); - thrust::device_ptr dev_output_ptr(outptr); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_output_ptr); - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (volume == sort_dim_size) { - thrust::stable_sort(dev_output_ptr, dev_output_ptr + volume); - } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! 
- auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_output_ptr + offset)); - thrust::stable_sort( - combined, combined + num_elements, thrust::less>()); - } + // make a copy of input --> we want inptr to return sorted values + auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); + size_t temp_storage_bytes = 0; + if (argptr == nullptr) { + if (volume == sort_dim_size) { + // sort + cub::DeviceRadixSort::SortKeys( + NULL, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + 0, + sizeof(VAL) * 8, + stream); } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort(dev_output_ptr + offset, dev_output_ptr + offset + sort_dim_size); - } + // segmented sort + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + cub::DeviceSegmentedRadixSort::SortKeys(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); } - } -} - -template -void cub_argsort(const VAL* inptr, int32_t* outptr, const size_t volume, const size_t sort_dim_size) -{ - auto keys_out = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_out_ptr(keys_out.ptr(0)); - - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_idx_in_ptr(idx_in.ptr(0)); - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - dev_idx_in_ptr, - thrust::modulus()); - - if (volume == sort_dim_size) { - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, inptr, keys_out.ptr(0), idx_in.ptr(0), outptr, volume); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - 
cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume); } else { - auto off_start_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = - thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairs(NULL, - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); - - auto temp_storage = - create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); - - cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), - temp_storage_bytes, - inptr, - keys_out.ptr(0), - idx_in.ptr(0), - outptr, - volume, - volume / sort_dim_size, - off_start_it, - off_end_it); + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + idx_in.ptr(0), + thrust::modulus()); + + if (volume == sort_dim_size) { + // argsort + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + 0, + sizeof(VAL) * 8, + stream); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + 0, + sizeof(VAL) * 8, + stream); + } else { + // segmented argsort + auto off_start_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); + auto off_end_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); + + cub::DeviceSegmentedRadixSort::SortPairs(NULL, + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + + auto temp_storage = + create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); + + cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), + temp_storage_bytes, + keys_in.ptr(0), + inptr, + idx_in.ptr(0), + argptr, + volume, + volume / sort_dim_size, + off_start_it, + off_end_it, + 0, + sizeof(VAL) * 8, + stream); + } } } template -void thrust_argsort(const VAL* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void thrust_local_sort_inplace( + VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { - thrust::device_ptr dev_input_ptr(inptr); - - auto keys_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_keys_copy_ptr(keys_copy.ptr(0)); - thrust::copy(dev_input_ptr, dev_input_ptr + volume, dev_keys_copy_ptr); - - thrust::device_ptr dev_output_ptr(outptr); - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - dev_output_ptr, - thrust::modulus()); - - // same approach as cupy implemntation --> combine multiple individual sorts into single - // kernel with data tuples - (id_sub-sort, actual_data) - if (volume == sort_dim_size) { - thrust::stable_sort_by_key(dev_keys_copy_ptr, dev_keys_copy_ptr + volume, dev_output_ptr); + if 
(argptr == nullptr) { + if (volume == sort_dim_size) { + thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + } else { + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + // init combined keys + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + sort_id.ptr(0), + thrust::divides()); + auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); + + thrust::stable_sort(thrust::cuda::par.on(stream), + combined, + combined + volume, + thrust::less>()); + } } else { - // in this case we know we are sorting for the *last* index - const uint64_t max_elements_per_kernel = - 1 << 22; // TODO check amount of available GPU memory from config - const uint64_t number_sorts_per_kernel = - std::max(1ul, std::min(volume, max_elements_per_kernel) / sort_dim_size); - const uint64_t number_sorts = volume / sort_dim_size; - - // std::cout << "Number of sorts per kernel: " << number_sorts_per_kernel << std::endl; - - if (number_sorts_per_kernel >= - 32) // key-tuple sort has quite some overhead -- only utilize if beneficial - { - // allocate memory for keys (iterating +=1 for each individual sort dimension) - // ensure keys have minimal bit-length (needs values up to number_sorts_per_kernel-1)! - // TODO!!!! - auto keys_array = create_buffer(number_sorts_per_kernel * sort_dim_size, - Legion::Memory::Kind::GPU_FB_MEM); - thrust::device_ptr dev_key_ptr(keys_array.ptr(0)); - - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part += number_sorts_per_kernel) { - // compute size of batch (might be smaller for the last call) - const uint64_t num_elements = - std::min(number_sorts - sort_part, max_elements_per_kernel) * sort_dim_size; - const uint64_t offset = sort_part * sort_dim_size; - - // reinit keys - thrust::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_elements), - thrust::make_constant_iterator(sort_dim_size), - dev_key_ptr, - thrust::divides()); - - // sort - auto combined = - thrust::make_zip_iterator(thrust::make_tuple(dev_key_ptr, dev_keys_copy_ptr + offset)); - thrust::stable_sort_by_key(combined, - combined + num_elements, - dev_output_ptr + offset, - thrust::less>()); - } + // initialize indices + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + argptr, + thrust::modulus()); + + if (volume == sort_dim_size) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { - // number_sorts_per_kernel too small ----> we sort one after another - for (uint64_t sort_part = 0; sort_part < number_sorts; sort_part++) { - const uint64_t offset = sort_part * sort_dim_size; - thrust::stable_sort_by_key(dev_keys_copy_ptr + offset, - dev_keys_copy_ptr + offset + sort_dim_size, - dev_output_ptr + offset); - } + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + // init combined keys + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + sort_id.ptr(0), + thrust::divides()); + auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); + + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + 
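This zip-iterator path batches many small sorts into one launch, the same approach as the CuPy implementation referenced in the removed code: every element is tagged with its segment id (offset / sort_dim_size), and a single stable sort over (segment_id, value) tuples orders all segments at once. In NumPy terms (illustrative only, not part of the patch):

    import numpy as np

    values = np.array([3, 1, 2, 9, 7, 8])
    sort_dim_size = 3
    seg_id = np.arange(values.size) // sort_dim_size  # thrust::divides step

    # one stable sort over (segment_id, value) pairs: the segment id is
    # the primary key, so segments stay contiguous and each ends up
    # internally sorted -- np.lexsort's last key is the primary key
    order = np.lexsort((values, seg_id))
    batched = values[order]

    rows = values.reshape(-1, sort_dim_size)
    assert np.array_equal(batched, np.sort(rows, axis=1).ravel())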
thrust::less>()); } } } @@ -289,54 +265,35 @@ struct support_cub : std::false_type { }; template ::value>* = nullptr> -void sort_stable(const legate_type_of* inptr, - legate_type_of* outptr, - const size_t volume, - const size_t sort_dim_size) -{ - using VAL = legate_type_of; - cub_sort(inptr, outptr, volume, sort_dim_size); -} - -template ::value>* = nullptr> -void sort_stable(const legate_type_of* inptr, - legate_type_of* outptr, - const size_t volume, - const size_t sort_dim_size) -{ - using VAL = legate_type_of; - thrust_sort(inptr, outptr, volume, sort_dim_size); -} - -template ::value>* = nullptr> -void argsort_stable(const legate_type_of* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void local_sort_inplace(legate_type_of* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { using VAL = legate_type_of; - cub_argsort(inptr, outptr, volume, sort_dim_size); + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template ::value>* = nullptr> -void argsort_stable(const legate_type_of* inptr, - int32_t* outptr, - const size_t volume, - const size_t sort_dim_size) +void local_sort_inplace(legate_type_of* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { using VAL = legate_type_of; - thrust_argsort(inptr, outptr, volume, sort_dim_size); + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -344,81 +301,77 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. 
= " << argsort << std::endl; #endif + + auto stream = get_cached_stream(); + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); if (dense) { - sort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - sort_stable( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; - } + const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + copy_into_buffer<<>>( + dense_input_copy.ptr(0), input, rect.lo, pitches, volume); } - } -}; -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); - void operator()(AccessorRO input, - AccessorWO output, - const Pitches& pitches, - const Rect& rect, - const bool dense, - const size_t volume, - const bool argsort, - const Legion::DomainPoint global_shape, - const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain) - { -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; -#endif - assert(argsort); - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + // sort data + local_sort_inplace(dense_input_copy.ptr(0), + argsort ? indices_buffer.ptr(0) : nullptr, + volume, + sort_dim_size, + stream); + + // copy back data (we assume output partition to be aligned to input!)
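For non-dense accessors, the staging copy above and the copy-back below walk the subrectangle by linear offset and let pitches.unflatten() recover the N-D point, which is the C-order analogue of NumPy's unravel_index. A small sketch of that mapping (hypothetical shape, illustrative only):

    import numpy as np

    shape = (4, 5)            # hypothetical local subrectangle extents
    staged = np.arange(20)    # dense staging buffer, e.g. sorted data

    out = np.empty(shape, dtype=staged.dtype)
    for offset in range(staged.size):
        # pitches.unflatten(offset, lo) maps a flat offset back to an
        # N-D point, exactly what np.unravel_index does in C order
        point = np.unravel_index(offset, shape)
        out[point] = staged[offset]

    assert np.array_equal(out.ravel(), staged)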
if (dense) { - argsort_stable(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + cudaMemcpyAsync(output.ptr(rect.lo), + indices_buffer.ptr(0), + sizeof(int32_t) * volume, + cudaMemcpyDeviceToDevice, + stream); + } else { + AccessorWO output = output_array.write_accessor(rect); + cudaMemcpyAsync(output.ptr(rect.lo), + dense_input_copy.ptr(0), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - argsort_stable( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + copy_into_output<<>>( + output, indices_buffer.ptr(0), rect.lo, pitches, volume); + } else { + AccessorWO output = output_array.write_accessor(rect); + copy_into_output<<>>( + output, dense_input_copy.ptr(0), rect.lo, pitches, volume); } } + CHECK_CUDA(cudaStreamSynchronize(stream)); } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1fc560617..a728ebeb1 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -17,7 +17,9 @@ #include "cunumeric/sort/sort.h" #include "cunumeric/sort/sort_template.inl" -#include +#include +#include +#include #include namespace cunumeric { @@ -26,30 +28,38 @@ using namespace Legion; using namespace legate; template -struct SortImplBody { +struct SortImplBody { using VAL = legate_type_of; - void std_sort_omp(const VAL* inptr, VAL* outptr, const size_t volume, const size_t sort_dim_size) + // sorts inptr in-place, if argptr not nullptr it returns sort indices + void thrust_local_sort_inplace(VAL* inptr, + int32_t* argptr, + const size_t volume, + const size_t sort_dim_size) { - std::copy(inptr, inptr + volume, outptr); - if (volume / sort_dim_size > omp_get_max_threads() / 2) // TODO fine tune - { -#pragma omp do schedule(dynamic) - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - std::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + if (argptr == nullptr) { + // sort (in place) +#pragma omp parallel for + for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { - for (uint32_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - __gnu_parallel::stable_sort(outptr + start_idx, outptr + start_idx + sort_dim_size); + // argsort +#pragma omp parallel for + for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { + int32_t* segmentValues = argptr + start_idx; + VAL* segmentKeys = inptr + start_idx; + std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); } } } - void operator()(AccessorRO input, - AccessorWO output, + void operator()(const Array& input_array, + Array& output_array, const Pitches& 
pitches, const Rect& rect, - const bool dense, const size_t volume, const bool argsort, const Legion::DomainPoint global_shape, @@ -57,123 +67,65 @@ struct SortImplBody { const Legion::DomainPoint index_point, const Legion::Domain domain) { + AccessorRO input = input_array.read_accessor(rect); + + bool dense = input.accessor.is_dense_row_major(rect); + #ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume + std::cout << "OMP(" << getRank(domain, index_point) << "): local size = " << volume << ", dist. = " << is_index_space << ", index_point = " << index_point << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << std::endl; + << ", dense = " << dense << ", argsort. = " << argsort << std::endl; #endif + const size_t sort_dim_size = global_shape[DIM - 1]; assert(!is_index_space || DIM > 1); // not implemented for now + + // make a copy of the input + auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::SOCKET_MEM); if (dense) { - std_sort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } - } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_sort_omp( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + auto* target = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + target[offset] = input[rect.lo + point]; } } - } -}; -template -struct SortImplBody { - using VAL = legate_type_of; + // we need a buffer for argsort + auto indices_buffer = + create_buffer(argsort ? 
volume : 0, Legion::Memory::Kind::SOCKET_MEM); + + // sort data + thrust_local_sort_inplace( + dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + + // copy back data (we assume output partition to be aligned to input!)
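As in the sort.cc and sort.cu variants, the argsort path here fills the index buffer with 0..n-1 via std::iota and lets a stable sort-by-key drag the indices along, so equal keys keep their original relative order. NumPy fuses the two steps in one call (illustrative only):

    import numpy as np

    keys = np.array([5, 2, 5, 1])
    # np.argsort(kind="stable") combines std::iota (indices 0..n-1) with
    # thrust::stable_sort_by_key (stable sort of keys carrying indices)
    order = np.argsort(keys, kind="stable")
    assert np.array_equal(order, [3, 1, 0, 2])  # first 5 stays before second 5
    assert np.array_equal(keys[order], np.sort(keys))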
if (dense) { - std_argsort_omp(input.ptr(rect), output.ptr(rect), volume, sort_dim_size); - } else { - // compute contiguous memory block - int contiguous_elements = 1; - for (int i = DIM - 1; i >= 0; i--) { - auto diff = 1 + rect.hi[i] - rect.lo[i]; - contiguous_elements *= diff; - if (diff < global_shape[i]) { break; } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - - uint64_t elements_processed = 0; - while (elements_processed < volume) { - Legion::Point start_point = pitches.unflatten(elements_processed, rect.lo); - std_argsort_omp( - input.ptr(start_point), output.ptr(start_point), contiguous_elements, sort_dim_size); - elements_processed += contiguous_elements; + } else { + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + auto* source = indices_buffer.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } + } else { + AccessorWO output = output_array.write_accessor(rect); + auto* source = dense_input_copy.ptr(0); + for (size_t offset = 0; offset < volume; ++offset) { + auto point = pitches.unflatten(offset, rect.lo); + output[rect.lo + point] = source[offset]; + } } } } diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index c00b4a7e8..57ae935ad 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -21,7 +21,7 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct SortImplBody; static int getRank(Domain domain, DomainPoint index_point) @@ -69,53 +69,16 @@ struct SortImpl { "multi-dimensional array should not be distributed in (sort) dimension"); #endif - auto input = args.input.read_accessor(rect); - - if (args.argsort) { - auto output = args.output.write_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); -#else - bool dense = false; -#endif - assert(dense || !args.is_index_space || DIM > 1); - - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.argsort, - args.global_shape, - args.is_index_space, - args.task_index, - args.launch_domain); - - } else { - auto output = args.output.write_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - bool dense = - input.accessor.is_dense_row_major(rect) && output.accessor.is_dense_row_major(rect); -#else - bool dense = false; -#endif - assert(dense || !args.is_index_space || DIM > 1); - SortImplBody()(input, - output, - pitches, - rect, - dense, - volume, - args.argsort, - args.global_shape, - args.is_index_space, - args.task_index, - args.launch_domain); - } + SortImplBody()(args.input, + args.output, + pitches, + rect, + volume, + args.argsort, + args.global_shape, + args.is_index_space, + args.task_index, + args.launch_domain); } }; diff --git a/tests/sort.py b/tests/sort.py index 24385b06a..a085c0cbe 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -218,6 +218,7 @@ def test_dtypes(): test_api(generate_random((2, 5, 7), np.uint8)) test_api(generate_random((8, 5), np.uint16)) test_api(generate_random((22, 5, 7), np.uint32)) + test_api(generate_random((220,), np.uint32)) 
test_api(generate_random((2, 5, 7), np.int8)) test_api(generate_random((8, 5), np.int16)) @@ -227,9 +228,11 @@ def test_dtypes(): test_api(generate_random((8, 5), np.float32)) test_api(generate_random((8, 5), np.float64)) test_api(generate_random((22, 5, 7), np.double)) + test_api(generate_random((220,), np.double)) test_api(generate_random((2, 5, 7), np.complex64)) test_api(generate_random((2, 5, 7), np.complex128)) + test_api(generate_random((220,), np.complex128)) return From 6a061494debb41666f469851f7e0dadcbc6a35b0 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 09:51:19 -0800 Subject: [PATCH 22/49] change argsort return type to int64 --- cunumeric/module.py | 2 +- src/cunumeric/sort/sort.cc | 10 ++++----- src/cunumeric/sort/sort.cu | 38 +++++++++++++++++----------------- src/cunumeric/sort/sort_omp.cc | 10 ++++----- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 4f6059f1c..0d6bc6e08 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5633,7 +5633,7 @@ def argsort(a, axis=-1, kind="stable", order=None): Single GPU, Single CPU """ - result = ndarray(a.shape, np.int32) + result = ndarray(a.shape, np.int64) result._thunk.sort( rhs=a._thunk, argsort=True, axis=axis, kind=kind, order=order ) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 30bcd4592..8bea7b5a6 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -33,7 +33,7 @@ struct SortImplBody { // sorts inptr in-place, if argptr not nullptr it returns sort indices void thrust_local_sort_inplace(VAL* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size) { @@ -45,7 +45,7 @@ struct SortImplBody { } else { // argsort for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentValues = argptr + start_idx; + int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init thrust::stable_sort_by_key( @@ -93,7 +93,7 @@ struct SortImplBody { } // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? volume : 0); + auto indices_buffer = create_buffer(argsort ? volume : 0); // sort data thrust_local_sort_inplace( @@ -102,7 +102,7 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) 
if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); @@ -110,7 +110,7 @@ struct SortImplBody { } } else { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); auto* source = indices_buffer.ptr(0); for (size_t offset = 0; offset < volume; ++offset) { auto point = pitches.unflatten(offset, rect.lo); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 49edec9e1..2ba8bba3d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -70,7 +70,7 @@ struct multiply : public thrust::unary_function { template void cub_local_sort_inplace( - VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) + VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { // make a copy of input --> we want inptr to return sorted values auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); @@ -125,13 +125,13 @@ void cub_local_sort_inplace( stream); } } else { - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), idx_in.ptr(0), - thrust::modulus()); + thrust::modulus()); if (volume == sort_dim_size) { // argsort @@ -202,13 +202,13 @@ void cub_local_sort_inplace( template void thrust_local_sort_inplace( - VAL* inptr, int32_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) + VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) { if (argptr == nullptr) { if (volume == sort_dim_size) { thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); } else { - auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), @@ -226,16 +226,16 @@ void thrust_local_sort_inplace( } else { // intialize indices thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), argptr, - thrust::modulus()); + thrust::modulus()); if (volume == sort_dim_size) { thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { - auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), @@ -266,7 +266,7 @@ struct support_cub : std::false_type { template ::value>* = nullptr> void local_sort_inplace(legate_type_of* inptr, - int32_t* argptr, + int64_t* argptr, const size_t 
volume, const size_t sort_dim_size, cudaStream_t stream) @@ -277,7 +277,7 @@ void local_sort_inplace(legate_type_of* inptr, template ::value>* = nullptr> void local_sort_inplace(legate_type_of* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) @@ -333,7 +333,7 @@ struct SortImplBody { // we need a buffer for argsort auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); // sort data local_sort_inplace(dense_input_copy.ptr(0), @@ -345,10 +345,10 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); cudaMemcpyAsync(output.ptr(rect.lo), indices_buffer.ptr(0), - sizeof(int32_t) * volume, + sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); } else { @@ -362,7 +362,7 @@ struct SortImplBody { } else { const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); copy_into_output<<>>( output, indices_buffer.ptr(0), rect.lo, pitches, volume); } else { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index a728ebeb1..0e8cbe7e6 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -33,7 +33,7 @@ struct SortImplBody { // sorts inptr in-place, if argptr not nullptr it returns sort indices void thrust_local_sort_inplace(VAL* inptr, - int32_t* argptr, + int64_t* argptr, const size_t volume, const size_t sort_dim_size) { @@ -47,7 +47,7 @@ struct SortImplBody { // argsort #pragma omp parallel for for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - int32_t* segmentValues = argptr + start_idx; + int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init thrust::stable_sort_by_key( @@ -96,7 +96,7 @@ struct SortImplBody { // we need a buffer for argsort auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); + create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); // sort data thrust_local_sort_inplace( @@ -105,7 +105,7 @@ struct SortImplBody { // copy back data (we assume output partition to be aliged to input!) 
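Widening the index buffers from int32_t to int64_t matches NumPy, whose argsort returns np.intp (int64 on 64-bit platforms), and removes the 2**31 - 1 element limit an int32 index would impose. A quick check (illustrative only):

    import numpy as np

    a = np.array([3, 1, 2])
    # NumPy argsort indices are np.intp (int64 on 64-bit platforms);
    # int32 indices would overflow for extents beyond 2**31 - 1
    assert np.argsort(a).dtype == np.intp
    assert np.iinfo(np.int32).max == 2**31 - 1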
if (dense) { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); @@ -113,7 +113,7 @@ struct SortImplBody { } } else { if (argsort) { - AccessorWO output = output_array.write_accessor(rect); + AccessorWO output = output_array.write_accessor(rect); auto* source = indices_buffer.ptr(0); for (size_t offset = 0; offset < volume; ++offset) { auto point = pitches.unflatten(offset, rect.lo); From ca889b935e1b26fe4b340d9c286d88670f0ea845 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 25 Feb 2022 13:50:56 -0800 Subject: [PATCH 23/49] resolved earlier merge issue --- cunumeric/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index eaa949cbc..318b3ea18 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1480,9 +1480,7 @@ def setfield(self, val, dtype, offset=0): ) def setflags(self, write=None, align=None, uic=None): - self.__array__(stacklevel=2).setflags( - write=write, align=align, uic=uic - ) + self.__array__().setflags(write=write, align=align, uic=uic) def sort(self, axis=-1, kind="stable", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) From 5897c686ce7d408a1abacf9e8c6ff07e51fd74b1 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 28 Feb 2022 02:51:37 -0800 Subject: [PATCH 24/49] deactivate test for dimesions > 4 --- tests/sort.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sort.py b/tests/sort.py index a085c0cbe..3cb593abb 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -246,8 +246,8 @@ def test(): test_3D(51, 23, 17) print("\n\n ----------- 3D test (complex) -----\n") test_3D_complex(27, 30, 45) - print("\n\n ----------- 4D/5D test-------------\n") - test_custom() + # print("\n\n ----------- 4D/5D test-------------\n") + # test_custom() print("\n\n ----------- API test --------------\n") test_api() print("\n\n ----------- dtype test ------------\n") From e24eccaf4c0a8dde5a426db2a6a01d4142b53327 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 2 Mar 2022 14:13:54 -0800 Subject: [PATCH 25/49] Distributed 1-D Sort on GPU --- cunumeric/sorting/sorting.py | 28 +- src/cunumeric/sort/sort.cc | 3 +- src/cunumeric/sort/sort.cu | 404 +++++++++++++++++++++++---- src/cunumeric/sort/sort_omp.cc | 3 +- src/cunumeric/sort/sort_template.inl | 20 +- tests/sort.py | 43 ++- 6 files changed, 405 insertions(+), 96 deletions(-) diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting/sorting.py index 246b3abe3..b2c72a1a2 100644 --- a/cunumeric/sorting/sorting.py +++ b/cunumeric/sorting/sorting.py @@ -57,30 +57,34 @@ def sort_swapped(output, input, argsort, sort_axis): def sort_task(output, input, argsort): - needs_communication = output.runtime.num_gpus > 1 or ( - output.runtime.num_gpus == 0 and output.runtime.num_procs > 1 - ) + task = output.context.create_task(CuNumericOpCode.SORT) - if needs_communication: - output.runtime.legate_runtime.issue_execution_fence(block=True) + needs_unbound_output = output.runtime.num_gpus > 1 and input.ndim == 1 - task = output.context.create_task(CuNumericOpCode.SORT) + if needs_unbound_output: + unbound = output.runtime.create_unbound_thunk(dtype=output.dtype) + task.add_output(unbound.base) + else: + task.add_output(output.base) + task.add_alignment(output.base, input.base) - 
task.add_output(output.base) task.add_input(input.base) - task.add_alignment(output.base, input.base) + if output.ndim > 1: task.add_broadcast(input.base, input.ndim - 1) - elif needs_communication: - # print("Distributed 1D sort --> broadcast") + elif output.runtime.num_gpus > 0: + task.add_nccl_communicator() + elif output.runtime.num_procs > 1: + # Distributed 1D sort on CPU not supported yet task.add_broadcast(input.base) task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(input.base.shape, (ty.int32,)) task.execute() - if needs_communication: - output.runtime.legate_runtime.issue_execution_fence(block=True) + if needs_unbound_output: + output.base = unbound.base + output.numpy_array = None def sorting(output, input, argsort, axis=-1): diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 8bea7b5a6..2a7264a39 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -63,7 +63,8 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2ba8bba3d..6a25249e9 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include "cunumeric/cuda_help.h" @@ -126,12 +128,8 @@ void cub_local_sort_inplace( } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - idx_in.ptr(0), - thrust::modulus()); + cudaMemcpyAsync( + idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); if (volume == sort_dim_size) { // argsort @@ -224,14 +222,6 @@ void thrust_local_sort_inplace( thrust::less>()); } } else { - // intialize indices - thrust::transform(thrust::cuda::par.on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(volume), - thrust::make_constant_iterator(sort_dim_size), - argptr, - thrust::modulus()); - if (volume == sort_dim_size) { thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); } else { @@ -272,7 +262,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + if (volume > 0) { cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } } template ::value>* = nullptr> @@ -283,7 +273,300 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + print_subset(const VAL* data, const size_t volume, const size_t rank) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx == 0) { + printf("data(%d) = [ ", rank); + for (int i = 0; i < volume; ++i) { printf("%d ", data[i]); } + printf("]\n"); + } +} + +// auto align to multiples of 16 bytes +auto get_aligned_size = [](auto size) { return 
std::max(16, (size + 15) / 16 * 16); }; + +template +struct SortPiece { + Buffer values; + Buffer indices; + size_t size; +}; + +template +struct Sample { + VAL value; + int32_t rank; + size_t position; +}; + +template +struct SampleComparator : public thrust::binary_function, Sample, bool> { + __host__ __device__ bool operator()(const Sample& lhs, const Sample& rhs) const + { + // special case for unused samples + if (lhs.rank < 0 || rhs.rank < 0) { return rhs.rank < 0 && lhs.rank >= 0; } + + if (lhs.value != rhs.value) { + return lhs.value < rhs.value; + } else if (lhs.rank != rhs.rank) { + return lhs.rank < rhs.rank; + } else { + return lhs.position < rhs.position; + } + } +}; + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + extract_samples(const VAL* data, + const size_t volume, + Sample* samples, + const size_t num_local_samples, + const Sample init_sample, + const size_t offset, + const size_t rank) +{ + const size_t sample_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (sample_idx >= num_local_samples) return; + + if (num_local_samples < volume) { + const size_t index = (sample_idx + 1) * volume / num_local_samples - 1; + samples[offset + sample_idx].value = data[index]; + samples[offset + sample_idx].rank = rank; + samples[offset + sample_idx].position = index; + // printf("Sample rank %lu position %lu offset %lu\n", rank, index, (offset+sample_idx)); + } else { + // edge case where num_local_samples > volume + if (sample_idx < volume) { + samples[offset + sample_idx].value = data[sample_idx]; + samples[offset + sample_idx].rank = rank; + samples[offset + sample_idx].position = sample_idx; + } else { + samples[offset + sample_idx] = init_sample; + } + } +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + extract_split_positions(const VAL* data, + const size_t volume, + const Sample* samples, + const size_t num_samples, + size_t* split_positions, + const size_t num_splitters, + const size_t rank) +{ + const size_t splitter_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (splitter_idx >= num_splitters) return; + + const size_t index = (splitter_idx + 1) * num_samples / (num_splitters + 1) - 1; + const Sample splitter = samples[index]; + + // now perform search on data to receive position *after* last element to be + // part of the package for rank splitter_idx + if (rank > splitter.rank) { + // position of the last position with smaller value than splitter.value + 1 + split_positions[splitter_idx] = cub::LowerBound(data, volume, splitter.value); + } else if (rank < splitter.rank) { + // position of the first position with value larger than splitter.value + split_positions[splitter_idx] = cub::UpperBound(data, volume, splitter.value); + } else { + split_positions[splitter_idx] = splitter.position + 1; + } + // printf("Splitter position id %lu rank %lu position %lu num_samples %lu\n", splitter_idx, rank, + // split_positions[splitter_idx], num_samples); +} + +template +static SortPiece sample_sort_nccl(SortPiece local_sorted, + size_t my_rank, + size_t num_ranks, + bool argsort, + cudaStream_t stream, + ncclComm_t* comm) +{ + size_t volume = local_sorted.size; + + // collect local samples + size_t num_local_samples = num_ranks; // handle case numRanks > volume!! 
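For orientation, the kernels that follow implement the standard splitter selection of a sample sort: every rank draws num_ranks regularly spaced samples from its locally sorted chunk, the gathered samples are sorted globally, and every num_ranks-th usable entry becomes a splitter. A host-side NumPy sketch of that selection (simulated ranks in one process, hypothetical helper name, no NCCL; assumes each chunk holds at least num_ranks elements):

    import numpy as np

    def select_splitters(local_sorted_parts):
        num_ranks = len(local_sorted_parts)
        samples = []
        for part in local_sorted_parts:
            n = len(part)
            # num_ranks regularly spaced samples per rank, biased to segment ends
            idx = [(i + 1) * n // num_ranks - 1 for i in range(num_ranks)]
            samples.extend(part[j] for j in idx)
        samples = np.sort(np.asarray(samples))
        # every num_ranks-th sample becomes one of the num_ranks - 1 splitters
        return samples[num_ranks - 1 :: num_ranks][: num_ranks - 1]

    parts = [np.sort(np.random.default_rng(r).integers(0, 100, 16))
             for r in range(4)]
    print(select_splitters(parts))  # three split values for four ranks
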
+ size_t num_global_samples = num_local_samples * num_ranks; + auto samples = create_buffer>(num_global_samples, Memory::GPU_FB_MEM); + + Sample init_sample; + { + const size_t num_blocks = (num_local_samples + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + init_sample.rank = -1; // init samples that are not populated + size_t offset = num_local_samples * my_rank; + extract_samples<<>>(local_sorted.values.ptr(0), + volume, + samples.ptr(0), + num_local_samples, + init_sample, + offset, + my_rank); + } + + // AllGather: check alignment? as we want to receive data in-place we take exact size for now + CHECK_NCCL(ncclAllGather(samples.ptr(my_rank * num_ranks), + samples.ptr(0), + num_ranks * sizeof(Sample), + ncclInt8, + *comm, + stream)); + + // sort samples on device + thrust::stable_sort(thrust::cuda::par.on(stream), + samples.ptr(0), + samples.ptr(0) + num_global_samples, + SampleComparator()); + + auto lower_bound = thrust::lower_bound(thrust::cuda::par.on(stream), + samples.ptr(0), + samples.ptr(0) + num_global_samples, + init_sample, + SampleComparator()); + size_t num_usable_samples = lower_bound - samples.ptr(0); + + // select splitters / positions based on samples (on device) + const size_t num_splitters = num_ranks - 1; + auto split_positions = create_buffer(num_splitters, Memory::Z_COPY_MEM); + { + const size_t num_blocks = (num_splitters + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + VAL init_value = std::numeric_limits::max(); + extract_split_positions<<>>( + local_sorted.values.ptr(0), + volume, + samples.ptr(0), + num_usable_samples, + split_positions.ptr(0), + num_splitters, + my_rank); + } + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // collect sizes2send, send to rank i: local_sort_data from positions split_positions[i-1], + // split_positions[i] - 1 + auto size_send = create_buffer(num_ranks, Memory::Z_COPY_MEM); + { + size_t last_position = 0; + for (size_t rank = 0; rank < num_ranks - 1; ++rank) { + size_t cur_position = split_positions[rank]; + size_send[rank] = cur_position - last_position; + last_position = cur_position; + } + size_send[num_ranks - 1] = volume - last_position; + } + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // all2all exchange send/receive sizes + auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); + CHECK_NCCL(ncclGroupStart()); + for (int r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(size_send.ptr(r), 1, ncclUint64, r, *comm, stream)); + CHECK_NCCL(ncclRecv(size_recv.ptr(r), 1, ncclUint64, r, *comm, stream)); + } + CHECK_NCCL(ncclGroupEnd()); + + // need to sync as we share values in between host/device + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // allocate merge targets, data transfer... + std::vector> merge_buffers(num_ranks); + + for (int i = 0; i < merge_buffers.size(); ++i) { + // align buffer to allow data transfer of 16byte blocks + auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); + auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); + merge_buffers[i].values = create_buffer(buf_size, Memory::GPU_FB_MEM); + merge_buffers[i].indices = create_buffer(argsort ? 
buf_size : 0, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + } + size_t send_pos = 0; + CHECK_NCCL(ncclGroupStart()); + for (int r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), + get_aligned_size(size_send[r] * sizeof(VAL)), + ncclInt8, + r, + *comm, + stream)); + CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), + get_aligned_size(size_recv[r] * sizeof(VAL)), + ncclInt8, + r, + *comm, + stream)); + if (argsort) { + CHECK_NCCL( + ncclSend(local_sorted.indices.ptr(send_pos), size_send[r], ncclInt64, r, *comm, stream)); + CHECK_NCCL( + ncclRecv(merge_buffers[r].indices.ptr(0), size_recv[r], ncclInt64, r, *comm, stream)); + } + send_pos += size_send[r]; + } + CHECK_NCCL(ncclGroupEnd()); + + // now merge sort all into the result buffer + // maybe k-way merge is more efficient here... + for (size_t stride = 1; stride < num_ranks; stride *= 2) { + for (size_t pos = 0; pos + stride < num_ranks; pos += 2 * stride) { + SortPiece source1 = merge_buffers[pos]; + SortPiece source2 = merge_buffers[pos + stride]; + auto merged_size = source1.size + source2.size; + auto merged_values = create_buffer(merged_size); + auto merged_indices = source1.indices; // will be overriden for argsort + auto p_merged_values = merged_values.ptr(0); + auto p_values1 = source1.values.ptr(0); + auto p_values2 = source2.values.ptr(0); + if (argsort) { + merged_indices = create_buffer(merged_size); + // merge with key/value + auto p_indices1 = source1.indices.ptr(0); + auto p_indices2 = source2.indices.ptr(0); + auto p_merged_indices = merged_indices.ptr(0); + thrust::merge_by_key(thrust::cuda::par.on(stream), + p_values1, + p_values1 + source1.size, + p_values2, + p_values2 + source2.size, + p_indices1, + p_indices2, + p_merged_values, + p_merged_indices); + CHECK_CUDA(cudaStreamSynchronize(stream)); + source1.indices.destroy(); + } else { + thrust::merge(thrust::cuda::par.on(stream), + p_values1, + p_values1 + source1.size, + p_values2, + p_values2 + source2.size, + p_merged_values); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + source1.values.destroy(); + source2.values.destroy(); + source2.indices.destroy(); + + merge_buffers[pos].values = merged_values; + merge_buffers[pos].indices = merged_indices; + merge_buffers[pos].size = merged_size; + } + } + return merge_buffers[0]; } template @@ -299,41 +582,54 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); - bool dense = input.accessor.is_dense_row_major(rect); + size_t my_rank = getRank(domain, index_point); #ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; + std::cout << "GPU(" << my_rank << "): local size = " << volume << ", dist. = " << is_index_space + << ", index_point = " << index_point << ", domain/volume = " << domain << "/" + << domain.get_volume() << ", dense = " << input.accessor.is_dense_row_major(rect) + << ", argsort. 
= " << argsort << std::endl; #endif + assert(rect.empty() || input.accessor.is_dense_row_major(rect)); + auto stream = get_cached_stream(); - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(!is_index_space || DIM > 1); // not implemented for now + const size_t sort_dim_size = DIM == 1 ? volume : global_shape[DIM - 1]; // make a copy of the input auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - if (dense) { - cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); - } else { - const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - copy_into_buffer<<>>( - dense_input_copy.ptr(0), input, rect.lo, pitches, volume); - } + cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream); // we need a buffer for argsort auto indices_buffer = create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); + if (argsort && volume > 0) { + // intialize + if (DIM == 1) { + size_t offset = DIM > 1 ? 0 : rect.lo[0]; + thrust::sequence(thrust::cuda::par.on(stream), + indices_buffer.ptr(0), + indices_buffer.ptr(0) + volume, + offset); + } else { + thrust::transform(thrust::cuda::par.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(volume), + thrust::make_constant_iterator(sort_dim_size), + indices_buffer.ptr(0), + thrust::modulus()); + } + } // sort data local_sort_inplace(dense_input_copy.ptr(0), @@ -342,10 +638,32 @@ struct SortImplBody { sort_dim_size, stream); - // copy back data (we assume output partition to be aliged to input!) - if (dense) { + // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. + if (output_array.dim() == -1) { + SortPiece local_sorted; + local_sorted.values = dense_input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + SortPiece local_sorted_repartitioned = is_index_space + ? sample_sort_nccl(local_sorted, + my_rank, + domain.get_volume(), + argsort, + stream, + comms[0].get()) + : local_sorted; + if (argsort) { + output_array.return_data(local_sorted_repartitioned.indices, + local_sorted_repartitioned.size); + } else { + output_array.return_data(local_sorted_repartitioned.values, + local_sorted_repartitioned.size); + } + } else { + // copy back data (we assume output partition to be aliged to input!) 
if (argsort) { AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); cudaMemcpyAsync(output.ptr(rect.lo), indices_buffer.ptr(0), sizeof(int64_t) * volume, @@ -353,23 +671,13 @@ struct SortImplBody { stream); } else { AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); cudaMemcpyAsync(output.ptr(rect.lo), dense_input_copy.ptr(0), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); } - } else { - const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - copy_into_output<<>>( - output, indices_buffer.ptr(0), rect.lo, pitches, volume); - } else { - AccessorWO output = output_array.write_accessor(rect); - copy_into_output<<>>( - output, dense_input_copy.ptr(0), rect.lo, pitches, volume); - } } CHECK_CUDA(cudaStreamSynchronize(stream)); } diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 0e8cbe7e6..1416bd394 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -65,7 +65,8 @@ struct SortImplBody { const Legion::DomainPoint global_shape, const bool is_index_space, const Legion::DomainPoint index_point, - const Legion::Domain domain) + const Legion::Domain domain, + const std::vector& comms) { AccessorRO input = input_array.read_accessor(rect); diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 57ae935ad..2360f1068 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -37,16 +37,12 @@ static int getRank(Domain domain, DomainPoint index_point) template struct SortImpl { template - void operator()(SortArgs& args) const + void operator()(SortArgs& args, std::vector& comms) const { using VAL = legate_type_of; auto rect = args.input.shape(); - // we shall not return on empty rectangle in case of distributed data - // as the process might still participate in the parallel sort - if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; - Pitches pitches; size_t volume = pitches.flatten(rect); @@ -55,8 +51,8 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) -- not implemented yet - * b) ND-case: rect needs to be the full domain in that last dimension + * a) 1D-case: we need to perform parallel sort (e.g. 
via sampling) -- (only implemented for + * GPU) b) ND-case: rect needs to be the full domain in that last dimension * */ @@ -69,6 +65,10 @@ struct SortImpl { "multi-dimensional array should not be distributed in (sort) dimension"); #endif + // we shall not return on empty rectangle in case of distributed data + // as the process might still participate in the parallel sort + if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; + SortImplBody()(args.input, args.output, pitches, @@ -78,7 +78,8 @@ struct SortImpl { args.global_shape, args.is_index_space, args.task_index, - args.launch_domain); + args.launch_domain, + comms); } }; @@ -99,7 +100,8 @@ static void sort_template(TaskContext& context) !context.is_single_task(), context.get_task_index(), context.get_launch_domain()}; - double_dispatch(args.input.dim(), args.input.code(), SortImpl{}, args); + double_dispatch( + args.input.dim(), args.input.code(), SortImpl{}, args, context.communicators()); } } // namespace cunumeric diff --git a/tests/sort.py b/tests/sort.py index 3cb593abb..9cac07195 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -20,9 +20,9 @@ def compare_assert(a_np, a_num): if not num.allclose(a_np, a_num): - print("numpy:") + print("numpy, shape " + str(a_np.shape) + ":") print(a_np) - print("cuNumeric:") + print("cuNumeric, shape " + str(a_num.shape) + ":") print(a_num) assert False @@ -30,11 +30,11 @@ def compare_assert(a_np, a_num): def test_sort_axis(a_np, a_num, axis): compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") - sort_np = np.sort(a_np, axis) + sort_np = np.sort(a_np, axis, kind="stable") sort_num = num.sort(a_num, axis) compare_assert(sort_np, sort_num) - argsort_np = np.sort(a_np, axis) - argsort_num = num.sort(a_num, axis) + argsort_np = np.argsort(a_np, axis, kind="stable") + argsort_num = num.argsort(a_num, axis) compare_assert(argsort_np, argsort_num) @@ -48,7 +48,6 @@ def test_1D(): sortA_np = np.sort(A_np) print("Result numpy : " + str(sortA_np)) - # pdb.set_trace() sortA_num = num.sort(A_num) print("Result cunumeric: " + str(sortA_num)) compare_assert(sortA_np, sortA_num) @@ -116,24 +115,13 @@ def test_3D_complex(x_dim, y_dim, z_dim): def test_custom(): - # 4D still works, >=5D always falls back to numpy - a = np.arange(4 * 2 * 2 * 4).reshape(4, 2, 2, 4) - a_num = num.array(a) - - test_sort_axis(a, a_num, 1) - test_sort_axis(a, a_num, 2) - test_sort_axis(a, a_num, a.ndim - 1) - - a = np.arange(4 * 4 * 5 * 2 * 3 * 2 * 2 * 2 * 4).reshape( - 4, 4, 5, 2, 3, 2, 2, 2, 4 - ) + np.random.seed(42) + a = generate_random((4,), np.uint8) + print("Matrix A") + print(a) a_num = num.array(a) - - test_sort_axis(a, a_num, 1) - test_sort_axis(a, a_num, 2) - test_sort_axis(a, a_num, 7) - test_sort_axis(a, a_num, 4) + compare_assert(np.sort_complex(a), num.sort_complex(a_num)) return @@ -145,17 +133,21 @@ def test_api(a=None): # sort axes for i in range(a.ndim): + print("sort axis " + str(i)) compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) # flatten + print("sort flattened") compare_assert( np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) ) # msort + print("msort") compare_assert(np.msort(a), num.msort(a_num)) # sort_complex + print("sort_complex") compare_assert(np.sort_complex(a), num.sort_complex(a_num)) # reverse order sort @@ -174,11 +166,13 @@ def test_api(a=None): # argsort for i in range(a.ndim): compare_assert(a, a_num) + print("argsort axis " + str(i)) compare_assert( np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) ) # flatten + 
print("argsort flattened") compare_assert( np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) ) @@ -210,7 +204,7 @@ def generate_random(shape, datatype): else: print("UNKNOWN type " + str(datatype)) assert False - return a_np + return a_np.reshape(shape) def test_dtypes(): @@ -246,8 +240,6 @@ def test(): test_3D(51, 23, 17) print("\n\n ----------- 3D test (complex) -----\n") test_3D_complex(27, 30, 45) - # print("\n\n ----------- 4D/5D test-------------\n") - # test_custom() print("\n\n ----------- API test --------------\n") test_api() print("\n\n ----------- dtype test ------------\n") @@ -256,3 +248,4 @@ def test(): if __name__ == "__main__": test() + # test_custom() From 3eeebd2dd1af3941cee93438fd7cd228042baae5 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 01:24:18 -0800 Subject: [PATCH 26/49] remove explicit host memory type --- src/cunumeric/sort/sort_omp.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 1416bd394..0fe8b92a4 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -83,7 +83,7 @@ struct SortImplBody { assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input - auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::SOCKET_MEM); + auto dense_input_copy = create_buffer(volume); if (dense) { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); @@ -96,8 +96,7 @@ struct SortImplBody { } // we need a buffer for argsort - auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::SOCKET_MEM); + auto indices_buffer = create_buffer(argsort ? volume : 0); // sort data thrust_local_sort_inplace( From 6a7e736628a609192710a7624c666dd3b0a020f3 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 01:37:51 -0800 Subject: [PATCH 27/49] assume all data is dense according to mapping config --- src/cunumeric/sort/sort.cc | 26 +++----------------------- src/cunumeric/sort/sort_omp.cc | 27 +++------------------------ 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 2a7264a39..ca0bf1545 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -78,19 +78,15 @@ struct SortImplBody { #endif const size_t sort_dim_size = global_shape[DIM - 1]; + + assert(dense); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input auto dense_input_copy = create_buffer(volume); - if (dense) { + { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); - } else { - auto* target = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - target[offset] = input[rect.lo + point]; - } } // we need a buffer for argsort @@ -109,22 +105,6 @@ struct SortImplBody { AccessorWO output = output_array.write_accessor(rect); std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - } else { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - auto* source = indices_buffer.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } else { - AccessorWO output = output_array.write_accessor(rect); - auto* source = dense_input_copy.ptr(0); - for (size_t offset = 
0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } } } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 0fe8b92a4..ca9004f1e 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -80,19 +80,14 @@ struct SortImplBody { #endif const size_t sort_dim_size = global_shape[DIM - 1]; + assert(dense); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input auto dense_input_copy = create_buffer(volume); - if (dense) { + { auto* src = input.ptr(rect.lo); std::copy(src, src + volume, dense_input_copy.ptr(0)); - } else { - auto* target = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - target[offset] = input[rect.lo + point]; - } } // we need a buffer for argsort @@ -103,7 +98,7 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) - if (dense) { + { if (argsort) { AccessorWO output = output_array.write_accessor(rect); std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); @@ -111,22 +106,6 @@ struct SortImplBody { AccessorWO output = output_array.write_accessor(rect); std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } - } else { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - auto* source = indices_buffer.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } else { - AccessorWO output = output_array.write_accessor(rect); - auto* source = dense_input_copy.ptr(0); - for (size_t offset = 0; offset < volume; ++offset) { - auto point = pitches.unflatten(offset, rect.lo); - output[rect.lo + point] = source[offset]; - } - } } } }; From 7483a2b4c1fb2682c89d25f638bf8f2e91e6e138 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 7 Mar 2022 02:28:38 -0800 Subject: [PATCH 28/49] transform to complex datatype AFTER computation --- cunumeric/module.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index cc73b11b8..08c079479 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5860,13 +5860,12 @@ def sort_complex(a): Single GPU, Single CPU """ - # force complex result - if np.issubdtype(a.dtype, np.complexfloating): - out = a + result = sort(a) + # force complex result upon return + if np.issubdtype(result.dtype, np.complexfloating): + return result else: - out = a.astype(np.complex64, copy=True) - - return sort(out) + return result.astype(np.complex64, copy=True) # Searching From e6beb1d4c3c9245ba70c0c02304419d5af950c25 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 04:58:33 -0800 Subject: [PATCH 29/49] review changes python --- cunumeric/array.py | 6 +-- cunumeric/deferred.py | 19 +++----- cunumeric/eager.py | 2 +- cunumeric/module.py | 18 ++++--- cunumeric/{sorting => }/sorting.py | 39 +++++++-------- cunumeric/sorting/__init__.py | 16 ------ examples/sort.py | 78 +++++++++++++++++++----------- install.py | 2 +- tests/sort.py | 33 +------------ 9 files changed, 89 insertions(+), 124 deletions(-) rename cunumeric/{sorting => }/sorting.py (70%) delete mode 100644 cunumeric/sorting/__init__.py diff 
--git a/cunumeric/array.py b/cunumeric/array.py index 27ed88afb..c9090ed26 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2771,15 +2771,13 @@ def setflags(self, write=None, align=None, uic=None): """ self.__array__().setflags(write=write, align=align, uic=uic) - def sort(self, axis=-1, kind="stable", order=None): + def sort(self, axis=-1, kind="quicksort", order=None): self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order) - return - def argsort(self, axis=-1, kind="stable", order=None): + def argsort(self, axis=-1, kind="quicksort", order=None): self._thunk.sort( rhs=self._thunk, argsort=True, axis=axis, kind=kind, order=order ) - return def squeeze(self, axis=None): """a.squeeze(axis=None) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 095a9e1d6..654ebfa70 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,7 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky -from .sorting.sorting import sorting +from .sorting import sorting from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1544,16 +1544,13 @@ def unique(self): return result @auto_convert([1]) - def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): + + if kind == "stable": + stable = True + else: + stable = False - if kind != "stable": - self.runtime.warn( - "cuNumeric uses a different (stable) algorithm than " - + str(kind) - + " for sorting", - category=RuntimeWarning, - stacklevel=2, - ) if order is not None: raise NotImplementedError( "cuNumeric does not support sorting with 'order' as " @@ -1562,4 +1559,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - sorting(self, rhs, argsort, axis) + sorting(self, rhs, argsort, axis, stable) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 30f34e018..59127b985 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -502,7 +502,7 @@ def nonzero(self): result += (EagerArray(self.runtime, array),) return result - def sort(self, rhs, argsort=False, axis=-1, kind="stable", order=None): + def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): self.check_eager_args(rhs, axis, kind, order) if self.deferred is not None: self.deferred.sort(rhs, argsort, axis, kind, order) diff --git a/cunumeric/module.py b/cunumeric/module.py index 08c079479..3b94291b1 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5712,7 +5712,7 @@ def unique( @add_boilerplate("a") -def argsort(a, axis=-1, kind="stable", order=None): +def argsort(a, axis=-1, kind="quicksort", order=None): """ Returns the indices that would sort an array. @@ -5725,7 +5725,8 @@ def argsort(a, axis=-1, kind="stable", order=None): Axis to sort. By default, the index -1 (the last axis) is used. If None, the flattened array is used. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional - Currently only 'stable' sort is supported + Default is 'quicksort'. The underlying sort algorithm might vary. + The code basically supports 'stable' or *not* 'stable'. 
order : str or list of str, optional Currently not supported @@ -5746,7 +5747,7 @@ def argsort(a, axis=-1, kind="stable", order=None): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = ndarray(a.shape, np.int64) @@ -5782,13 +5783,13 @@ def msort(a): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ return sort(a, axis=0) @add_boilerplate("a") -def sort(a, axis=-1, kind="stable", order=None): +def sort(a, axis=-1, kind="quicksort", order=None): """ Returns a sorted copy of an array. @@ -5801,7 +5802,8 @@ def sort(a, axis=-1, kind="stable", order=None): Axis to sort. By default, the index -1 (the last axis) is used. If None, the flattened array is used. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional - Currently only 'stable' sort is supported + Default is 'quicksort'. The underlying sort algorithm might vary. + The code basically supports 'stable' or *not* 'stable'. order : str or list of str, optional Currently not supported @@ -5822,7 +5824,7 @@ def sort(a, axis=-1, kind="stable", order=None): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = ndarray(a.shape, a.dtype) result._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order) @@ -5857,7 +5859,7 @@ def sort_complex(a): Availability -------- - Single GPU, Single CPU + Multiple GPUs, Single CPU """ result = sort(a) diff --git a/cunumeric/sorting/sorting.py b/cunumeric/sorting.py similarity index 70% rename from cunumeric/sorting/sorting.py rename to cunumeric/sorting.py index b2c72a1a2..f8c56fa0b 100644 --- a/cunumeric/sorting/sorting.py +++ b/cunumeric/sorting.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
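The 'stable'/non-'stable' distinction drawn in the docstrings above only becomes observable with duplicate keys: a stable sort preserves the input order of equal elements, which pins down the argsort result. A quick NumPy illustration:

    import numpy as np

    a = np.array([1, 0, 1, 0, 1, 0])
    print(np.argsort(a, kind="stable"))     # [1 3 5 0 2 4]: ties keep input order
    print(np.argsort(a, kind="quicksort"))  # also valid, but tie order unspecified
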
@@ -19,44 +19,40 @@ from legate.core import types as ty -def sort_flattened(output, input, argsort): +def sort_flattened(output, input, argsort, stable): flattened = input.reshape((input.size,), order="C") - flattened_copy = output.runtime.create_empty_thunk( - flattened.shape, dtype=input.dtype, inputs=[input, flattened] - ) - flattened_copy.copy(flattened, deep=True) # run sort flattened -- return 1D solution sort_result = output.runtime.create_empty_thunk( - flattened_copy.shape, dtype=output.dtype, inputs=[flattened_copy] + flattened.shape, dtype=output.dtype, inputs=(flattened,) ) - sorting(sort_result, flattened_copy, argsort) + sorting(sort_result, flattened, argsort, stable=stable) output.base = sort_result.base output.numpy_array = None -def sort_swapped(output, input, argsort, sort_axis): +def sort_swapped(output, input, argsort, sort_axis, stable): assert sort_axis < input.ndim - 1 and sort_axis >= 0 # swap axes swapped = input.swapaxes(sort_axis, input.ndim - 1) swapped_copy = output.runtime.create_empty_thunk( - swapped.shape, dtype=input.dtype, inputs=[input, swapped] + swapped.shape, dtype=input.dtype, inputs=(input, swapped) ) swapped_copy.copy(swapped, deep=True) # run sort on last axis sort_result = output.runtime.create_empty_thunk( - swapped_copy.shape, dtype=output.dtype, inputs=[swapped_copy] + swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,) ) - sorting(sort_result, swapped_copy, argsort) + sorting(sort_result, swapped_copy, argsort, stable=stable) output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base output.numpy_array = None -def sort_task(output, input, argsort): +def sort_task(output, input, argsort, stable): task = output.context.create_task(CuNumericOpCode.SORT) needs_unbound_output = output.runtime.num_gpus > 1 and input.ndim == 1 @@ -80,6 +76,7 @@ def sort_task(output, input, argsort): task.add_scalar_arg(argsort, bool) # return indices flag task.add_scalar_arg(input.base.shape, (ty.int32,)) + task.add_scalar_arg(stable, bool) task.execute() if needs_unbound_output: @@ -87,20 +84,18 @@ def sort_task(output, input, argsort): output.numpy_array = None -def sorting(output, input, argsort, axis=-1): +def sorting(output, input, argsort, axis=-1, stable=False): if axis is None and input.ndim > 1: - sort_flattened(output, input, argsort) + sort_flattened(output, input, argsort, stable) else: if axis is None: - sort_axis = 0 + axis = 0 elif axis < 0: - sort_axis = input.ndim + axis - else: - sort_axis = axis + axis = input.ndim + axis - if sort_axis is not input.ndim - 1: - sort_swapped(output, input, argsort, sort_axis) + if axis is not input.ndim - 1: + sort_swapped(output, input, argsort, axis, stable) else: # run actual sort task - sort_task(output, input, argsort) + sort_task(output, input, argsort, stable) diff --git a/cunumeric/sorting/__init__.py b/cunumeric/sorting/__init__.py deleted file mode 100644 index 8988b3353..000000000 --- a/cunumeric/sorting/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
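The rewritten sorting() entry point normalizes the axis and then dispatches to one of three paths: flatten for axis=None, swap the sort axis to the back when it is not the last one, or hand the last axis straight to the task. A plain NumPy stand-in for that dispatch (hypothetical function, not thunk code):

    import numpy as np

    def sort_dispatch(a, axis=-1):
        if axis is None and a.ndim > 1:
            return np.sort(a.reshape(a.size))              # sort_flattened
        axis = 0 if axis is None else axis % a.ndim        # wrap negative axes
        if axis != a.ndim - 1:
            swapped = a.swapaxes(axis, a.ndim - 1).copy()  # sort_swapped
            return np.sort(swapped).swapaxes(a.ndim - 1, axis)
        return np.sort(a)                                  # direct task path

    x = np.random.default_rng(0).integers(0, 9, (3, 4))
    assert (sort_dispatch(x, 0) == np.sort(x, axis=0)).all()
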
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys as _sys diff --git a/examples/sort.py b/examples/sort.py index 21b503708..47c54f619 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2021 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,64 +16,66 @@ # import argparse -import datetime import numpy from benchmark import run_benchmark +from legate.timing import time import cunumeric -def check_sorted(a, a_numpy, axis=-1): - a_sorted = numpy.sort(a_numpy, axis) +def check_sorted(a, a_sorted, axis=-1): + a_numpy = a.__array__() + a_numpy_sorted = numpy.sort(a_numpy, axis) print("Checking result...") - if cunumeric.allclose(a_sorted, a): + if cunumeric.allclose(a_numpy_sorted, a_sorted): print("PASS!") else: print("FAIL!") - print("NUMPY : " + str(a_sorted)) - print("CUNUMERIC: " + str(a)) + print("NUMPY : " + str(a_numpy_sorted)) + print("CUNUMERIC: " + str(a_sorted)) + assert False -def run_sort(N, shape, axis, datatype, perform_check, timing): +def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): - numpy.random.seed(42) + cunumeric.random.seed(42) newtype = numpy.dtype(datatype).type + if shape is not None: + shape = tuple(shape) + else: + shape = (N,) if numpy.issubdtype(newtype, numpy.integer): - a_numpy = numpy.array( - numpy.random.randint( - numpy.iinfo(newtype).min, numpy.iinfo(newtype).max, size=N - ), - dtype=newtype, + if lower is None: + lower = numpy.iinfo(newtype).min + if upper is None: + upper = numpy.iinfo(newtype).max + a = cunumeric.random.randint(low=lower, high=upper, size=N).astype( + newtype ) + a = a.reshape(shape) elif numpy.issubdtype(newtype, numpy.floating): - a_numpy = numpy.array(numpy.random.random(size=N), dtype=newtype) + a = cunumeric.random.random(shape).astype(newtype) elif numpy.issubdtype(newtype, numpy.complexfloating): - a_numpy = numpy.array( - numpy.random.random(size=N) + numpy.random.random(size=N) * 1j, - dtype=newtype, - ) + a = cunumeric.array( + cunumeric.random.random(shape) + + cunumeric.random.random(shape) * 1j + ).astype(newtype) else: print("UNKNOWN type " + str(newtype)) assert False - if shape is not None: - a_numpy = a_numpy.reshape(tuple(shape)) - - a = cunumeric.array(a_numpy) - - start = datetime.datetime.now() + start = time() a_sorted = cunumeric.sort(a, axis) - stop = datetime.datetime.now() + stop = time() if perform_check: - check_sorted(a_sorted, a_numpy, axis) + check_sorted(a, a_sorted, axis) else: # do we need to synchronize? 
assert True - delta = stop - start - total = delta.total_seconds() * 1000.0 + total = (stop - start) * 1e-3 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -120,6 +122,22 @@ def run_sort(N, shape, axis, datatype, perform_check, timing): dest="datatype", help="data type (default numpy.int32)", ) + parser.add_argument( + "-l", + "--lower", + type=int, + default=None, + dest="lower", + help="lower bound for integer based arrays (inclusive)", + ) + parser.add_argument( + "-u", + "--upper", + type=int, + default=None, + dest="upper", + help="upper bound for integer based arrays (exclusive)", + ) parser.add_argument( "-a", "--axis", @@ -148,6 +166,8 @@ def run_sort(N, shape, axis, datatype, perform_check, timing): args.shape, args.axis, args.datatype, + args.lower, + args.upper, args.check, args.timing, ), diff --git a/install.py b/install.py index 45a9281f6..ad3581810 100755 --- a/install.py +++ b/install.py @@ -160,7 +160,7 @@ def install_openblas(openblas_dir, thread_count, verbose): git_clone( temp_dir, url="https://github.com/xianyi/OpenBLAS.git", - tag="v0.3.19", + tag="v0.3.15", verbose=verbose, ) # We can just build this directly diff --git a/tests/sort.py b/tests/sort.py index 9cac07195..50b705364 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -1,4 +1,4 @@ -# Copyright 2021 NVIDIA Corporation +# Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -56,8 +56,6 @@ def test_1D(): print("Result (inplace): " + str(A_num)) compare_assert(sortA_np, A_num) - return - def test_2D(): np.random.seed(42) @@ -75,8 +73,6 @@ def test_2D(): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - def test_3D(x_dim, y_dim, z_dim): np.random.seed(42) @@ -93,8 +89,6 @@ def test_3D(x_dim, y_dim, z_dim): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - def test_3D_complex(x_dim, y_dim, z_dim): np.random.seed(42) @@ -111,20 +105,6 @@ def test_3D_complex(x_dim, y_dim, z_dim): test_sort_axis(A_np, A_num, 0) test_sort_axis(A_np, A_num, axis=None) - return - - -def test_custom(): - np.random.seed(42) - a = generate_random((4,), np.uint8) - print("Matrix A") - print(a) - - a_num = num.array(a) - compare_assert(np.sort_complex(a), num.sort_complex(a_num)) - - return - def test_api(a=None): if a is None: @@ -150,9 +130,6 @@ def test_api(a=None): print("sort_complex") compare_assert(np.sort_complex(a), num.sort_complex(a_num)) - # reverse order sort - # TODO - # in-place sort copy_a = a.copy() copy_a_num = a_num.copy() @@ -160,9 +137,6 @@ def test_api(a=None): copy_a_num.sort() compare_assert(copy_a, copy_a_num) - # reverse order sort (in place) - # TODO - # argsort for i in range(a.ndim): compare_assert(a, a_num) @@ -177,8 +151,6 @@ def test_api(a=None): np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) ) - return - def generate_random(shape, datatype): print("Generate random for " + str(datatype)) @@ -228,8 +200,6 @@ def test_dtypes(): test_api(generate_random((2, 5, 7), np.complex128)) test_api(generate_random((220,), np.complex128)) - return - def test(): print("\n\n ----------- 1D test ---------------\n") @@ -248,4 +218,3 @@ def test(): if __name__ == "__main__": test() - # test_custom() From c7fee9902c98b7d60e29952bc9a97d54c70ceb13 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 09:40:48 -0800 Subject: [PATCH 30/49] review changes C++ signatures and cleanup --- 
src/cunumeric/sort/sort.cc | 39 +++------ src/cunumeric/sort/sort.cu | 120 ++++++++++----------------- src/cunumeric/sort/sort.h | 26 ++---- src/cunumeric/sort/sort_omp.cc | 38 +++------ src/cunumeric/sort/sort_template.inl | 41 ++++----- 5 files changed, 93 insertions(+), 171 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index ca0bf1545..740622b68 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,27 +59,16 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - bool dense = input.accessor.is_dense_row_major(rect); - -#ifdef DEBUG_CUNUMERIC - std::cout << "CPU(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; -#endif - - const size_t sort_dim_size = global_shape[DIM - 1]; - - assert(dense); + auto input = input_array.read_accessor(rect); + assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input @@ -97,14 +86,12 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) - if (dense) { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); - } else { - AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); - } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 6a25249e9..b63d61f8e 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
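With the slimmed-down signature, the CPU body reduces to a segmented local sort: the dense buffer holds volume / sort_dim_size independent rows and each row is sorted on its own. A NumPy sketch of that inner loop (hypothetical helper name):

    import numpy as np

    def segmented_sort(flat, sort_dim_size):
        # one independent sort per row of the flattened buffer
        out = flat.copy()
        for start in range(0, len(out), sort_dim_size):
            out[start:start + sort_dim_size].sort(kind="stable")
        return out

    x = np.array([3, 1, 2, 9, 7, 8], dtype=np.int32)
    print(segmented_sort(x, 3))  # [1 2 3 7 8 9]
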
@@ -76,13 +76,14 @@ void cub_local_sort_inplace( { // make a copy of input --> we want inptr to return sorted values auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream); + CHECK_CUDA( + cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); size_t temp_storage_bytes = 0; if (argptr == nullptr) { if (volume == sort_dim_size) { - // sort + // sort (initial call to compute bufffer size) cub::DeviceRadixSort::SortKeys( - NULL, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), @@ -94,13 +95,13 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented sort + // segmented sort (initial call to compute bufffer size) auto off_start_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); auto off_end_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - cub::DeviceSegmentedRadixSort::SortKeys(NULL, + cub::DeviceSegmentedRadixSort::SortKeys(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -128,12 +129,12 @@ void cub_local_sort_inplace( } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync( - idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream); + CHECK_CUDA(cudaMemcpyAsync( + idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); if (volume == sort_dim_size) { - // argsort - cub::DeviceRadixSort::SortPairs(NULL, + // argsort (initial call to compute bufffer size) + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -158,13 +159,13 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented argsort + // segmented argsort (initial call to compute bufffer size) auto off_start_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); auto off_end_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); - cub::DeviceSegmentedRadixSort::SortPairs(NULL, + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, @@ -276,19 +277,6 @@ void local_sort_inplace(legate_type_of* inptr, if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } } -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - print_subset(const VAL* data, const size_t volume, const size_t rank) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx == 0) { - printf("data(%d) = [ ", rank); - for (int i = 0; i < volume; ++i) { printf("%d ", data[i]); } - printf("]\n"); - } -} - // auto align to multiples of 16 bytes auto get_aligned_size = [](auto size) { return std::max(16, (size + 15) / 16 * 16); }; @@ -341,7 +329,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) samples[offset + sample_idx].value = data[index]; samples[offset + sample_idx].rank = rank; samples[offset + sample_idx].position = index; - // printf("Sample rank %lu position %lu offset %lu\n", rank, index, (offset+sample_idx)); } else { // edge case where 
num_local_samples > volume if (sample_idx < volume) { @@ -381,8 +368,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) } else { split_positions[splitter_idx] = splitter.position + 1; } - // printf("Splitter position id %lu rank %lu position %lu num_samples %lu\n", splitter_idx, rank, - // split_positions[splitter_idx], num_samples); } template @@ -473,7 +458,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); - for (int r = 0; r < num_ranks; r++) { + for (size_t r = 0; r < num_ranks; r++) { CHECK_NCCL(ncclSend(size_send.ptr(r), 1, ncclUint64, r, *comm, stream)); CHECK_NCCL(ncclRecv(size_recv.ptr(r), 1, ncclUint64, r, *comm, stream)); } @@ -485,7 +470,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // allocate merge targets, data transfer... std::vector> merge_buffers(num_ranks); - for (int i = 0; i < merge_buffers.size(); ++i) { + for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); @@ -495,7 +480,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, } size_t send_pos = 0; CHECK_NCCL(ncclGroupStart()); - for (int r = 0; r < num_ranks; r++) { + for (size_t r = 0; r < num_ranks; r++) { CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), get_aligned_size(size_send[r] * sizeof(VAL)), ncclInt8, @@ -545,7 +530,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, p_indices2, p_merged_values, p_merged_indices); - CHECK_CUDA(cudaStreamSynchronize(stream)); source1.indices.destroy(); } else { thrust::merge(thrust::cuda::par.on(stream), @@ -554,7 +538,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, p_values2, p_values2 + source2.size, p_merged_values); - CHECK_CUDA(cudaStreamSynchronize(stream)); } source1.values.destroy(); @@ -578,37 +561,28 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - size_t my_rank = getRank(domain, index_point); - -#ifdef DEBUG_CUNUMERIC - std::cout << "GPU(" << my_rank << "): local size = " << volume << ", dist. = " << is_index_space - << ", index_point = " << index_point << ", domain/volume = " << domain << "/" - << domain.get_volume() << ", dense = " << input.accessor.is_dense_row_major(rect) - << ", argsort. = " << argsort << std::endl; -#endif + auto input = input_array.read_accessor(rect); + // we allow empty domains for distributed sorting assert(rect.empty() || input.accessor.is_dense_row_major(rect)); auto stream = get_cached_stream(); - const size_t sort_dim_size = DIM == 1 ? 
volume : global_shape[DIM - 1]; - // make a copy of the input auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(dense_input_copy.ptr(0), + input.ptr(rect.lo), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream)); // we need a buffer for argsort auto indices_buffer = @@ -641,17 +615,14 @@ struct SortImplBody { // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. if (output_array.dim() == -1) { SortPiece local_sorted; - local_sorted.values = dense_input_copy; - local_sorted.indices = indices_buffer; - local_sorted.size = volume; - SortPiece local_sorted_repartitioned = is_index_space - ? sample_sort_nccl(local_sorted, - my_rank, - domain.get_volume(), - argsort, - stream, - comms[0].get()) - : local_sorted; + local_sorted.values = dense_input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + SortPiece local_sorted_repartitioned = + is_index_space + ? sample_sort_nccl( + local_sorted, local_rank, num_ranks, argsort, stream, comms[0].get()) + : local_sorted; if (argsort) { output_array.return_data(local_sorted_repartitioned.indices, local_sorted_repartitioned.size); @@ -664,22 +635,21 @@ struct SortImplBody { if (argsort) { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); - cudaMemcpyAsync(output.ptr(rect.lo), - indices_buffer.ptr(0), - sizeof(int64_t) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), + indices_buffer.ptr(0), + sizeof(int64_t) * volume, + cudaMemcpyDeviceToDevice, + stream)); } else { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); - cudaMemcpyAsync(output.ptr(rect.lo), - dense_input_copy.ptr(0), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream); + CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), + dense_input_copy.ptr(0), + sizeof(VAL) * volume, + cudaMemcpyDeviceToDevice, + stream)); } } - CHECK_CUDA(cudaStreamSynchronize(stream)); } }; diff --git a/src/cunumeric/sort/sort.h b/src/cunumeric/sort/sort.h index b915df838..b6dc88b5d 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cunumeric/sort/sort.h @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
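The branch above initializes the argsort payload differently per case: a 1-D (possibly distributed) sort needs globally valid indices starting at the tile offset rect.lo[0], while an N-D sort needs 0..sort_dim_size-1 repeated for every row, computed with a modulus. A NumPy equivalent (hypothetical helper name):

    import numpy as np

    def init_indices(volume, sort_dim_size, tile_offset, one_dim):
        if one_dim:
            # global positions, like thrust::sequence with an offset
            return np.arange(tile_offset, tile_offset + volume, dtype=np.int64)
        # per-row positions, like the thrust::transform with modulus
        return np.arange(volume, dtype=np.int64) % sort_dim_size

    print(init_indices(6, 3, 0, one_dim=False))  # [0 1 2 0 1 2]
    print(init_indices(4, 4, 10, one_dim=True))  # [10 11 12 13]
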
@@ -24,10 +24,11 @@ struct SortArgs { const Array& input; Array& output; bool argsort; - Legion::DomainPoint global_shape; + bool stable; + size_t sort_dim_size; bool is_index_space; - Legion::DomainPoint task_index; - Legion::Domain launch_domain; + size_t local_rank; + size_t num_ranks; }; template @@ -37,23 +38,6 @@ struct SampleEntry { size_t local_id; }; -template -struct SampleEntryComparator { - bool operator()(const SampleEntry& a, const SampleEntry& b) const - { - if (a.value < b.value) { - return true; - } else if (a.value == b.value) { - if (a.rank < b.rank) { - return true; - } else if (a.rank == b.rank) { - return a.local_id < b.local_id; - } - } - return false; - } -}; - class SortTask : public CuNumericTask { public: static const int TASK_ID = CUNUMERIC_SORT; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index ca9004f1e..54ef40109 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,26 +61,16 @@ struct SortImplBody { const Pitches& pitches, const Rect& rect, const size_t volume, + const size_t sort_dim_size, const bool argsort, - const Legion::DomainPoint global_shape, + const bool stable, const bool is_index_space, - const Legion::DomainPoint index_point, - const Legion::Domain domain, + const size_t local_rank, + const size_t num_ranks, const std::vector& comms) { - AccessorRO input = input_array.read_accessor(rect); - - bool dense = input.accessor.is_dense_row_major(rect); - -#ifdef DEBUG_CUNUMERIC - std::cout << "OMP(" << getRank(domain, index_point) << "): local size = " << volume - << ", dist. = " << is_index_space << ", index_point = " << index_point - << ", domain/volume = " << domain << "/" << domain.get_volume() - << ", dense = " << dense << ", argsort. = " << argsort << std::endl; -#endif - - const size_t sort_dim_size = global_shape[DIM - 1]; - assert(dense); + auto input = input_array.read_accessor(rect); + assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now // make a copy of the input @@ -98,14 +88,12 @@ struct SortImplBody { dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); // copy back data (we assume output partition to be aliged to input!) 
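The SampleEntryComparator removed from the header above encoded nothing more than lexicographic order on (value, rank, local_id); the device-side SampleComparator adds one twist, pushing unused samples (rank < 0) behind all used ones. A hedged Python rendering of that ordering (not library code):

    def sample_key(sample):
        value, rank, position = sample
        # leading flag sends rank < 0 (unused) entries to the back
        return (rank < 0, value, rank, position)

    samples = [(7, 1, 0), (3, -1, 0), (7, 0, 5), (2, 2, 1)]
    print(sorted(samples, key=sample_key))
    # [(2, 2, 1), (7, 0, 5), (7, 1, 0), (3, -1, 0)]
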
- { - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); - } else { - AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); - } + if (argsort) { + AccessorWO output = output_array.write_accessor(rect); + std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); + } else { + AccessorWO output = output_array.write_accessor(rect); + std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 2360f1068..593b7cc21 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ using namespace legate; template struct SortImplBody; -static int getRank(Domain domain, DomainPoint index_point) +static int get_rank(Domain domain, DomainPoint index_point) { int domain_index = 0; for (int i = 0; i < domain.get_dim(); ++i) { @@ -46,6 +46,8 @@ struct SortImpl { Pitches pitches; size_t volume = pitches.flatten(rect); + size_t sort_dim_size = std::min(args.sort_dim_size, volume); + /* * Assumptions: * 1. Sort is always requested for the 'last' dimension within rect @@ -56,15 +58,6 @@ struct SortImpl { * */ -#ifdef DEBUG_CUNUMERIC - std::cout << "DIM=" << DIM << ", rect=" << rect << ", shape=" << args.global_shape - << ", argsort=" << args.argsort << ", sort_dim_size=" << args.global_shape[DIM - 1] - << std::endl; - - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.global_shape[DIM - 1])) && - "multi-dimensional array should not be distributed in (sort) dimension"); -#endif - // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; @@ -74,11 +67,12 @@ struct SortImpl { pitches, rect, volume, + sort_dim_size, args.argsort, - args.global_shape, + args.stable, args.is_index_space, - args.task_index, - args.launch_domain, + args.local_rank, + args.num_ranks, comms); } }; @@ -86,20 +80,19 @@ struct SortImpl { template static void sort_template(TaskContext& context) { - DomainPoint global_shape; - { - auto shape_span = context.scalars()[1].values(); - global_shape.dim = shape_span.size(); - for (int32_t dim = 0; dim < global_shape.dim; ++dim) { global_shape[dim] = shape_span[dim]; } - } + auto shape_span = context.scalars()[1].values(); + size_t sort_dim_size = shape_span[shape_span.size() - 1]; + size_t local_rank = get_rank(context.get_launch_domain(), context.get_task_index()); + size_t num_ranks = context.get_launch_domain().get_volume(); SortArgs args{context.inputs()[0], context.outputs()[0], - context.scalars()[0].value(), - global_shape, + context.scalars()[0].value(), // argsort + context.scalars()[2].value(), // stable + sort_dim_size, !context.is_single_task(), - context.get_task_index(), - context.get_launch_domain()}; + local_rank, + num_ranks}; double_dispatch( args.input.dim(), args.input.code(), SortImpl{}, args, context.communicators()); } From 5a0204796908cbd320a438c745a89d3de2ba2d24 Mon Sep 17 00:00:00 
2001 From: Malte Foerster Date: Wed, 9 Mar 2022 09:45:53 -0800 Subject: [PATCH 31/49] non-stable sort for primitive values --- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 10 +++++----- src/cunumeric/sort/sort_omp.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 740622b68..6f40927dc 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -40,7 +40,7 @@ struct SortImplBody { if (argptr == nullptr) { // sort (in place) for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { // argsort diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index b63d61f8e..6d61c9226 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -205,7 +205,7 @@ void thrust_local_sort_inplace( { if (argptr == nullptr) { if (volume == sort_dim_size) { - thrust::stable_sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + thrust::sort(thrust::cuda::par.on(stream), inptr, inptr + volume); } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -217,10 +217,10 @@ void thrust_local_sort_inplace( thrust::divides()); auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); - thrust::stable_sort(thrust::cuda::par.on(stream), - combined, - combined + volume, - thrust::less>()); + thrust::sort(thrust::cuda::par.on(stream), + combined, + combined + volume, + thrust::less>()); } } else { if (volume == sort_dim_size) { diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 54ef40109..b97c66a8a 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -41,7 +41,7 @@ struct SortImplBody { // sort (in place) #pragma omp parallel for for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) { - thrust::stable_sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); + thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size); } } else { // argsort From 0c1805f22a0599171efe432a31f1a8ba19e517c4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 11:48:30 -0800 Subject: [PATCH 32/49] remove copies where possible --- src/cunumeric/sort/sort.cc | 34 ++++++------ src/cunumeric/sort/sort.cu | 99 +++++++++++++++++----------------- src/cunumeric/sort/sort_omp.cc | 34 ++++++------ 3 files changed, 85 insertions(+), 82 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 6f40927dc..f20169284 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -71,27 +71,29 @@ struct SortImplBody { assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now - // make a copy of the input - auto dense_input_copy = create_buffer(volume); - { - auto* src = input.ptr(rect.lo); - std::copy(src, src + volume, dense_input_copy.ptr(0)); - } + if (argsort) { + // make copy of the input + auto dense_input_copy = create_buffer(volume); + { + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); + } - // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? 
volume : 0); + AccessorWO output = output_array.write_accessor(rect); - // sort data - thrust_local_sort_inplace( - dense_input_copy.ptr(0), argsort ? indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + // sort data in place + thrust_local_sort_inplace( + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); + + // init output values + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, output.ptr(rect.lo)); + + // sort data in place + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 6d61c9226..1cd60aceb 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -263,7 +263,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - if (volume > 0) { cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } template ::value>* = nullptr> @@ -274,7 +274,7 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - if (volume > 0) { thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); } // auto align to multiples of 16 bytes @@ -576,48 +576,66 @@ struct SortImplBody { auto stream = get_cached_stream(); - // make a copy of the input - auto dense_input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA(cudaMemcpyAsync(dense_input_copy.ptr(0), - input.ptr(rect.lo), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream)); - - // we need a buffer for argsort - auto indices_buffer = - create_buffer(argsort ? volume : 0, Legion::Memory::Kind::GPU_FB_MEM); - if (argsort && volume > 0) { - // intialize + // initialize sort pointers + SortPiece local_sorted; + int64_t* indices_ptr = nullptr; + VAL* values_ptr = nullptr; + if (argsort) { + // make a buffer for input + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_ptr = input_copy.ptr(0); + + // initialize indices + if (output_array.dim() == -1) { + auto indices_buffer = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + indices_ptr = indices_buffer.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = indices_buffer; + local_sorted.size = volume; + } else { + AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); + indices_ptr = output.ptr(rect.lo); + } if (DIM == 1) { size_t offset = DIM > 1 ? 
0 : rect.lo[0]; - thrust::sequence(thrust::cuda::par.on(stream), - indices_buffer.ptr(0), - indices_buffer.ptr(0) + volume, - offset); + if (volume > 0) { + thrust::sequence(thrust::cuda::par.on(stream), indices_ptr, indices_ptr + volume, offset); + } } else { thrust::transform(thrust::cuda::par.on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(volume), thrust::make_constant_iterator(sort_dim_size), - indices_buffer.ptr(0), + indices_ptr, thrust::modulus()); } + } else { + // initialize output + if (output_array.dim() == -1) { + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_ptr = input_copy.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = create_buffer(0, Legion::Memory::Kind::GPU_FB_MEM); + ; + local_sorted.size = volume; + } else { + AccessorWO output = output_array.write_accessor(rect); + assert(output.accessor.is_dense_row_major(rect)); + values_ptr = output.ptr(rect.lo); + } } - // sort data - local_sort_inplace(dense_input_copy.ptr(0), - argsort ? indices_buffer.ptr(0) : nullptr, - volume, - sort_dim_size, - stream); + if (volume > 0) { + CHECK_CUDA(cudaMemcpyAsync( + values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); - // this is linked to the decision in sorting.py on when to use adn 'unbounded' output array. + // sort data (locally) + local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stream); + } + + // this is linked to the decision in sorting.py on when to use an 'unbounded' output array. if (output_array.dim() == -1) { - SortPiece local_sorted; - local_sorted.values = dense_input_copy; - local_sorted.indices = indices_buffer; - local_sorted.size = volume; SortPiece local_sorted_repartitioned = is_index_space ? sample_sort_nccl( @@ -630,25 +648,6 @@ struct SortImplBody { output_array.return_data(local_sorted_repartitioned.values, local_sorted_repartitioned.size); } - } else { - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - assert(output.accessor.is_dense_row_major(rect)); - CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), - indices_buffer.ptr(0), - sizeof(int64_t) * volume, - cudaMemcpyDeviceToDevice, - stream)); - } else { - AccessorWO output = output_array.write_accessor(rect); - assert(output.accessor.is_dense_row_major(rect)); - CHECK_CUDA(cudaMemcpyAsync(output.ptr(rect.lo), - dense_input_copy.ptr(0), - sizeof(VAL) * volume, - cudaMemcpyDeviceToDevice, - stream)); - } } } }; diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index b97c66a8a..192a75333 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -73,27 +73,29 @@ struct SortImplBody { assert(input.accessor.is_dense_row_major(rect)); assert(!is_index_space || DIM > 1); // not implemented for now - // make a copy of the input - auto dense_input_copy = create_buffer(volume); - { - auto* src = input.ptr(rect.lo); - std::copy(src, src + volume, dense_input_copy.ptr(0)); - } + if (argsort) { + // make copy of the input + auto dense_input_copy = create_buffer(volume); + { + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, dense_input_copy.ptr(0)); + } - // we need a buffer for argsort - auto indices_buffer = create_buffer(argsort ? volume : 0); + AccessorWO output = output_array.write_accessor(rect); - // sort data - thrust_local_sort_inplace( - dense_input_copy.ptr(0), argsort ? 
indices_buffer.ptr(0) : nullptr, volume, sort_dim_size); + // sort data in place + thrust_local_sort_inplace( + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); - // copy back data (we assume output partition to be aliged to input!) - if (argsort) { - AccessorWO output = output_array.write_accessor(rect); - std::copy(indices_buffer.ptr(0), indices_buffer.ptr(0) + volume, output.ptr(rect.lo)); } else { AccessorWO output = output_array.write_accessor(rect); - std::copy(dense_input_copy.ptr(0), dense_input_copy.ptr(0) + volume, output.ptr(rect.lo)); + + // init output values + auto* src = input.ptr(rect.lo); + std::copy(src, src + volume, output.ptr(rect.lo)); + + // sort data in place + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); } } }; From 461ae2b12857ff68d2c9a549b6bfdb87a89b5404 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:08:19 -0800 Subject: [PATCH 33/49] fix eager test with new default non-stable sort --- tests/sort.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/sort.py b/tests/sort.py index 50b705364..4937de7ec 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -114,12 +114,16 @@ def test_api(a=None): # sort axes for i in range(a.ndim): print("sort axis " + str(i)) - compare_assert(np.sort(a, axis=i, kind="stable"), num.sort(a_num, i)) + compare_assert( + np.sort(a, axis=i, kind="stable"), + num.sort(a_num, i, kind="stable"), + ) # flatten print("sort flattened") compare_assert( - np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None) + np.sort(a, axis=None, kind="stable"), + num.sort(a_num, axis=None, kind="stable"), ) # msort @@ -142,13 +146,15 @@ def test_api(a=None): compare_assert(a, a_num) print("argsort axis " + str(i)) compare_assert( - np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i) + np.argsort(a, axis=i, kind="stable"), + num.argsort(a_num, axis=i, kind="stable"), ) # flatten print("argsort flattened") compare_assert( - np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None) + np.argsort(a, axis=None, kind="stable"), + num.argsort(a_num, axis=None, kind="stable"), ) From 763b99c5cbf9f87169a442e40f54f7ad2e73b2f1 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:08:56 -0800 Subject: [PATCH 34/49] fix naming conventions --- examples/sort.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/sort.py b/examples/sort.py index 47c54f619..6713da232 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -17,57 +17,54 @@ import argparse -import numpy +import numpy as np from benchmark import run_benchmark from legate.timing import time -import cunumeric +import cunumeric as num def check_sorted(a, a_sorted, axis=-1): - a_numpy = a.__array__() - a_numpy_sorted = numpy.sort(a_numpy, axis) + a_np = a.__array__() + a_np_sorted = np.sort(a_np, axis) print("Checking result...") - if cunumeric.allclose(a_numpy_sorted, a_sorted): + if num.allclose(a_np_sorted, a_sorted): print("PASS!") else: print("FAIL!") - print("NUMPY : " + str(a_numpy_sorted)) + print("NUMPY : " + str(a_np_sorted)) print("CUNUMERIC: " + str(a_sorted)) assert False def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): - cunumeric.random.seed(42) - newtype = numpy.dtype(datatype).type + num.random.seed(42) + newtype = np.dtype(datatype).type if shape is not None: shape = tuple(shape) else: shape = (N,) - if numpy.issubdtype(newtype, 
numpy.integer): if lower is None: - lower = numpy.iinfo(newtype).min + lower = np.iinfo(newtype).min if upper is None: - upper = numpy.iinfo(newtype).max - a = cunumeric.random.randint(low=lower, high=upper, size=N).astype( - newtype - ) + upper = np.iinfo(newtype).max + a = num.random.randint(low=lower, high=upper, size=N).astype(newtype) a = a.reshape(shape) - elif numpy.issubdtype(newtype, numpy.floating): - a = cunumeric.random.random(shape).astype(newtype) - elif numpy.issubdtype(newtype, numpy.complexfloating): - a = cunumeric.array( - cunumeric.random.random(shape) - + cunumeric.random.random(shape) * 1j + elif np.issubdtype(newtype, np.floating): + a = num.random.random(shape).astype(newtype) + elif np.issubdtype(newtype, np.complexfloating): + a = num.array( + num.random.random(shape) + num.random.random(shape) * 1j ).astype(newtype) else: print("UNKNOWN type " + str(newtype)) assert False start = time() - a_sorted = cunumeric.sort(a, axis) + a_sorted = num.sort(a, axis) stop = time() if perform_check: @@ -120,7 +117,7 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): type=str, default="uint32", dest="datatype", - help="data type (default numpy.int32)", + help="data type (default np.int32)", ) parser.add_argument( "-l", From b210b69e6f84e6242854f0a8a96a316156a12ea2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:15:09 -0800 Subject: [PATCH 35/49] minor adjustments, comments --- src/cunumeric/sort/sort.cc | 2 +- src/cunumeric/sort/sort.cu | 9 ++++----- src/cunumeric/sort/sort_omp.cc | 2 +- src/cunumeric/sort/sort_template.inl | 14 ++++++++++++-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index f20169284..137c4ea21 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -69,7 +69,7 @@ struct SortImplBody { { auto input = input_array.read_accessor(rect); assert(input.accessor.is_dense_row_major(rect)); - assert(!is_index_space || DIM > 1); // not implemented for now + assert(!is_index_space || DIM > 1); if (argsort) { // make copy of the input diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 1cd60aceb..8eb18762d 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -380,8 +380,10 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, { size_t volume = local_sorted.size; - // collect local samples - size_t num_local_samples = num_ranks; // handle case numRanks > volume!!
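The sample collection reworked here follows classic sample sort: every rank contributes p = num_ranks samples, the p * p gathered samples are sorted, and every p-th one becomes a split point, so each rank ends up with roughly volume / num_ranks elements (worst case about twice that, as the new comment notes). A hedged host-side sketch of the splitter selection with illustrative names — the actual code additionally keys samples on rank and local position to break ties:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // samples holds p * p values: p local samples gathered from each of p ranks.
    // Returns the p - 1 splitters that partition the value range across ranks.
    std::vector<float> pick_splitters(std::vector<float> samples, std::size_t p)
    {
      std::sort(samples.begin(), samples.end());
      std::vector<float> splitters(p - 1);
      for (std::size_t r = 1; r < p; ++r) { splitters[r - 1] = samples[r * p - 1]; }
      return splitters;
    }
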
+ // collect local samples - for now we take num_ranks samples for every node + // worst case this leads to 2*N/ranks elements on a single node + size_t num_local_samples = num_ranks; + size_t num_global_samples = num_local_samples * num_ranks; auto samples = create_buffer>(num_global_samples, Memory::GPU_FB_MEM); @@ -452,9 +454,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, size_send[num_ranks - 1] = volume - last_position; } - // need to sync as we share values in between host/device - CHECK_CUDA(cudaStreamSynchronize(stream)); - // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 192a75333..53ec8f503 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -71,7 +71,7 @@ struct SortImplBody { { auto input = input_array.read_accessor(rect); assert(input.accessor.is_dense_row_major(rect)); - assert(!is_index_space || DIM > 1); // not implemented for now + assert(!is_index_space || DIM > 1); if (argsort) { // make copy of the input diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 593b7cc21..75472925c 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -53,11 +53,21 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we need to perform parallel sort (e.g. via sampling) -- (only implemented for - * GPU) b) ND-case: rect needs to be the full domain in that last dimension + * a) 1D-case: we perform parallel sort (via sampling) -- (only implemented for GPU) + * b) ND-case: rect needs to be the full domain in that last dimension * */ +#ifdef DEBUG_CUNUMERIC + std::cout << typeid(KIND).name() << "(" << args.local_rank << "/" << args.num_ranks + << "): volume = " << volume << ", DIM=" << DIM << ", rect=" << rect + << ", dist. = " << args.is_index_space << ", stable. = " << args.stable + << ", argsort. 
= " << args.argsort << std::endl; + + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in (sort) dimension"); +#endif + // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From d945468e4753b4434a94bf78b162d9c7d7049838 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 12:33:37 -0800 Subject: [PATCH 36/49] argsort also allows non-stable sort --- src/cunumeric/sort/sort.cc | 16 ++++++++---- src/cunumeric/sort/sort.cu | 38 ++++++++++++++++++++-------- src/cunumeric/sort/sort_omp.cc | 16 ++++++++---- src/cunumeric/sort/sort_template.inl | 12 +-------- 4 files changed, 51 insertions(+), 31 deletions(-) diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index 137c4ea21..77eea8456 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -35,7 +35,8 @@ struct SortImplBody { void thrust_local_sort_inplace(VAL* inptr, int64_t* argptr, const size_t volume, - const size_t sort_dim_size) + const size_t sort_dim_size, + const bool stable_argsort) { if (argptr == nullptr) { // sort (in place) @@ -48,8 +49,13 @@ struct SortImplBody { int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init - thrust::stable_sort_by_key( - thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + if (stable_argsort) { + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } else { + thrust::sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } } @@ -83,7 +89,7 @@ struct SortImplBody { // sort data in place thrust_local_sort_inplace( - dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size, stable); } else { AccessorWO output = output_array.write_accessor(rect); @@ -93,7 +99,7 @@ struct SortImplBody { std::copy(src, src + volume, output.ptr(rect.lo)); // sort data in place - thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable); } } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 8eb18762d..0b1f1a0c8 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -200,8 +200,12 @@ void cub_local_sort_inplace( } template -void thrust_local_sort_inplace( - VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) +void thrust_local_sort_inplace(VAL* inptr, + int64_t* argptr, + const size_t volume, + const size_t sort_dim_size, + const bool stable_argsort, + cudaStream_t stream) { if (argptr == nullptr) { if (volume == sort_dim_size) { @@ -224,7 +228,11 @@ void thrust_local_sort_inplace( } } else { if (volume == sort_dim_size) { - thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + if (stable_argsort) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + } else { + thrust::sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr); + } } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -236,11 +244,19 @@ void 
thrust_local_sort_inplace( thrust::divides()); auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr)); - thrust::stable_sort_by_key(thrust::cuda::par.on(stream), - combined, - combined + volume, - argptr, - thrust::less>()); + if (stable_argsort) { + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + thrust::less>()); + } else { + thrust::sort_by_key(thrust::cuda::par.on(stream), + combined, + combined + volume, + argptr, + thrust::less>()); + } } } } @@ -260,6 +276,7 @@ void local_sort_inplace(legate_type_of* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, + const bool stable_argsort, // cub sort is always stable cudaStream_t stream) { using VAL = legate_type_of; @@ -271,10 +288,11 @@ void local_sort_inplace(legate_type_of* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, + const bool stable_argsort, cudaStream_t stream) { using VAL = legate_type_of; - thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream); } // auto align to multiples of 16 bytes @@ -630,7 +648,7 @@ struct SortImplBody { values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); // sort data (locally) - local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stream); + local_sort_inplace(values_ptr, indices_ptr, volume, sort_dim_size, stable, stream); } // this is linked to the decision in sorting.py on when to use an 'unbounded' output array. diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc index 53ec8f503..c552fcb90 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cunumeric/sort/sort_omp.cc @@ -35,7 +35,8 @@ struct SortImplBody { void thrust_local_sort_inplace(VAL* inptr, int64_t* argptr, const size_t volume, - const size_t sort_dim_size) + const size_t sort_dim_size, + const bool stable_argsort) { if (argptr == nullptr) { // sort (in place) @@ -50,8 +51,13 @@ struct SortImplBody { int64_t* segmentValues = argptr + start_idx; VAL* segmentKeys = inptr + start_idx; std::iota(segmentValues, segmentValues + sort_dim_size, 0); // init - thrust::stable_sort_by_key( - thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + if (stable_argsort) { + thrust::stable_sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } else { + thrust::sort_by_key( + thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues); + } } } } @@ -85,7 +91,7 @@ struct SortImplBody { // sort data in place thrust_local_sort_inplace( - dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size); + dense_input_copy.ptr(0), output.ptr(rect.lo), volume, sort_dim_size, stable); } else { AccessorWO output = output_array.write_accessor(rect); @@ -95,7 +101,7 @@ struct SortImplBody { std::copy(src, src + volume, output.ptr(rect.lo)); // sort data in place - thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size); + thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable); } } }; diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 75472925c..610989220 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -53,21 +53,11 @@ struct SortImpl { * 1. Sort is always requested for the 'last' dimension within rect * 2. 
We have product_of_all_other_dimensions independent sort ranges * 3. if we have more than one participants: - * a) 1D-case: we perform parallel sort (via sampling) -- (only implemented for GPU) + * a) 1D-case: we perform parallel sort (via sampling) * b) ND-case: rect needs to be the full domain in that last dimension * */ -#ifdef DEBUG_CUNUMERIC - std::cout << typeid(KIND).name() << "(" << args.local_rank << "/" << args.num_ranks - << "): volume = " << volume << ", DIM=" << DIM << ", rect=" << rect - << ", dist. = " << args.is_index_space << ", stable. = " << args.stable - << ", argsort. = " << args.argsort << std::endl; - - assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && - "multi-dimensional array should not be distributed in (sort) dimension"); -#endif - // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From 898a8d2132ba02d9599c103251b523dd15efdea7 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 14:00:15 -0800 Subject: [PATCH 37/49] adjusted more tests to force stable sort when comparing argsort results --- tests/sort.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/sort.py b/tests/sort.py index 4937de7ec..d662183a7 100644 --- a/tests/sort.py +++ b/tests/sort.py @@ -31,10 +31,13 @@ def test_sort_axis(a_np, a_num, axis): compare_assert(a_np, a_num) print("Sorting axis " + str(axis) + ":") sort_np = np.sort(a_np, axis, kind="stable") + sort_num = num.sort(a_num, axis, kind="stable") + compare_assert(sort_np, sort_num) + sort_np = np.sort(a_np, axis) sort_num = num.sort(a_num, axis) compare_assert(sort_np, sort_num) argsort_np = np.argsort(a_np, axis, kind="stable") - argsort_num = num.argsort(a_num, axis) + argsort_num = num.argsort(a_num, axis, kind="stable") compare_assert(argsort_np, argsort_num) @@ -118,6 +121,10 @@ def test_api(a=None): np.sort(a, axis=i, kind="stable"), num.sort(a_num, i, kind="stable"), ) + compare_assert( + np.sort(a, axis=i), + num.sort(a_num, i), + ) # flatten print("sort flattened") @@ -125,6 +132,10 @@ def test_api(a=None): np.sort(a, axis=None, kind="stable"), num.sort(a_num, axis=None, kind="stable"), ) + compare_assert( + np.sort(a, axis=None), + num.sort(a_num, axis=None), + ) # msort print("msort") @@ -149,6 +160,7 @@ def test_api(a=None): np.argsort(a, axis=i, kind="stable"), num.argsort(a_num, axis=i, kind="stable"), ) + num.argsort(a_num, axis=i) # cannot be compared # flatten print("argsort flattened") @@ -156,6 +168,7 @@ def test_api(a=None): np.argsort(a, axis=None, kind="stable"), num.argsort(a_num, axis=None, kind="stable"), ) + num.argsort(a_num, axis=None) # cannot be compared def generate_random(shape, datatype): From 09ac1c872a467b5746a986f8eb929a354401e15c Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 9 Mar 2022 14:40:36 -0800 Subject: [PATCH 38/49] clarify offset iterator usage --- src/cunumeric/sort/sort.cu | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 0b1f1a0c8..33ae367c0 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -81,7 +81,7 @@ void cub_local_sort_inplace( size_t temp_storage_bytes = 0; if (argptr == nullptr) { if (volume == sort_dim_size) { - // sort (initial call to compute bufffer size) + // sort (initial call to compute buffer 
size) cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = @@ -95,10 +95,11 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented sort (initial call to compute bufffer size) - auto off_start_it = + // segmented sort (initial call to compute buffer size) + // generate start/end positions for all segments via iterators to avoid allocating buffers + auto off_start_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = + auto off_end_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); cub::DeviceSegmentedRadixSort::SortKeys(nullptr, @@ -107,8 +108,8 @@ void cub_local_sort_inplace( inptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -121,8 +122,8 @@ void cub_local_sort_inplace( inptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -133,7 +134,7 @@ void cub_local_sort_inplace( idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); if (volume == sort_dim_size) { - // argsort (initial call to compute bufffer size) + // argsort (initial call to compute buffer size) cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_in.ptr(0), @@ -159,10 +160,11 @@ void cub_local_sort_inplace( sizeof(VAL) * 8, stream); } else { - // segmented argsort (initial call to compute bufffer size) - auto off_start_it = + // segmented argsort (initial call to compute buffer size) + // generate start/end positions for all segments via iterators to avoid allocating buffers + auto off_start_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), multiply(sort_dim_size)); - auto off_end_it = + auto off_end_pos_it = thrust::make_transform_iterator(thrust::make_counting_iterator(1), multiply(sort_dim_size)); cub::DeviceSegmentedRadixSort::SortPairs(nullptr, @@ -173,8 +175,8 @@ void cub_local_sort_inplace( argptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); @@ -190,8 +192,8 @@ void cub_local_sort_inplace( argptr, volume, volume / sort_dim_size, - off_start_it, - off_end_it, + off_start_pos_it, + off_end_pos_it, 0, sizeof(VAL) * 8, stream); From 9cd31bb0161f1accc3fa711c80f6ef913a1621fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 09:35:37 +0000 Subject: [PATCH 39/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/cunumeric/mapper.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index a769bd0fc..fd1bfec2f 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -125,20 +125,20 @@ std::vector CuNumericMapper::store_mappings( mappings.back().policy.ordering.fortran_order(); mappings.back().policy.exact = true; return std::move(mappings); - case CUNUMERIC_SORT: { - std::vector mappings; - auto& inputs = task.inputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); - mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - return std::move(mappings); + case CUNUMERIC_SORT: { + 
std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } + default: { + return {}; + } } - default: { + assert(false); return {}; - } } - assert(false); - return {}; -} } // namespace cunumeric From da79f8600fbf7b42bb622838379657a504048d3d Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 10 Mar 2022 01:45:55 -0800 Subject: [PATCH 40/49] fixed merge conflict --- src/cunumeric/mapper.cc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index fd1bfec2f..962e68ecc 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -125,20 +125,21 @@ std::vector CuNumericMapper::store_mappings( mappings.back().policy.ordering.fortran_order(); mappings.back().policy.exact = true; return std::move(mappings); - case CUNUMERIC_SORT: { - std::vector mappings; - auto& inputs = task.inputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); - mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - return std::move(mappings); - } - default: { - return {}; - } } - assert(false); + case CUNUMERIC_SORT: { + std::vector mappings; + auto& inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0], options.front())); + mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + return std::move(mappings); + } + default: { return {}; + } } + assert(false); + return {}; +} } // namespace cunumeric From 04f811beca317aca2f729f0d0ed40d9ee8277da7 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 11 Mar 2022 05:01:21 -0800 Subject: [PATCH 41/49] ensure 16byte alignment for NCCL transfers --- src/cunumeric/sort/sort.cu | 93 +++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 33ae367c0..2701e188c 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -298,7 +298,10 @@ void local_sort_inplace(legate_type_of* inptr, } // auto align to multiples of 16 bytes -auto get_aligned_size = [](auto size) { return std::max(16, (size + 15) / 16 * 16); }; +auto get_16b_aligned = [](auto bytes) { return std::max(16, (bytes + 15) / 16 * 16); }; +auto get_16b_aligned_count = [](auto count, auto element_bytes) { + return (get_16b_aligned(count * element_bytes) + element_bytes - 1) / element_bytes; +}; template struct SortPiece { @@ -488,40 +491,96 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // allocate merge targets, data transfer... std::vector> merge_buffers(num_ranks); + std::vector aligned_pos_vals_send(num_ranks); + std::vector aligned_pos_idcs_send(num_ranks); + size_t buf_size_send_vals_total = 0; + size_t buf_size_send_idcs_total = 0; for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks - auto recv_size_aligned = get_aligned_size(size_recv[i] * sizeof(VAL)); - auto buf_size = (recv_size_aligned + sizeof(VAL) - 1) / sizeof(VAL); - merge_buffers[i].values = create_buffer(buf_size, Memory::GPU_FB_MEM); - merge_buffers[i].indices = create_buffer(argsort ? 
buf_size : 0, Memory::GPU_FB_MEM); - merge_buffers[i].size = size_recv[i]; + auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); + merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + + aligned_pos_vals_send[i] = buf_size_send_vals_total; + buf_size_send_vals_total += get_16b_aligned_count(size_send[i], sizeof(VAL)); + + if (argsort) { + auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); + merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); + aligned_pos_idcs_send[i] = buf_size_send_idcs_total; + buf_size_send_idcs_total += get_16b_aligned_count(size_send[i], sizeof(int64_t)); + } else { + merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); + } + } + + // copy values into aligned send buffer + auto val_send_buf = local_sorted.values; + if (buf_size_send_vals_total > volume) { + val_send_buf = create_buffer(buf_size_send_vals_total, Memory::GPU_FB_MEM); + size_t pos = 0; + for (size_t r = 0; r < num_ranks; ++r) { + CHECK_CUDA(cudaMemcpyAsync(val_send_buf.ptr(aligned_pos_vals_send[r]), + local_sorted.values.ptr(pos), + sizeof(VAL) * size_send[r], + cudaMemcpyDeviceToDevice, + stream)); + pos += size_send[r]; + } } - size_t send_pos = 0; + + // copy indices into aligned send buffer + auto idc_send_buf = local_sorted.indices; + if (argsort && buf_size_send_idcs_total > volume) { + idc_send_buf = create_buffer(buf_size_send_idcs_total, Memory::GPU_FB_MEM); + size_t pos = 0; + for (size_t r = 0; r < num_ranks; ++r) { + CHECK_CUDA(cudaMemcpyAsync(idc_send_buf.ptr(aligned_pos_idcs_send[r]), + local_sorted.indices.ptr(pos), + sizeof(int64_t) * size_send[r], + cudaMemcpyDeviceToDevice, + stream)); + pos += size_send[r]; + } + } + CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_ranks; r++) { - CHECK_NCCL(ncclSend(local_sorted.values.ptr(send_pos), - get_aligned_size(size_send[r] * sizeof(VAL)), + CHECK_NCCL(ncclSend(val_send_buf.ptr(aligned_pos_vals_send[r]), + get_16b_aligned(size_send[r] * sizeof(VAL)), ncclInt8, r, *comm, stream)); CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), - get_aligned_size(size_recv[r] * sizeof(VAL)), + get_16b_aligned(size_recv[r] * sizeof(VAL)), ncclInt8, r, *comm, stream)); - if (argsort) { - CHECK_NCCL( - ncclSend(local_sorted.indices.ptr(send_pos), size_send[r], ncclInt64, r, *comm, stream)); - CHECK_NCCL( - ncclRecv(merge_buffers[r].indices.ptr(0), size_recv[r], ncclInt64, r, *comm, stream)); - } - send_pos += size_send[r]; } CHECK_NCCL(ncclGroupEnd()); + if (argsort) { + CHECK_NCCL(ncclGroupStart()); + for (size_t r = 0; r < num_ranks; r++) { + CHECK_NCCL(ncclSend(idc_send_buf.ptr(aligned_pos_idcs_send[r]), + get_16b_aligned_count(size_send[r], sizeof(int64_t)), + ncclInt64, + r, + *comm, + stream)); + CHECK_NCCL(ncclRecv(merge_buffers[r].indices.ptr(0), + get_16b_aligned_count(size_recv[r], sizeof(int64_t)), + ncclInt64, + r, + *comm, + stream)); + } + CHECK_NCCL(ncclGroupEnd()); + } + // now merge sort all into the result buffer // maybe k-way merge is more efficient here... 
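As the comment above says, the merge phase combines the num_ranks locally sorted, range-partitioned pieces pairwise with stride doubling, i.e. in log2(num_ranks) rounds, whereas a k-way merge would touch each element only once. A host-side sketch of the pairwise scheme (illustrative only, not the CUDA implementation):

    #include <algorithm>
    #include <cstddef>
    #include <iterator>
    #include <utility>
    #include <vector>

    // Merge num_ranks sorted vectors pairwise in ceil(log2(num_ranks)) rounds.
    // Assumes pieces is non-empty.
    std::vector<int> merge_tree(std::vector<std::vector<int>> pieces)
    {
      for (std::size_t stride = 1; stride < pieces.size(); stride *= 2) {
        for (std::size_t i = 0; i + stride < pieces.size(); i += 2 * stride) {
          std::vector<int> merged;
          std::merge(pieces[i].begin(), pieces[i].end(),
                     pieces[i + stride].begin(), pieces[i + stride].end(),
                     std::back_inserter(merged));
          pieces[i] = std::move(merged);
        }
      }
      return std::move(pieces.front());
    }
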
for (size_t stride = 1; stride < num_ranks; stride *= 2) { From 568523fcb2a9e3116d458287ba7ac783f3b5b752 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Mar 2022 02:33:15 -0700 Subject: [PATCH 42/49] some minor adjustments --- cunumeric/deferred.py | 4 ++-- cunumeric/{sorting.py => sort.py} | 4 ++-- examples/sort.py | 24 ++++++++---------------- src/cunumeric/sort/sort.cu | 8 ++++++-- src/cunumeric/sort/sort_template.inl | 3 +++ 5 files changed, 21 insertions(+), 22 deletions(-) rename cunumeric/{sorting.py => sort.py} (96%) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index fa9d18ef1..57327aeaf 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -32,7 +32,7 @@ UnaryRedCode, ) from .linalg.cholesky import cholesky -from .sorting import sorting +from .sort import sort from .thunk import NumPyThunk from .utils import get_arg_value_dtype @@ -1559,4 +1559,4 @@ def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None): if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): raise ValueError("invalid axis") - sorting(self, rhs, argsort, axis, stable) + sort(self, rhs, argsort, axis, stable) diff --git a/cunumeric/sorting.py b/cunumeric/sort.py similarity index 96% rename from cunumeric/sorting.py rename to cunumeric/sort.py index f8c56fa0b..b2b8bb43d 100644 --- a/cunumeric/sorting.py +++ b/cunumeric/sort.py @@ -68,9 +68,9 @@ def sort_task(output, input, argsort, stable): if output.ndim > 1: task.add_broadcast(input.base, input.ndim - 1) - elif output.runtime.num_gpus > 0: + elif output.runtime.num_gpus > 1: task.add_nccl_communicator() - elif output.runtime.num_procs > 1: + elif output.runtime.num_gpus == 0 and output.runtime.num_procs > 1: # Distributed 1D sort on CPU not supported yet task.add_broadcast(input.base) diff --git a/examples/sort.py b/examples/sort.py index 6713da232..179cc223d 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -37,14 +37,15 @@ def check_sorted(a, a_sorted, axis=-1): assert False -def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): +def run_sort(shape, axis, datatype, lower, upper, perform_check, timing): num.random.seed(42) newtype = np.dtype(datatype).type - if shape is not None: - shape = tuple(shape) - else: - shape = (N,) + + N = 1 + for e in shape: + N *= e + shape = tuple(shape) if np.issubdtype(newtype, np.integer): if lower is None: @@ -87,14 +88,6 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): action="store_true", help="check the result of the solve", ) - parser.add_argument( - "-n", - "--num", - type=int, - default=1000000, - dest="N", - help="number of elements in one dimension", - ) parser.add_argument( "-t", "--time", @@ -107,9 +100,9 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): "--shape", type=int, nargs="+", - default=None, + default=[1000000], dest="shape", - help="array reshape (default 'None')", + help="array reshape (default '[1000000]')", ) parser.add_argument( "-d", @@ -159,7 +152,6 @@ def run_sort(N, shape, axis, datatype, lower, upper, perform_check, timing): args.benchmark, "Sort", ( - args.N, args.shape, args.axis, args.datatype, diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 2701e188c..3dc268bdd 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -282,7 +282,12 @@ void local_sort_inplace(legate_type_of* inptr, cudaStream_t stream) { using VAL = legate_type_of; - cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, 
stream); + // fallback to thrust approach as segmented radix sort is not suited for small segments + if (volume == sort_dim_size || sort_dim_size > 300) { + cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream); + } else { + thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream); + } } template ::value>* = nullptr> @@ -450,7 +455,6 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, auto split_positions = create_buffer(num_splitters, Memory::Z_COPY_MEM); { const size_t num_blocks = (num_splitters + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - VAL init_value = std::numeric_limits::max(); extract_split_positions<<>>( local_sorted.values.ptr(0), volume, diff --git a/src/cunumeric/sort/sort_template.inl b/src/cunumeric/sort/sort_template.inl index 610989220..5c63813eb 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cunumeric/sort/sort_template.inl @@ -58,6 +58,9 @@ struct SortImpl { * */ + assert((DIM == 1 || (rect.hi[DIM - 1] - rect.lo[DIM - 1] + 1 == args.sort_dim_size)) && + "multi-dimensional array should not be distributed in (sort) dimension"); + // we shall not return on empty rectangle in case of distributed data // as the process might still participate in the parallel sort if ((DIM > 1 || !args.is_index_space) && rect.empty()) return; From e1b6c3182b8c5ef2744aedffdce993d9156deb13 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 16 Mar 2022 14:04:38 -0700 Subject: [PATCH 43/49] fixed renaming --- cunumeric/sort.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/sort.py b/cunumeric/sort.py index b2b8bb43d..f1def3c72 100644 --- a/cunumeric/sort.py +++ b/cunumeric/sort.py @@ -26,7 +26,7 @@ def sort_flattened(output, input, argsort, stable): sort_result = output.runtime.create_empty_thunk( flattened.shape, dtype=output.dtype, inputs=(flattened,) ) - sorting(sort_result, flattened, argsort, stable=stable) + sort(sort_result, flattened, argsort, stable=stable) output.base = sort_result.base output.numpy_array = None @@ -46,7 +46,7 @@ def sort_swapped(output, input, argsort, sort_axis, stable): sort_result = output.runtime.create_empty_thunk( swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,) ) - sorting(sort_result, swapped_copy, argsort, stable=stable) + sort(sort_result, swapped_copy, argsort, stable=stable) output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base output.numpy_array = None @@ -84,7 +84,7 @@ def sort_task(output, input, argsort, stable): output.numpy_array = None -def sorting(output, input, argsort, axis=-1, stable=False): +def sort(output, input, argsort, axis=-1, stable=False): if axis is None and input.ndim > 1: sort_flattened(output, input, argsort, stable) else: From 2edd7ba4bd35a48831f50b294a8c6fd6e685f995 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Mar 2022 03:57:22 -0700 Subject: [PATCH 44/49] manually free temporary memory to reduce peak usage --- src/cunumeric/sort/sort.cu | 89 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 3dc268bdd..da28cd889 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -34,34 +34,6 @@ namespace cunumeric { using namespace Legion; -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - copy_into_buffer(VAL* out, - const AccessorRO accessor, - const Point lo, - const Pitches pitches, - const size_t volume) -{ - size_t 
offset = blockIdx.x * blockDim.x + threadIdx.x; - if (offset >= volume) return; - auto point = pitches.unflatten(offset, lo); - out[offset] = accessor[lo + point]; -} - -template -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - copy_into_output(AccessorWO accessor, - const VAL* data, - const Point lo, - const Pitches pitches, - const size_t volume) -{ - size_t offset = blockIdx.x * blockDim.x + threadIdx.x; - if (offset >= volume) return; - auto point = pitches.unflatten(offset, lo); - accessor[lo + point] = data[offset]; -} - struct multiply : public thrust::unary_function { const int constant; @@ -94,6 +66,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } else { // segmented sort (initial call to compute buffer size) // generate start/end positions for all segments via iterators to avoid allocating buffers @@ -127,6 +100,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } } else { auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); @@ -159,6 +133,7 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } else { // segmented argsort (initial call to compute buffer size) // generate start/end positions for all segments via iterators to avoid allocating buffers @@ -197,8 +172,11 @@ void cub_local_sort_inplace( 0, sizeof(VAL) * 8, stream); + temp_storage.destroy(); } + idx_in.destroy(); } + keys_in.destroy(); } template @@ -227,6 +205,8 @@ void thrust_local_sort_inplace(VAL* inptr, combined, combined + volume, thrust::less>()); + + sort_id.destroy(); } } else { if (volume == sort_dim_size) { @@ -259,6 +239,8 @@ void thrust_local_sort_inplace(VAL* inptr, argptr, thrust::less>()); } + + sort_id.destroy(); } } } @@ -481,6 +463,10 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, size_send[num_ranks - 1] = volume - last_position; } + // cleanup intermediate data structures + samples.destroy(); + split_positions.destroy(); + // all2all exchange send/receive sizes auto size_recv = create_buffer(num_ranks, Memory::Z_COPY_MEM); CHECK_NCCL(ncclGroupStart()); @@ -493,29 +479,18 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, // need to sync as we share values in between host/device CHECK_CUDA(cudaStreamSynchronize(stream)); - // allocate merge targets, data transfer... 
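The 16-byte alignment this commit's bookkeeping revolves around comes from the get_16b_aligned / get_16b_aligned_count helpers introduced earlier in the series: every NCCL send/recv is padded to whole 16-byte blocks. Restated as plain functions (equivalent in intent, illustrative form):

    #include <algorithm>
    #include <cstddef>

    // Round a byte count up to a multiple of 16, with a one-block minimum.
    std::size_t get_16b_aligned(std::size_t bytes)
    {
      return std::max<std::size_t>(16, (bytes + 15) / 16 * 16);
    }

    // Smallest element count whose allocation covers the aligned byte size.
    std::size_t get_16b_aligned_count(std::size_t count, std::size_t element_bytes)
    {
      return (get_16b_aligned(count * element_bytes) + element_bytes - 1) / element_bytes;
    }
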
- std::vector> merge_buffers(num_ranks); + // handle alignment std::vector aligned_pos_vals_send(num_ranks); std::vector aligned_pos_idcs_send(num_ranks); - size_t buf_size_send_vals_total = 0; size_t buf_size_send_idcs_total = 0; for (size_t i = 0; i < num_ranks; ++i) { // align buffer to allow data transfer of 16byte blocks - auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); - merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); - merge_buffers[i].size = size_recv[i]; - aligned_pos_vals_send[i] = buf_size_send_vals_total; buf_size_send_vals_total += get_16b_aligned_count(size_send[i], sizeof(VAL)); - if (argsort) { - auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); - merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); aligned_pos_idcs_send[i] = buf_size_send_idcs_total; buf_size_send_idcs_total += get_16b_aligned_count(size_send[i], sizeof(int64_t)); - } else { - merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); } } @@ -532,6 +507,7 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, stream)); pos += size_send[r]; } + local_sorted.values.destroy(); } // copy indices into aligned send buffer @@ -547,6 +523,21 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, stream)); pos += size_send[r]; } + local_sorted.indices.destroy(); + } + + // allocate target buffers + std::vector> merge_buffers(num_ranks); + for (size_t i = 0; i < num_ranks; ++i) { + auto buf_size_vals_recv = get_16b_aligned_count(size_recv[i], sizeof(VAL)); + merge_buffers[i].values = create_buffer(buf_size_vals_recv, Memory::GPU_FB_MEM); + merge_buffers[i].size = size_recv[i]; + if (argsort) { + auto buf_size_idcs_recv = get_16b_aligned_count(size_recv[i], sizeof(int64_t)); + merge_buffers[i].indices = create_buffer(buf_size_idcs_recv, Memory::GPU_FB_MEM); + } else { + merge_buffers[i].indices = create_buffer(0, Memory::GPU_FB_MEM); + } } CHECK_NCCL(ncclGroupStart()); @@ -585,6 +576,12 @@ static SortPiece sample_sort_nccl(SortPiece local_sorted, CHECK_NCCL(ncclGroupEnd()); } + // cleanup remaining buffers + size_send.destroy(); + size_recv.destroy(); + val_send_buf.destroy(); + idc_send_buf.destroy(); + // now merge sort all into the result buffer // maybe k-way merge is more efficient here... 
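The theme of this commit is to destroy() each temporary as soon as its last use has been issued on the stream, so peak framebuffer usage tracks the live working set rather than the sum of all temporaries. In generic C++ the same effect falls out of scoping; a toy sketch (the Legate buffers above need the explicit destroy() because they would otherwise outlive their creating scope):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Assumes n >= 1.
    double staged_sum(std::size_t n)
    {
      double partial = 0.0;
      {
        std::vector<double> scratch(n, 1.0);  // temporary working buffer
        partial = std::accumulate(scratch.begin(), scratch.end(), 0.0);
      }                                       // scratch freed here ...
      std::vector<double> next(n, partial);   // ... so peak usage is ~n, not ~2n
      return next.back();
    }
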
for (size_t stride = 1; stride < num_ranks; stride *= 2) { @@ -664,14 +661,14 @@ struct SortImplBody { VAL* values_ptr = nullptr; if (argsort) { // make a buffer for input - auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - values_ptr = input_copy.ptr(0); + auto input_copy = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + local_sorted.values = input_copy; + values_ptr = input_copy.ptr(0); // initialize indices if (output_array.dim() == -1) { auto indices_buffer = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); indices_ptr = indices_buffer.ptr(0); - local_sorted.values = input_copy; local_sorted.indices = indices_buffer; local_sorted.size = volume; } else { @@ -699,8 +696,7 @@ struct SortImplBody { values_ptr = input_copy.ptr(0); local_sorted.values = input_copy; local_sorted.indices = create_buffer(0, Legion::Memory::Kind::GPU_FB_MEM); - ; - local_sorted.size = volume; + local_sorted.size = volume; } else { AccessorWO output = output_array.write_accessor(rect); assert(output.accessor.is_dense_row_major(rect)); @@ -730,6 +726,9 @@ struct SortImplBody { output_array.return_data(local_sorted_repartitioned.values, local_sorted_repartitioned.size); } + } else if (argsort) { + // cleanup + local_sorted.values.destroy(); } } }; From ee52211deb3ab19b1948b9b9b721d2efa59c3085 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 18 Mar 2022 05:13:56 -0700 Subject: [PATCH 45/49] refactor sort interface to prevent 1 unneeded copy --- src/cunumeric/sort/sort.cu | 181 ++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 65 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index da28cd889..e4e5b2e22 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -43,25 +43,35 @@ struct multiply : public thrust::unary_function { }; template -void cub_local_sort_inplace( - VAL* inptr, int64_t* argptr, const size_t volume, const size_t sort_dim_size, cudaStream_t stream) +void cub_local_sort(const VAL* values_in, + VAL* values_out, + const int64_t* indices_in, + int64_t* indices_out, + const size_t volume, + const size_t sort_dim_size, + cudaStream_t stream) { - // make a copy of input --> we want inptr to return sorted values - auto keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA( - cudaMemcpyAsync(keys_in.ptr(0), inptr, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + Buffer keys_in; + const VAL* values_in_cub = values_in; + if (values_in == values_out) { + keys_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + values_in_cub = keys_in.ptr(0); + CHECK_CUDA(cudaMemcpyAsync( + keys_in.ptr(0), values_out, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + } + size_t temp_storage_bytes = 0; - if (argptr == nullptr) { + if (indices_out == nullptr) { if (volume == sort_dim_size) { // sort (initial call to compute buffer size) cub::DeviceRadixSort::SortKeys( - nullptr, temp_storage_bytes, keys_in.ptr(0), inptr, volume, 0, sizeof(VAL) * 8, stream); + nullptr, temp_storage_bytes, values_in_cub, values_out, volume, 0, sizeof(VAL) * 8, stream); auto temp_storage = create_buffer(temp_storage_bytes, Legion::Memory::Kind::GPU_FB_MEM); cub::DeviceRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, + values_in_cub, + values_out, volume, 0, sizeof(VAL) * 8, @@ -77,8 +87,8 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortKeys(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, + 
values_in_cub, + values_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -91,8 +101,8 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortKeys(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, + values_in_cub, + values_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -103,18 +113,23 @@ void cub_local_sort_inplace( temp_storage.destroy(); } } else { - auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); - CHECK_CUDA(cudaMemcpyAsync( - idx_in.ptr(0), argptr, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + Buffer idx_in; + const int64_t* indices_in_cub = indices_in; + if (indices_in == indices_out) { + auto idx_in = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); + indices_in_cub = idx_in.ptr(0); + CHECK_CUDA(cudaMemcpyAsync( + idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + } if (volume == sort_dim_size) { // argsort (initial call to compute buffer size) cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, 0, sizeof(VAL) * 8, @@ -125,10 +140,10 @@ void cub_local_sort_inplace( cub::DeviceRadixSort::SortPairs(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, 0, sizeof(VAL) * 8, @@ -144,10 +159,10 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -161,10 +176,10 @@ void cub_local_sort_inplace( cub::DeviceSegmentedRadixSort::SortPairs(temp_storage.ptr(0), temp_storage_bytes, - keys_in.ptr(0), - inptr, - idx_in.ptr(0), - argptr, + values_in_cub, + values_out, + indices_in_cub, + indices_out, volume, volume / sort_dim_size, off_start_pos_it, @@ -174,22 +189,36 @@ void cub_local_sort_inplace( stream); temp_storage.destroy(); } - idx_in.destroy(); + if (indices_in == indices_out) idx_in.destroy(); } - keys_in.destroy(); + + if (values_in == values_out) keys_in.destroy(); } template -void thrust_local_sort_inplace(VAL* inptr, - int64_t* argptr, - const size_t volume, - const size_t sort_dim_size, - const bool stable_argsort, - cudaStream_t stream) +void thrust_local_sort(const VAL* values_in, + VAL* values_out, + const int64_t* indices_in, + int64_t* indices_out, + const size_t volume, + const size_t sort_dim_size, + const bool stable_argsort, + cudaStream_t stream) { - if (argptr == nullptr) { + if (values_in != values_out) { + // not in-place --> need a copy + CHECK_CUDA(cudaMemcpyAsync( + values_out, values_in, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + } + if (indices_in != indices_out) { + // not in-place --> need a copy + CHECK_CUDA(cudaMemcpyAsync( + indices_out, values_in, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); + } + + if (indices_out == nullptr) { if (volume == sort_dim_size) { - thrust::sort(thrust::cuda::par.on(stream), inptr, inptr + volume); + thrust::sort(thrust::cuda::par.on(stream), values_out, values_out + volume); } else { auto sort_id = create_buffer(volume, Legion::Memory::Kind::GPU_FB_MEM); // init combined keys @@ -199,7 +228,7 @@ void thrust_local_sort_inplace(VAL* inptr, thrust::make_constant_iterator(sort_dim_size), sort_id.ptr(0), 
@@ -199,7 +228,7 @@ void thrust_local_sort_inplace(VAL* inptr,
                         thrust::make_constant_iterator(sort_dim_size),
                         sort_id.ptr(0),
                         thrust::divides<size_t>());
-      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr));
+      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), values_out));
       thrust::sort(thrust::cuda::par.on(stream),
                    combined,
@@ -211,9 +240,11 @@ void thrust_local_sort_inplace(VAL* inptr,
   } else {
     if (volume == sort_dim_size) {
       if (stable_argsort) {
-        thrust::stable_sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr);
+        thrust::stable_sort_by_key(
+          thrust::cuda::par.on(stream), values_out, values_out + volume, indices_out);
       } else {
-        thrust::sort_by_key(thrust::cuda::par.on(stream), inptr, inptr + volume, argptr);
+        thrust::sort_by_key(
+          thrust::cuda::par.on(stream), values_out, values_out + volume, indices_out);
       }
     } else {
       auto sort_id = create_buffer<size_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
@@ -224,19 +255,19 @@ void thrust_local_sort_inplace(VAL* inptr,
                         thrust::make_constant_iterator(sort_dim_size),
                         sort_id.ptr(0),
                         thrust::divides<size_t>());
-      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), inptr));
+      auto combined = thrust::make_zip_iterator(thrust::make_tuple(sort_id.ptr(0), values_out));
 
       if (stable_argsort) {
         thrust::stable_sort_by_key(thrust::cuda::par.on(stream),
                                    combined,
                                    combined + volume,
-                                   argptr,
+                                   indices_out,
                                    thrust::less<thrust::tuple<size_t, VAL>>());
       } else {
         thrust::sort_by_key(thrust::cuda::par.on(stream),
                             combined,
                             combined + volume,
-                            argptr,
+                            indices_out,
                             thrust::less<thrust::tuple<size_t, VAL>>());
       }
@@ -256,32 +287,45 @@ struct support_cub<...> : std::false_type {
 };
 
 template <LegateTypeCode CODE, std::enable_if_t<support_cub<CODE>::value>* = nullptr>
-void local_sort_inplace(legate_type_of<CODE>* inptr,
-                        int64_t* argptr,
-                        const size_t volume,
-                        const size_t sort_dim_size,
-                        const bool stable_argsort,  // cub sort is always stable
-                        cudaStream_t stream)
+void local_sort(const legate_type_of<CODE>* values_in,
+                legate_type_of<CODE>* values_out,
+                const int64_t* indices_in,
+                int64_t* indices_out,
+                const size_t volume,
+                const size_t sort_dim_size,
+                const bool stable_argsort,  // cub sort is always stable
+                cudaStream_t stream)
 {
   using VAL = legate_type_of<CODE>;
   // fallback to thrust approach as segmented radix sort is not suited for small segments
   if (volume == sort_dim_size || sort_dim_size > 300) {
-    cub_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stream);
+    cub_local_sort(values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream);
   } else {
-    thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream);
+    thrust_local_sort(
+      values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable_argsort, stream);
   }
 }
 
 template <LegateTypeCode CODE, std::enable_if_t<!support_cub<CODE>::value>* = nullptr>
-void local_sort_inplace(legate_type_of<CODE>* inptr,
-                        int64_t* argptr,
-                        const size_t volume,
-                        const size_t sort_dim_size,
-                        const bool stable_argsort,
-                        cudaStream_t stream)
+void local_sort(const legate_type_of<CODE>* values_in,
+                legate_type_of<CODE>* values_out,
+                const int64_t* indices_in,
+                int64_t* indices_out,
+                const size_t volume,
+                const size_t sort_dim_size,
+                const bool stable_argsort,
+                cudaStream_t stream)
 {
   using VAL = legate_type_of<CODE>;
-  thrust_local_sort_inplace(inptr, argptr, volume, sort_dim_size, stable_argsort, stream);
+  thrust_local_sort(
+    values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable_argsort, stream);
 }
 
 // auto align to multiples of 16 bytes
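The `support_cub` trait above routes radix-sortable element types to CUB whenever segments are large enough (the 300-element cutoff), and everything else to the comparison-based thrust path; the last hunk of this patch, below, rewires the task body to the new entry point. A minimal sketch of that enable_if dispatch pattern, under assumed names:

    #include <algorithm>
    #include <cstddef>
    #include <type_traits>

    // 'radix_supported' plays the role of support_cub, selecting a backend
    // at compile time. Names and bodies are illustrative stand-ins.
    template <typename T>
    struct radix_supported : std::is_arithmetic<T> {};

    template <typename T, std::enable_if_t<radix_supported<T>::value>* = nullptr>
    void sort_backend(T* data, std::size_t n)
    {
      std::sort(data, data + n);  // stand-in for the radix path
    }

    template <typename T, std::enable_if_t<!radix_supported<T>::value>* = nullptr>
    void sort_backend(T* data, std::size_t n)
    {
      std::stable_sort(data, data + n);  // stand-in for the comparison path
    }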
@@ -709,7 +753,14 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
         values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream));
       // sort data (locally)
-      local_sort_inplace(values_ptr, indices_ptr, volume,
-                         sort_dim_size, stable, stream);
+      local_sort(input.ptr(rect.lo),
+                 values_ptr,
+                 indices_ptr,
+                 indices_ptr,
+                 volume,
+                 sort_dim_size,
+                 stable,
+                 stream);
     }
 
     // this is linked to the decision in sorting.py on when to use an 'unbounded' output array.

From 927b54f2d2793de65b38340551fded3ff68eb417 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Fri, 18 Mar 2022 17:20:35 +0000
Subject: [PATCH 46/49] fixed init issue

---
 src/cunumeric/sort/sort.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index e4e5b2e22..17cbadd67 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -116,7 +116,7 @@ void cub_local_sort(const VAL* values_in,
     Buffer<int64_t> idx_in;
     const int64_t* indices_in_cub = indices_in;
     if (indices_in == indices_out) {
-      auto idx_in    = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
+      idx_in         = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
       indices_in_cub = idx_in.ptr(0);
       CHECK_CUDA(cudaMemcpyAsync(
         idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream));

From 10e7ebb15b26880b343339c7da995f4737a9b650 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Mar 2022 17:23:55 +0000
Subject: [PATCH 47/49] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/cunumeric/sort/sort.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index 17cbadd67..682b703f0 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -116,7 +116,7 @@ void cub_local_sort(const VAL* values_in,
     Buffer<int64_t> idx_in;
     const int64_t* indices_in_cub = indices_in;
     if (indices_in == indices_out) {
-      idx_in         = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
+      idx_in = create_buffer<int64_t>(volume, Legion::Memory::Kind::GPU_FB_MEM);
       indices_in_cub = idx_in.ptr(0);
       CHECK_CUDA(cudaMemcpyAsync(
         idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream));
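The next patch replaces the `#pragma omp parallel for` loops of sequential `thrust::host` sorts with Thrust's OpenMP execution policy, which parallelizes each sort call internally — a better fit when there are few, large segments. A minimal usage sketch, not code from the patch (assumes a Thrust build with the OpenMP backend enabled and compilation with -fopenmp):

    #include <thrust/sort.h>
    #include <thrust/system/omp/execution_policy.h>
    #include <vector>

    int main()
    {
      std::vector<int> data{5, 3, 1, 4, 2};
      // thrust::omp::par splits this single sort across OpenMP threads,
      // instead of running one sequential sort per thread as before
      thrust::sort(thrust::omp::par, data.data(), data.data() + data.size());
      return 0;
    }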
From e52b0177980ba0f7f190b934683006cd48f04688 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Fri, 18 Mar 2022 17:38:43 +0000
Subject: [PATCH 48/49] change to thrust openmp policy

---
 src/cunumeric/sort/sort_omp.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc
index c552fcb90..c26d606a5 100644
--- a/src/cunumeric/sort/sort_omp.cc
+++ b/src/cunumeric/sort/sort_omp.cc
@@ -19,6 +19,7 @@
 #include <...>
 #include <...>
+#include <thrust/system/omp/execution_policy.h>
 #include <...>
 #include <...>
@@ -40,23 +41,21 @@ struct SortImplBody<VariantKind::OMP, CODE, DIM> {
   {
     if (argptr == nullptr) {
       // sort (in place)
-#pragma omp parallel for
       for (size_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) {
-        thrust::sort(thrust::host, inptr + start_idx, inptr + start_idx + sort_dim_size);
+        thrust::sort(thrust::omp::par, inptr + start_idx, inptr + start_idx + sort_dim_size);
       }
     } else {
       // argsort
-#pragma omp parallel for
       for (uint64_t start_idx = 0; start_idx < volume; start_idx += sort_dim_size) {
         int64_t* segmentValues = argptr + start_idx;
         VAL* segmentKeys       = inptr + start_idx;
         std::iota(segmentValues, segmentValues + sort_dim_size, 0);  // init
         if (stable_argsort) {
           thrust::stable_sort_by_key(
-            thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
+            thrust::omp::par, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
         } else {
           thrust::sort_by_key(
-            thrust::host, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
+            thrust::omp::par, segmentKeys, segmentKeys + sort_dim_size, segmentValues);
         }
       }
     }

From 99798e303a1f9a33c277cad23e6b293ac5ff11b4 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Mon, 21 Mar 2022 15:13:48 -0700
Subject: [PATCH 49/49] removed another copy on python side in case we can sort in place

---
 cunumeric/sort.py              | 18 +++++++++++-------
 src/cunumeric/sort/sort.cc     |  7 ++++---
 src/cunumeric/sort/sort.cu     |  5 -----
 src/cunumeric/sort/sort_omp.cc |  7 ++++---
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/cunumeric/sort.py b/cunumeric/sort.py
index f1def3c72..fca32e80d 100644
--- a/cunumeric/sort.py
+++ b/cunumeric/sort.py
@@ -43,13 +43,17 @@ def sort_swapped(output, input, argsort, sort_axis, stable):
     swapped_copy.copy(swapped, deep=True)
 
     # run sort on last axis
-    sort_result = output.runtime.create_empty_thunk(
-        swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,)
-    )
-    sort(sort_result, swapped_copy, argsort, stable=stable)
-
-    output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base
-    output.numpy_array = None
+    if argsort is True:
+        sort_result = output.runtime.create_empty_thunk(
+            swapped_copy.shape, dtype=output.dtype, inputs=(swapped_copy,)
+        )
+        sort(sort_result, swapped_copy, argsort, stable=stable)
+        output.base = sort_result.swapaxes(input.ndim - 1, sort_axis).base
+        output.numpy_array = None
+    else:
+        sort(swapped_copy, swapped_copy, argsort, stable=stable)
+        output.base = swapped_copy.swapaxes(input.ndim - 1, sort_axis).base
+        output.numpy_array = None
 
 
 def sort_task(output, input, argsort, stable):
diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc
index 77eea8456..dda79d396 100644
--- a/src/cunumeric/sort/sort.cc
+++ b/src/cunumeric/sort/sort.cc
@@ -95,11 +95,12 @@ struct SortImplBody<VariantKind::CPU, CODE, DIM> {
       AccessorWO<VAL, DIM> output = output_array.write_accessor<VAL, DIM>(rect);
 
       // init output values
-      auto* src = input.ptr(rect.lo);
-      std::copy(src, src + volume, output.ptr(rect.lo));
+      auto* src    = input.ptr(rect.lo);
+      auto* target = output.ptr(rect.lo);
+      if (src != target) std::copy(src, src + volume, target);
 
       // sort data in place
-      thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable);
+      thrust_local_sort_inplace(target, nullptr, volume, sort_dim_size, stable);
     }
   }
 };
diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu
index 682b703f0..13e632f9a 100644
--- a/src/cunumeric/sort/sort.cu
+++ b/src/cunumeric/sort/sort.cu
@@ -583,7 +583,6 @@ static SortPiece<VAL> sample_sort_nccl(SortPiece<VAL> local_sorted,
       merge_buffers[i].indices = create_buffer<int64_t>(0, Memory::GPU_FB_MEM);
     }
   }
-
   CHECK_NCCL(ncclGroupStart());
   for (size_t r = 0; r < num_ranks; r++) {
     CHECK_NCCL(ncclSend(val_send_buf.ptr(aligned_pos_vals_send[r]),
@@ -747,11 +746,7 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
         values_ptr = output.ptr(rect.lo);
       }
     }
-
     if (volume > 0) {
-      CHECK_CUDA(cudaMemcpyAsync(
-        values_ptr, input.ptr(rect.lo), sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream));
-
       // sort data (locally)
       local_sort(input.ptr(rect.lo),
                  values_ptr, indices_ptr, indices_ptr,
                  volume, sort_dim_size, stable, stream);
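The sort.cc hunk above and the sort_omp.cc hunk below add an alias guard around the initializing copy: when the runtime maps the input and output regions to the same physical instance, `src` and `target` point at the same memory and the copy can be skipped entirely. The guarded shape as a self-contained sketch (placeholder names):

    #include <algorithm>
    #include <cstddef>

    // Copy-then-sort with an alias guard: skip the initializing copy when
    // input and output are the same buffer. 'cpu_sort_path' is illustrative.
    void cpu_sort_path(const double* src, double* target, std::size_t volume)
    {
      if (src != target) std::copy(src, src + volume, target);
      std::sort(target, target + volume);  // sort data in place
    }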
diff --git a/src/cunumeric/sort/sort_omp.cc b/src/cunumeric/sort/sort_omp.cc
index c26d606a5..b3afc6019 100644
--- a/src/cunumeric/sort/sort_omp.cc
+++ b/src/cunumeric/sort/sort_omp.cc
@@ -96,11 +96,12 @@ struct SortImplBody<VariantKind::OMP, CODE, DIM> {
       AccessorWO<VAL, DIM> output = output_array.write_accessor<VAL, DIM>(rect);
 
       // init output values
-      auto* src = input.ptr(rect.lo);
-      std::copy(src, src + volume, output.ptr(rect.lo));
+      auto* src    = input.ptr(rect.lo);
+      auto* target = output.ptr(rect.lo);
+      if (src != target) std::copy(src, src + volume, target);
 
       // sort data in place
-      thrust_local_sort_inplace(output.ptr(rect.lo), nullptr, volume, sort_dim_size, stable);
+      thrust_local_sort_inplace(target, nullptr, volume, sort_dim_size, stable);
     }
   }
 };
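Taken together, these last patches remove one device-to-device staging copy on the GPU path, one temporary thunk on the Python side for plain sorts, and the initializing copy on both CPU paths when buffers alias. For reference, the argsort semantics these kernels implement, mirrored with the same iota-then-stable-sort idea the OpenMP path uses (self-contained and illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <numeric>
    #include <vector>

    int main()
    {
      std::vector<double> keys{3.0, 1.0, 2.0};
      std::vector<long> idx(keys.size());
      std::iota(idx.begin(), idx.end(), 0L);  // 0, 1, 2
      std::stable_sort(idx.begin(), idx.end(),
                       [&](long a, long b) { return keys[a] < keys[b]; });
      // idx is now {1, 2, 0}: the argsort of keys
      assert(keys[idx[0]] <= keys[idx[1]] && keys[idx[1]] <= keys[idx[2]]);
      return 0;
    }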