nv-legate · mfoerste4 · Mar 22, 2022 · Feb 8, 2022 · Feb 8, 2022 · Feb 8, 2022
diff --git a/cunumeric/array.py b/cunumeric/array.py
@@ -2765,6 +2765,28 @@ def setflags(self, write=None, align=None, uic=None):
         """
         self.__array__().setflags(write=write, align=align, uic=uic)
 
+    def sort(self, axis=-1, kind="quicksort", order=None):
+        """a.sort(axis=-1, kind="quicksort", order=None)
+
+        Sort this array in-place.
+
+        The array's own thunk is used as both input and output of the sort.
+        """
+        self._thunk.sort(rhs=self._thunk, axis=axis, kind=kind, order=order)
+
+    def argsort(self, axis=-1, kind="quicksort", order=None):
+        """a.argsort(axis=-1, kind="quicksort", order=None)
+
+        Return the indices that would sort this array.
+
+        The indices go to a new int64 array; this array is not modified.
+        """
+        result = ndarray(self.shape, np.int64)
+        result._thunk.sort(
+            rhs=self._thunk, argsort=True, axis=axis, kind=kind, order=order
+        )
+        return result
+
     def squeeze(self, axis=None):
         """a.squeeze(axis=None)
 

diff --git a/cunumeric/config.py b/cunumeric/config.py
@@ -100,6 +100,7 @@ class CuNumericOpCode(IntEnum):
     RAND = _cunumeric.CUNUMERIC_RAND
     READ = _cunumeric.CUNUMERIC_READ
     SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED
+    SORT = _cunumeric.CUNUMERIC_SORT
     SYRK = _cunumeric.CUNUMERIC_SYRK
     TILE = _cunumeric.CUNUMERIC_TILE
     TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
@@ -32,6 +32,7 @@
     UnaryRedCode,
 )
 from .linalg.cholesky import cholesky
+from .sort import sort
 from .thunk import NumPyThunk
 from .utils import get_arg_value_dtype
 
@@ -1541,3 +1542,21 @@ def unique(self):
             )
 
         return result
+
+    @auto_convert([1])
+    def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None):
+        """Sort ``rhs`` along ``axis``; this thunk receives the result.
+
+        ``argsort`` and the stability flag (``kind == "stable"``) are
+        forwarded to the sort implementation imported from ``.sort``.
+        """
+        if order is not None:
+            raise NotImplementedError(
+                "cuNumeric does not support sorting with 'order' as "
+                "ndarray only supports numeric values"
+            )
+        out_of_range = axis is not None and not -rhs.ndim <= axis < rhs.ndim
+        if out_of_range:
+            raise ValueError("invalid axis")
+
+        sort(self, rhs, argsort, axis, kind == "stable")
diff --git a/cunumeric/eager.py b/cunumeric/eager.py
@@ -502,6 +502,16 @@ def nonzero(self):
                 result += (EagerArray(self.runtime, array),)
             return result
 
+    def sort(self, rhs, argsort=False, axis=-1, kind="quicksort", order=None):
+        """Store the sorted ``rhs`` (or its sorting indices) in this array."""
+        self.check_eager_args(rhs, axis, kind, order)
+        if self.deferred is not None:
+            self.deferred.sort(rhs, argsort, axis, kind, order)
+            return
+        # eager path: np.sort and np.argsort take the same positional args
+        sorter = np.argsort if argsort else np.sort
+        self.array = sorter(rhs.array, axis, kind, order)
+
     def random_uniform(self):
         if self.deferred is not None:
             self.deferred.random_uniform()

diff --git a/cunumeric/module.py b/cunumeric/module.py
@@ -5721,6 +5721,168 @@ def unique(
 # Sorting, searching, and counting
 ##################################
 
+# Sorting
+
+
+@add_boilerplate("a")
+def argsort(a, axis=-1, kind="quicksort", order=None):
+    """
+
+    Returns the indices that would sort an array.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    axis : int or None, optional
+        Axis along which to sort. Defaults to -1 (the last axis). If None,
+        the flattened array is used.
+    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
+        Default is 'quicksort'. The underlying sort algorithm might vary;
+        effectively only 'stable' versus *not* 'stable' is distinguished.
+    order : str or list of str, optional
+        Currently not supported.
+
+    Returns
+    -------
+    index_array : ndarray of ints
+        Array of indices that sort `a` along the specified axis. It has the
+        same shape as `a`, or is flattened if `axis` is None.
+
+    Notes
+    -----
+    The current implementation has only limited support for distributed data.
+    Distributed 1-D or flattened data will be broadcast.
+
+    See Also
+    --------
+    numpy.argsort
+
+    Availability
+    ------------
+    Multiple GPUs, Single CPU
+    """
+
+    indices = ndarray(a.shape, np.int64)
+    indices._thunk.sort(
+        rhs=a._thunk, argsort=True, axis=axis, kind=kind, order=order
+    )
+    return indices
+
+
+def msort(a):
+    """
+
+    Returns a copy of an array sorted along the first axis.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+
+    Returns
+    -------
+    out : ndarray
+        Sorted array with the same dtype and shape as `a`.
+
+    Notes
+    -----
+    The current implementation has only limited support for distributed data.
+    Distributed 1-D data will be broadcast.
+
+    See Also
+    --------
+    numpy.msort
+
+    Availability
+    ------------
+    Multiple GPUs, Single CPU
+    """
+    return sort(a, axis=0)
+
+
+@add_boilerplate("a")
+def sort(a, axis=-1, kind="quicksort", order=None):
+    """
+
+    Returns a sorted copy of an array.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    axis : int or None, optional
+        Axis along which to sort. Defaults to -1 (the last axis). If None,
+        the flattened array is used.
+    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
+        Default is 'quicksort'. The underlying sort algorithm might vary;
+        effectively only 'stable' versus *not* 'stable' is distinguished.
+    order : str or list of str, optional
+        Currently not supported.
+
+    Returns
+    -------
+    out : ndarray
+        Sorted array with the same dtype and shape as `a`. If `axis` is
+        None the result is flattened.
+
+    Notes
+    -----
+    The current implementation has only limited support for distributed data.
+    Distributed 1-D or flattened data will be broadcast.
+
+    See Also
+    --------
+    numpy.sort
+
+    Availability
+    ------------
+    Multiple GPUs, Single CPU
+    """
+    sorted_copy = ndarray(a.shape, a.dtype)
+    sorted_copy._thunk.sort(rhs=a._thunk, axis=axis, kind=kind, order=order)
+    return sorted_copy
+
+
+@add_boilerplate("a")
+def sort_complex(a):
+    """
+
+    Returns a copy of an array sorted along the last axis. Sorts the
+    real part first, the imaginary part second.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+
+    Returns
+    -------
+    out : ndarray, complex
+        Sorted array with same shape as `a`; real input is cast to complex.
+
+    Notes
+    -----
+    The current implementation has only limited support for distributed data.
+    Distributed 1-D data will be broadcast.
+
+    See Also
+    --------
+    numpy.sort_complex
+
+    Availability
+    ------------
+    Multiple GPUs, Single CPU
+    """
+    result = sort(a)
+    if np.issubdtype(result.dtype, np.complexfloating):
+        return result
+    # Real input is promoted the way numpy.sort_complex does it: complex64
+    # for 8/16-bit integers, complex128 otherwise (no precision is lost).
+    target = np.complex64 if result.dtype.char in "bhBH" else np.complex128
+    return result.astype(target, copy=True)
+
+
 # Searching
 
 

diff --git a/cunumeric/sort.py b/cunumeric/sort.py
@@ -0,0 +1,101 @@
+# Copyright 2022 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from cunumeric.config import CuNumericOpCode
+
+from legate.core import types as ty
+
+
+def sort_flattened(output, input, argsort, stable):
+    """Sort `input` flattened in C order; rebind `output` to the result."""
+    flat = input.reshape((input.size,), order="C")
+    flat_sorted = output.runtime.create_empty_thunk(
+        flat.shape, dtype=output.dtype, inputs=(flat,)
+    )
+    sort(flat_sorted, flat, argsort, stable=stable)
+    # the output takes over the base of the 1-D sort result
+    output.base = flat_sorted.base
+    output.numpy_array = None
+
+
+def sort_swapped(output, input, argsort, sort_axis, stable):
+    """Sort along a non-last axis by swapping it with the last axis.
+
+    The swapped view is deep-copied, sorted along its last axis, and the
+    axes of the result are swapped back before `output` is rebound to it.
+    """
+    last_axis = input.ndim - 1
+    assert 0 <= sort_axis < last_axis
+    swapped_view = input.swapaxes(sort_axis, last_axis)
+    materialized = output.runtime.create_empty_thunk(
+        swapped_view.shape, dtype=input.dtype, inputs=(input, swapped_view)
+    )
+    materialized.copy(swapped_view, deep=True)
+    sorted_swapped = output.runtime.create_empty_thunk(
+        materialized.shape, dtype=output.dtype, inputs=(materialized,)
+    )
+    sort(sorted_swapped, materialized, argsort, stable=stable)
+    output.base = sorted_swapped.swapaxes(last_axis, sort_axis).base
+    output.numpy_array = None
+
+
+def sort_task(output, input, argsort, stable):
+    """Create and run the SORT task for `input`; `output` gets the result."""
+    task = output.context.create_task(CuNumericOpCode.SORT)
+    needs_unbound_output = output.runtime.num_gpus > 1 and input.ndim == 1
+
+    if needs_unbound_output:
+        unbound = output.runtime.create_unbound_thunk(dtype=output.dtype)
+        task.add_output(unbound.base)
+    else:
+        task.add_output(output.base)
+        task.add_alignment(output.base, input.base)
+
+    task.add_input(input.base)
+
+    if output.ndim > 1:
+        task.add_broadcast(input.base, input.ndim - 1)
+    elif output.runtime.num_gpus > 1:
+        task.add_nccl_communicator()
+    elif output.runtime.num_gpus == 0 and output.runtime.num_procs > 1:
+        # Distributed 1D sort on CPU not supported yet
+        task.add_broadcast(input.base)
+    # NOTE(review): the shape below is passed as int32 -- confirm it suffices
+    task.add_scalar_arg(argsort, bool)  # return indices flag
+    task.add_scalar_arg(input.base.shape, (ty.int32,))
+    task.add_scalar_arg(stable, bool)
+    task.execute()
+    # the output takes over the base of the unbound thunk
+    if needs_unbound_output:
+        output.base = unbound.base
+        output.numpy_array = None
+
+
+def sort(output, input, argsort, axis=-1, stable=False):
+    """Sort `input` into `output` along `axis`; None sorts it flattened."""
+    if axis is None and input.ndim > 1:
+        sort_flattened(output, input, argsort, stable)
+        return
+    if axis is None:
+        axis = 0  # at most one dimension here, nothing to flatten
+    elif axis < 0:
+        axis = input.ndim + axis  # normalize a negative axis
+    # Compare by value: `is not` tested object identity, which fails for
+    # equal ints that are distinct objects (e.g. numpy integer scalars).
+    if axis != input.ndim - 1:
+        sort_swapped(output, input, argsort, axis, stable)
+    else:
+        sort_task(output, input, argsort, stable)  # sort along last axis