From a4cac72e5ceb1c6e5a94470e04f0f77867ea02e8 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 30 Mar 2022 19:05:18 -0700 Subject: [PATCH 1/2] Use ufuncs in metamethods --- cunumeric/_ufunc/ufunc.py | 6 +- cunumeric/array.py | 441 +++++++++++++------------------------- 2 files changed, 155 insertions(+), 292 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 1d02bdb4d..57fc146b5 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -132,7 +132,7 @@ def _maybe_cast_input(self, arr, to_dtype, casting): if arr.dtype == to_dtype: return arr - if not np_can_cast(arr.dtype, to_dtype): + if not np_can_cast(arr.dtype, to_dtype, casting=casting): raise TypeError( f"Cannot cast ufunc '{self._name}' input from " f"{arr.dtype} to {to_dtype} with casting rule '{casting}'" @@ -244,7 +244,7 @@ def __call__( out = result else: if out.dtype != res_dtype: - if not np_can_cast(res_dtype, out.dtype): + if not np_can_cast(res_dtype, out.dtype, casting=casting): raise TypeError( f"Cannot cast ufunc '{self._name}' output from " f"{res_dtype} to {out.dtype} with casting rule " @@ -393,7 +393,7 @@ def __call__( out = result else: if out.dtype != res_dtype: - if not np_can_cast(res_dtype, out.dtype): + if not np_can_cast(res_dtype, out.dtype, casting=casting): raise TypeError( f"Cannot cast ufunc '{self._name}' output from " f"{res_dtype} to {out.dtype} with casting rule " diff --git a/cunumeric/array.py b/cunumeric/array.py index 7d1ddad00..aa094ab02 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -23,7 +23,7 @@ from legate.core import Array -from .config import BinaryOpCode, UnaryOpCode, UnaryRedCode +from .config import UnaryOpCode, UnaryRedCode from .coverage import clone_class from .runtime import runtime from .utils import dot_modes @@ -558,14 +558,9 @@ def __abs__(self): """ # Handle the nice case of it being unsigned - if ( - self.dtype.type == np.uint16 - or self.dtype.type == np.uint32 - or self.dtype.type == np.uint64 - or self.dtype.type == np.bool_ - ): - return self - return self._perform_unary_op(UnaryOpCode.ABSOLUTE, self) + from ._ufunc import absolute + + return absolute def __add__(self, rhs): """a.__add__(value, /) @@ -577,8 +572,9 @@ def __add__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.ADD, self, rhs_array) + from ._ufunc import add + + return add(self, rhs) def __and__(self, rhs): """a.__and__(value, /) @@ -590,10 +586,9 @@ def __and__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.LOGICAL_AND, self, rhs_array - ) + from ._ufunc import logical_and + + return logical_and(self, rhs) def __array__(self, dtype=None): """a.__array__([dtype], /) @@ -694,7 +689,7 @@ def __div__(self, rhs): Multiple GPUs, Multiple CPUs """ - return self._internal_truediv(rhs, inplace=False) + return self.__truediv__(rhs) def __divmod__(self, rhs): """a.__divmod__(value, /) @@ -706,8 +701,9 @@ def __divmod__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.DIVMOD, self, rhs_array) + raise NotImplementedError( + "cunumeric.ndarray doesn't support __divmod__ yet" + ) def __eq__(self, rhs): """a.__eq__(value, /) @@ -719,10 +715,9 @@ def __eq__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - 
BinaryOpCode.EQUAL, self, rhs_array, out_dtype=np.dtype(np.bool_) - ) + from ._ufunc import equal + + return equal(self, rhs) def __float__(self): """a.__float__(/) @@ -742,10 +737,9 @@ def __floordiv__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.FLOOR_DIVIDE, self, rhs_array - ) + from ._ufunc import floor_divide + + return floor_divide(self, rhs) def __format__(self, *args, **kwargs): return self.__array__().__format__(*args, **kwargs) @@ -760,13 +754,9 @@ def __ge__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.GREATER_EQUAL, - self, - rhs_array, - out_dtype=np.dtype(np.bool_), - ) + from ._ufunc import greater_equal + + return greater_equal(self, rhs) # __getattribute__ @@ -805,10 +795,9 @@ def __gt__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.GREATER, self, rhs_array, out_dtype=np.dtype(np.bool_) - ) + from ._ufunc import greater + + return greater(self, rhs) def __hash__(self, *args, **kwargs): raise TypeError("unhashable type: cunumeric.ndarray") @@ -823,9 +812,9 @@ def __iadd__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op(BinaryOpCode.ADD, self, rhs_array, out=self) - return self + from ._ufunc import add + + return add(self, rhs, out=self) def __iand__(self, rhs): """a.__iand__(value, /) @@ -837,11 +826,9 @@ def __iand__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.LOGICAL_AND, self, rhs_array, out=self - ) - return self + from ._ufunc import logical_and + + return logical_and(self, rhs, out=self) def __idiv__(self, rhs): """a.__idiv__(value, /) @@ -853,7 +840,7 @@ def __idiv__(self, rhs): Multiple GPUs, Multiple CPUs """ - return self._internal_truediv(rhs, inplace=True) + return self.__itruediv__(rhs) def __ifloordiv__(self, rhs): """a.__ifloordiv__(value, /) @@ -865,11 +852,9 @@ def __ifloordiv__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.FLOOR_DIVIDE, self, rhs_array, out=self - ) - return self + from ._ufunc import floor_divide + + return floor_divide(self, rhs, out=self) def __ilshift__(self, rhs): """a.__ilshift__(value, /) @@ -881,11 +866,9 @@ def __ilshift__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.SHIFT_LEFT, self, rhs_array, out=self - ) - return self + from ._ufunc import left_shift + + return left_shift(self, rhs, out=self) def __imod__(self, rhs): """a.__imod__(value, /) @@ -897,11 +880,9 @@ def __imod__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.MODULUS, self, rhs_array, out=self - ) - return self + from ._ufunc import remainder + + return remainder(self, rhs, out=self) def __imul__(self, rhs): """a.__imul__(value, /) @@ -913,11 +894,9 @@ def __imul__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.MULTIPLY, self, rhs_array, out=self - ) - return self + from ._ufunc import multiply + + return multiply(self, rhs, out=self) def __int__(self): """a.__int__(/) @@ 
-939,11 +918,13 @@ def __invert__(self): """ if self.dtype == np.bool_: # Boolean values are special, just do logical NOT - return self._perform_unary_op( - UnaryOpCode.LOGICAL_NOT, self, out_dtype=np.dtype(np.bool_) - ) + from ._ufunc import logical_not + + return logical_not(self) else: - return self._perform_unary_op(UnaryOpCode.INVERT, self) + from ._ufunc import invert + + return invert(self) def __ior__(self, rhs): """a.__ior__(/) @@ -955,11 +936,9 @@ def __ior__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.LOGICAL_OR, self, rhs_array, out=self - ) - return self + from ._ufunc import logical_or + + return logical_or(self, rhs, out=self) def __ipow__(self, rhs): """a.__ipow__(/) @@ -971,9 +950,9 @@ def __ipow__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op(BinaryOpCode.POWER, self, rhs_array, out=self) - return self + from ._ufunc import power + + return power(self, rhs, out=self) def __irshift__(self, rhs): """a.__irshift__(/) @@ -985,11 +964,9 @@ def __irshift__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.SHIFT_RIGHT, self, rhs_array, out=self - ) - return self + from ._ufunc import right_shift + + return right_shift(self, rhs, out=self) def __iter__(self): """a.__iter__(/)""" @@ -1005,54 +982,9 @@ def __isub__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.SUBTRACT, self, rhs_array, out=self - ) - return self - - def _internal_truediv(self, rhs, inplace): - rhs_array = convert_to_cunumeric_ndarray(rhs) - self_array = self - # Convert any non-floats to floating point arrays - if self_array.dtype.kind != "f" and self_array.dtype.kind != "c": - self_type = np.dtype(np.float64) - else: - self_type = self_array.dtype - if rhs_array.dtype.kind != "f" and rhs_array.dtype.kind != "c": - if inplace: - rhs_type = self_type - else: - rhs_type = np.dtype(np.float64) - else: - rhs_type = rhs_array.dtype - # If the types don't match then align them - if self_type != rhs_type: - common_type = self.find_common_type(self_array, rhs_array) - else: - common_type = self_type - if self_array.dtype != common_type: - temp = ndarray( - self_array.shape, - dtype=common_type, - inputs=(self_array,), - ) - temp._thunk.convert(self_array._thunk, warn=False) - self_array = temp - if rhs_array.dtype != common_type: - temp = ndarray( - rhs_array.shape, - dtype=common_type, - inputs=(rhs_array,), - ) - temp._thunk.convert(rhs_array._thunk, warn=False) - rhs_array = temp - return self._perform_binary_op( - BinaryOpCode.DIVIDE, - self_array, - rhs_array, - out=self if inplace else None, - ) + from ._ufunc import subtract + + return subtract(self, rhs, out=self) def __itruediv__(self, rhs): """a.__itruediv__(/) @@ -1064,7 +996,9 @@ def __itruediv__(self, rhs): Multiple GPUs, Multiple CPUs """ - return self._internal_truediv(rhs, inplace=True) + from ._ufunc import true_divide + + return true_divide(self, rhs, out=self) def __ixor__(self, rhs): """a.__ixor__(/) @@ -1076,11 +1010,9 @@ def __ixor__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - self._perform_binary_op( - BinaryOpCode.LOGICAL_XOR, self, rhs_array, out=self - ) - return self + from ._ufunc import logical_xor + + return logical_xor(self, rhs, out=self) def __le__(self, 
rhs): """a.__le__(value, /) @@ -1092,13 +1024,9 @@ def __le__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.LESS_EQUAL, - self, - rhs_array, - out_dtype=np.dtype(np.bool_), - ) + from ._ufunc import less_equal + + return less_equal(self, rhs) def __len__(self): """a.__len__(/) @@ -1118,10 +1046,9 @@ def __lshift__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.SHIFT_LEFT, self, rhs_array - ) + from ._ufunc import left_shift + + return left_shift(self, rhs) def __lt__(self, rhs): """a.__lt__(value, /) @@ -1133,10 +1060,9 @@ def __lt__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.LESS, self, rhs_array, out_dtype=np.dtype(np.bool_) - ) + from ._ufunc import less + + return less(self, rhs) def __matmul__(self, value): """a.__matmul__(value, /) @@ -1160,8 +1086,9 @@ def __mod__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.MOD, self, rhs_array) + from ._ufunc import remainder + + return remainder(self, rhs) def __mul__(self, rhs): """a.__mul__(value, /) @@ -1173,8 +1100,9 @@ def __mul__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.MULTIPLY, self, rhs_array) + from ._ufunc import multiply + + return multiply(self, rhs) def __ne__(self, rhs): """a.__ne__(value, /) @@ -1186,13 +1114,9 @@ def __ne__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.NOT_EQUAL, - self, - rhs_array, - out_dtype=np.dtype(np.bool_), - ) + from ._ufunc import not_equal + + return not_equal(self, rhs) def __neg__(self): """a.__neg__(value, /) @@ -1204,7 +1128,9 @@ def __neg__(self): Multiple GPUs, Multiple CPUs """ - return self._perform_unary_op(UnaryOpCode.NEGATIVE, self) + from ._ufunc import negative + + return negative(self) # __new__ @@ -1258,10 +1184,9 @@ def __or__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.LOGICAL_OR, self, rhs_array - ) + from ._ufunc import logical_or + + return logical_or(self, rhs) def __pos__(self): """a.__pos__(value, /) @@ -1274,7 +1199,9 @@ def __pos__(self): """ # the positive opeartor is equivalent to copy - return self._perform_unary_op(UnaryOpCode.COPY, self) + from ._ufunc import positive + + return positive(self) def __pow__(self, rhs): """a.__pow__(value, /) @@ -1286,8 +1213,9 @@ def __pow__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.POWER, self, rhs_array) + from ._ufunc import power + + return power(self, rhs) def __radd__(self, lhs): """a.__radd__(value, /) @@ -1299,8 +1227,9 @@ def __radd__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.ADD, lhs_array, self) + from ._ufunc import add + + return add(lhs, self) def __rand__(self, lhs): """a.__rand__(value, /) @@ -1312,10 +1241,9 @@ def __rand__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op( - 
BinaryOpCode.LOGICAL_AND, lhs_array, self - ) + from ._ufunc import logical_and + + return logical_and(lhs, self) def __rdiv__(self, lhs): """a.__rdiv__(value, /) @@ -1327,8 +1255,9 @@ def __rdiv__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return lhs_array._internal_truediv(self, inplace=False) + from ._ufunc import true_divide + + return true_divide(lhs, self) def __rdivmod__(self, lhs): """a.__rdivmod__(value, /) @@ -1340,8 +1269,9 @@ def __rdivmod__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.DIVMOD, lhs_array, self) + raise NotImplementedError( + "cunumeric.ndarray doesn't support __rdivmod__ yet" + ) def __reduce__(self, *args, **kwargs): """a.__reduce__(/) @@ -1376,10 +1306,9 @@ def __rfloordiv__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op( - BinaryOpCode.FLOOR_DIVIDE, lhs_array, self - ) + from ._ufunc import floor_divide + + return floor_divide(lhs, self) def __rmod__(self, lhs): """a.__rmod__(value, /) @@ -1391,8 +1320,9 @@ def __rmod__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.MOD, lhs_array, self) + from ._ufunc import remainder + + return remainder(lhs, self) def __rmul__(self, lhs): """a.__rmul__(value, /) @@ -1404,8 +1334,9 @@ def __rmul__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.MULTIPLY, lhs_array, self) + from ._ufunc import multiply + + return multiply(lhs, self) def __ror__(self, lhs): """a.__ror__(value, /) @@ -1417,10 +1348,9 @@ def __ror__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op( - BinaryOpCode.LOGICAL_OR, lhs_array, self - ) + from ._ufunc import logical_or + + return logical_or(lhs, self) def __rpow__(self, lhs): """__rpow__(value, /) @@ -1432,8 +1362,9 @@ def __rpow__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.POWER, lhs_array, self) + from ._ufunc import power + + return power(lhs, self) def __rshift__(self, rhs): """a.__rshift__(value, /) @@ -1445,10 +1376,9 @@ def __rshift__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.SHIFT_RIGHT, self, rhs_array - ) + from ._ufunc import right_shift + + return right_shift(self, rhs) def __rsub__(self, lhs): """a.__rsub__(value, /) @@ -1460,8 +1390,9 @@ def __rsub__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op(BinaryOpCode.SUBTRACT, lhs_array, self) + from ._ufunc import subtract + + return subtract(lhs, self) def __rtruediv__(self, lhs): """a.__rtruediv__(value, /) @@ -1473,8 +1404,9 @@ def __rtruediv__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return lhs_array._internal_truediv(self, inplace=False) + from ._ufunc import true_divide + + return true_divide(lhs, self) def __rxor__(self, lhs): """a.__rxor__(value, /) @@ -1486,10 +1418,9 @@ def __rxor__(self, lhs): Multiple GPUs, Multiple CPUs """ - lhs_array = convert_to_cunumeric_ndarray(lhs) - return self._perform_binary_op( - BinaryOpCode.LOGICAL_XOR, 
lhs_array, self - ) + from ._ufunc import bitwise_xor + + return bitwise_xor(lhs, self) # __setattr__ @@ -1543,8 +1474,9 @@ def __sub__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op(BinaryOpCode.SUBTRACT, self, rhs_array) + from ._ufunc import subtract + + return subtract(self, rhs) def __str__(self): """a.__str__(/) @@ -1568,7 +1500,9 @@ def __truediv__(self, rhs): Multiple GPUs, Multiple CPUs """ - return self._internal_truediv(rhs, inplace=False) + from ._ufunc import true_divide + + return true_divide(self, rhs) def __xor__(self, rhs): """a.__xor__(value, /) @@ -1580,10 +1514,9 @@ def __xor__(self, rhs): Multiple GPUs, Multiple CPUs """ - rhs_array = convert_to_cunumeric_ndarray(rhs) - return self._perform_binary_op( - BinaryOpCode.LOGICAL_XOR, rhs_array, self - ) + from ._ufunc import bitwise_xor + + return bitwise_xor(self, rhs) @add_boilerplate() def all( @@ -2132,12 +2065,10 @@ def dot(self, rhs, out=None): from .module import _contract # work around circular import if self.ndim == 0 or rhs.ndim == 0: - return self._perform_binary_op( - BinaryOpCode.MULTIPLY, - self, - rhs, - out=out, - ) + from ._ufunc import multiply + + return multiply(self, rhs, out=out) + (self_modes, rhs_modes, out_modes) = dot_modes(self.ndim, rhs.ndim) return _contract( self_modes, @@ -2428,9 +2359,8 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=False): # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype if dtype.kind == "f": - sum_array._internal_truediv( + sum_array.__itruediv__( np.array(divisor, dtype=sum_array.dtype), - inplace=True, ) else: sum_array.__ifloordiv__(np.array(divisor, dtype=sum_array.dtype)) @@ -3322,73 +3252,6 @@ def _perform_unary_reduction( ) return dst - # Return a new cuNumeric array for a binary operation - @classmethod - def _perform_binary_op( - cls, - op, - one, - two, - out=None, - dtype=None, - out_dtype=None, - where=True, - extra_args=None, - ): - args = (one, two, where) - - # Compute the output shape - shapes = [one.shape, two.shape] - if isinstance(where, ndarray): - shapes.append(where.shape) - if out is not None: - shapes.append(out.shape) - out_shape = broadcast_shapes(*shapes) - - if out_dtype is None: - out_dtype = ( - dtype if dtype is not None else cls.find_common_type(one, two) - ) - - if out is not None: - if out.shape != out_shape: - raise ValueError( - "non-broadcastable output operand with shape " - + str(out.shape) - + " doesn't match the broadcast shape " - + str(out_shape) - ) - else: - out = ndarray(shape=out_shape, dtype=out_dtype, inputs=args) - - # Quick exit - if where is False: - return out - - common_type = cls.find_common_type(one, two) - one = one._maybe_convert(common_type, args) - two = two._maybe_convert(common_type, args) - - if out.dtype != out_dtype: - temp = ndarray(shape=out_shape, dtype=out_dtype, inputs=args) - temp._thunk.binary_op( - op, - one._thunk, - two._thunk, - cls._get_where_thunk(where, out_shape), - extra_args, - ) - out._thunk.convert(temp._thunk) - else: - out._thunk.binary_op( - op, - one._thunk, - two._thunk, - cls._get_where_thunk(where, out_shape), - extra_args, - ) - return out - @classmethod def _perform_binary_reduction( cls, From c77cf6b8dfe26852124ffca7c1257162c63a99f1 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 30 Mar 2022 22:22:04 -0700 Subject: [PATCH 2/2] Fix a silly mistake --- cunumeric/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/cunumeric/array.py b/cunumeric/array.py index aa094ab02..77cad97c2 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -560,7 +560,7 @@ def __abs__(self): # Handle the nice case of it being unsigned from ._ufunc import absolute - return absolute + return absolute(self) def __add__(self, rhs): """a.__add__(value, /)
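
For reference, the delegation pattern these two commits introduce can be exercised end to end with ordinary operator syntax: each metamethod now forwards to the matching ufunc, so casting rules and `out=` handling are shared between direct ufunc calls and operators. The short standalone sketch below is illustrative only and is not part of either patch; the `import cunumeric as num` alias and the array-construction calls follow the usual NumPy-style API and are assumed here rather than taken from the diff.

    # Illustrative sketch (not part of the patches): operators route through ufuncs.
    import cunumeric as num

    a = num.arange(4, dtype="float32")
    b = num.ones(4, dtype="float32")

    c = a + b        # __add__ forwards to add(a, b)
    a += b           # __iadd__ forwards to add(a, b, out=a), updating a in place
    mask = a > 2.0   # __gt__ forwards to greater(a, b), yielding a boolean array
    q = a / 3        # __truediv__ forwards to true_divide, which handles promotion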