From 20f4bedfc90da873eb215b3564781f0a8be6fb86 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Thu, 4 Aug 2022 17:06:33 -0600
Subject: [PATCH 01/14] fixing advanced indexing operation for empty arrays

---
 cunumeric/deferred.py                        | 12 ++++
 src/cunumeric/index/advanced_indexing.cc     |  8 +--
 src/cunumeric/index/advanced_indexing.cu     |  5 --
 src/cunumeric/index/advanced_indexing_omp.cc |  7 +--
 tests/integration/test_advanced_indexing.py  | 62 ++++++++++++++++++++
 5 files changed, 76 insertions(+), 18 deletions(-)
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index c99c1c9e8..3ba7f46c2 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -518,6 +518,17 @@ def _create_indexing_array(
                     "np.dtype[Any]", rhs.runtime.get_point_type(N)
                 )
 
+            if key.size == 0:
+                out = cast(
+                    DeferredArray,
+                    self.runtime.create_empty_thunk(
+                        key.shape,
+                        out_dtype,
+                        inputs=[key],
+                    ),
+                )
+                return False, rhs, out, self
+
             # TODO : current implementation of the ND output regions
             # requires out.ndim == rhs.ndim. This will be fixed in the
             # future
@@ -542,6 +553,7 @@ def _create_indexing_array(
             # requires out.ndim == rhs.ndim.
             # The logic below will be removed in the future
             out_dim = rhs.ndim - key_dims + 1
+
             if out_dim != rhs.ndim:
                 out_tmp = out.base
                 for dim in range(rhs.ndim - out_dim):
diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc
index 5bbbff376..b32e90378 100644
--- a/src/cunumeric/index/advanced_indexing.cc
+++ b/src/cunumeric/index/advanced_indexing.cc
@@ -79,11 +79,6 @@ struct AdvancedIndexingImplBody<VariantKind::CPU, CODE, DIM, OUT_TYPE> {
       if (index[p] == true) { size++; }
     }
 
-    if (0 == size) {
-      out_arr.make_empty();
-      return;
-    }
-
     // calculating the shape of the output region for this sub-task
     Point<DIM> extents;
     extents[0] = size;
@@ -94,8 +89,7 @@ struct AdvancedIndexingImplBody<VariantKind::CPU, CODE, DIM, OUT_TYPE> {
     for (size_t i = DIM - key_dim + 1; i < DIM; i++) extents[i] = 1;
 
     auto out = out_arr.create_output_buffer<OUT_TYPE, DIM>(extents, true);
-
-    compute_output(out, input, index, pitches, rect, volume, key_dim, skip_size);
+    if (size > 0) compute_output(out, input, index, pitches, rect, volume, key_dim, skip_size);
   }
 };
 
diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu
index a5217b212..fa9e78ba9 100644
--- a/src/cunumeric/index/advanced_indexing.cu
+++ b/src/cunumeric/index/advanced_indexing.cu
@@ -132,11 +132,6 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
 
     size = compute_size(index, pitches, rect, volume, stream, offsets, skip_size, key_dim);
 
-    if (0 == size) {
-      out_arr.make_empty();
-      return;
-    }
-
     // calculating the shape of the output region for this sub-task
     Point<DIM> extents;
     extents[0] = size;
diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc
index b78d3e826..8ad15c43c 100644
--- a/src/cunumeric/index/advanced_indexing_omp.cc
+++ b/src/cunumeric/index/advanced_indexing_omp.cc
@@ -102,11 +102,6 @@ struct AdvancedIndexingImplBody<VariantKind::OMP, CODE, DIM, OUT_TYPE> {
     size_t size =
       compute_output_offsets(offsets, index, pitches, rect, volume, skip_size, max_threads);
 
-    if (0 == size) {
-      out_arr.make_empty();
-      return;
-    }
-
     // calculating the shape of the output region for this sub-task
     Point<DIM> extents;
     extents[0] = size;
@@ -117,7 +112,7 @@ struct AdvancedIndexingImplBody<VariantKind::OMP, CODE, DIM, OUT_TYPE> {
     for (size_t i = DIM - key_dim + 1; i < DIM; i++) extents[i] = 1;
 
     auto out = out_arr.create_output_buffer<OUT_TYPE, DIM>(extents, true);
-
+    if (size > 0)
 #pragma omp parallel
     {
       const int tid   = omp_get_thread_num();
diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py
index d545107d9..b185e8f4f 100644
--- a/tests/integration/test_advanced_indexing.py
+++ b/tests/integration/test_advanced_indexing.py
@@ -34,16 +34,23 @@ def arr_future():
     return num.full((1,), 42)
 
 
+@pytest.fixture
+def arr_empty1d():
+    return num.full((0), 0)
+
+
 idx_region_1d = num.zeros((3,), dtype=np.int64)[2:3]
 idx_future_1d = num.zeros((1,), dtype=np.int64)
 idx_region_0d = num.zeros((3,), dtype=np.int64)[2:3].reshape(())
 idx_future_0d = num.zeros((3,), dtype=np.int64).max()
+idx_empty_1d = num.array([], dtype=int)
 
 val_region_1d = num.full((3,), -1)[2:3]
 val_future_1d = num.full((1,), -1)
 val_region_0d = num.full((3,), -1)[2:3].reshape(())
 val_future_0d = num.full((3,), -1).max()
 
+
 # We use fixtures for `arr` because the `set_item` tests modify
 # their input.
 ARRS = (lazy_fixture("arr_region"), lazy_fixture("arr_future"))
@@ -51,6 +58,13 @@ def arr_future():
 VALS_0D = (val_future_0d,)  # TODO: val_region_0d fails
 IDXS_1D = (idx_region_1d, idx_future_1d)
 VALS_1D = (val_region_1d, val_future_1d)
+ARRS_EMPTY_1D = (
+    lazy_fixture("arr_empty1d"),
+    lazy_fixture("arr_region"),
+    lazy_fixture("arr_future"),
+)
+IDXS_EMPTY_1D = (idx_empty_1d,)
+VALS_EMPTY_1D = (num.array([]),)
 
 
 @pytest.mark.parametrize("idx", IDXS_0D)  # idx = 0
@@ -81,6 +95,54 @@ def test_setitem_scalar_1d(arr, idx, val):
     assert np.array_equal(arr, [-1])
 
 
+@pytest.mark.parametrize("idx", IDXS_EMPTY_1D)  # idx = []
+@pytest.mark.parametrize("arr", ARRS_EMPTY_1D)  # arr = [42], [5], []
+def test_getitem_empty_1d(arr, idx):
+    assert np.array_equal(arr[idx], [])
+
+
+@pytest.mark.parametrize("idx", IDXS_EMPTY_1D)  # idx = []
+@pytest.mark.parametrize("arr", ARRS_EMPTY_1D)  # arr = [42], [5], []
+@pytest.mark.parametrize("val", VALS_EMPTY_1D)  # val = []
+def test_setitem_empty_1d(arr, idx, val):
+    arr[idx] = val
+    assert np.array_equal(arr[idx], [])
+
+
+def test_empty_bool():
+    # empty arrays and indices
+    arr_np = np.array([[]])
+    arr_num = num.array([[]])
+    idx_np = np.array([[]], dtype=bool)
+    idx_num = num.array([[]], dtype=bool)
+    res_np = arr_np[idx_np]
+    res_num = arr_num[idx_num]
+    assert np.array_equal(res_np, res_num)
+
+    res_np = res_np.reshape((0,))
+    res_num = res_num.reshape((0,))
+
+    # set_item
+    val_np = np.array([])
+    val_num = num.array([])
+    arr_np[idx_np] = val_np
+    arr_num[idx_num] = val_num
+    assert np.array_equal(arr_np, arr_num)
+
+    # empty output
+    arr_np = np.array([[-1]])
+    arr_num = num.array([[-1]])
+    idx_np = np.array([[False]], dtype=bool)
+    idx_num = num.array([[False]], dtype=bool)
+    res_np = arr_np[idx_np]
+    res_num = arr_num[idx_num]
+    assert np.array_equal(res_np, res_num)
+
+    arr_np[idx_np] = val_np
+    arr_num[idx_num] = val_num
+    assert np.array_equal(arr_np, arr_num)
+
+
 def test_future_stores():
     # array is a future:
     arr_np = np.array([4])

From 1d28ac2674326ee77ec7a4d2bc5442ed587ee9f6 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 5 Aug 2022 10:36:22 -0600
Subject: [PATCH 02/14] fixing logic for advanced indexing with empty keys

---
 cunumeric/deferred.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 3ba7f46c2..884c6a7be 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -519,10 +519,15 @@ def _create_indexing_array(
                 )
 
             if key.size == 0:
+                out_dim = rhs.ndim - key.ndim + 1
+                out_shape = tuple(
+                    0 for i in range(rhs.ndim - out_dim, out_dim)
+                )
+
                 out = cast(
                     DeferredArray,
                     self.runtime.create_empty_thunk(
-                        key.shape,
+                        out_shape,
                         out_dtype,
                         inputs=[key],
                     ),

From b01dfc40f0cbfd1149aaf5c1c60ea3f55c918225 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 5 Aug 2022 15:12:49 -0600
Subject: [PATCH 03/14] fixing logic for the empty key in advanced indexing
 with bool

---
 cunumeric/deferred.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 884c6a7be..3a17e4238 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -503,6 +503,25 @@ def _create_indexing_array(
                         f"dimension {i} doesn't match to the shape of the"
                         f"index array which is {rhs.shape[i]}"
                     )
+
+            if key.size == 0:
+                rhs_store = rhs.base
+                for i in range(key.ndim - 1):
+                    rhs_store = rhs_store.project(0, 0)
+
+                out = cast(
+                    DeferredArray,
+                    self.runtime.create_empty_thunk(
+                        rhs_store.shape,
+                        rhs.dtype,
+                        inputs=[rhs],
+                    ),
+                )
+
+                out = cast(DeferredArray, out._copy_store(rhs_store))
+
+                return False, rhs, out, self
+
             key_store = key.base
             # bring key to the same shape as rhs
             for i in range(key_store.ndim, rhs.ndim):
@@ -518,22 +537,6 @@ def _create_indexing_array(
                     "np.dtype[Any]", rhs.runtime.get_point_type(N)
                 )
 
-            if key.size == 0:
-                out_dim = rhs.ndim - key.ndim + 1
-                out_shape = tuple(
-                    0 for i in range(rhs.ndim - out_dim, out_dim)
-                )
-
-                out = cast(
-                    DeferredArray,
-                    self.runtime.create_empty_thunk(
-                        out_shape,
-                        out_dtype,
-                        inputs=[key],
-                    ),
-                )
-                return False, rhs, out, self
-
             # TODO : current implementation of the ND output regions
             # requires out.ndim == rhs.ndim. This will be fixed in the
             # future

From bb895773f06470d4d1aeaa035032a8f967f0b2a1 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 5 Aug 2022 15:13:43 -0600
Subject: [PATCH 04/14] fixing logic for ravel when a.size==0 and a.ndim>1

---
 cunumeric/array.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 57824bca1..20303f046 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3017,6 +3017,17 @@ def ravel(self, order="C") -> ndarray:
         Multiple GPUs, Multiple CPUs
 
         """
+        if self.size == 0 and self.ndim > 1:
+            idx = tuple()
+            for i in range(self.ndim):
+                if self.shape[i] != 0:
+                    idx += (0,)
+                else:
+                    idx += (slice(None),)
+            idx = tuple(idx)
+            self = self[idx].copy()
+            return self
+
         return self.reshape(-1, order=order)
 
     def reshape(self, *args, order="C") -> ndarray:

From 4845364acc7ef217e69cdaf01f41ec8f56695b11 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Tue, 9 Aug 2022 15:35:42 -0600
Subject: [PATCH 05/14] adding test for reshaping empty arrays

---
 tests/integration/test_reshape.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/integration/test_reshape.py b/tests/integration/test_reshape.py
index a35d69f3e..edfb2f327 100644
--- a/tests/integration/test_reshape.py
+++ b/tests/integration/test_reshape.py
@@ -59,6 +59,14 @@ def test_ravel(self):
             np.ravel(self.anp),
         )
 
+        a = num.full((1, 0), 1, dtype=int)
+        anp = np.full((1, 0), 1, dtype=int)
+        assert np.array_equal(num.ravel(a), np.ravel(anp))
+
+        a = num.full((0, 1), 1, dtype=int)
+        anp = np.full((0, 1), 1, dtype=int)
+        assert np.array_equal(num.ravel(a), np.ravel(anp))
+
 
 RECT_CASES = [
     (10, 2, 10),

From 108f88ba3c42e3c007211748e50cb06006cdc692 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Mon, 15 Aug 2022 10:35:11 -0600
Subject: [PATCH 06/14] adding tests for zero sizes of arrays

---
 tests/integration/test_advanced_indexing.py | 37 +++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py
index b185e8f4f..fc6895ef5 100644
--- a/tests/integration/test_advanced_indexing.py
+++ b/tests/integration/test_advanced_indexing.py
@@ -109,6 +109,43 @@ def test_setitem_empty_1d(arr, idx, val):
     assert np.array_equal(arr[idx], [])
 
 
+def mk_deferred_array(lib, shape):
+    if np.prod(shape) != 0:
+        return lib.ones(shape)
+    # for shape (2,0,3,4): good_shape = (2,1,3,4)
+    good_shape = tuple(max(1, dim) for dim in shape)
+    # for shape (2,0,3,4): key = [:,[False],:,:]
+    key = tuple([False] if dim == 0 else slice(None) for dim in shape)
+    print("IRINA DEBUG ", key, good_shape)
+    return lib.ones(good_shape)[key]
+
+
+def test_zero_size():
+    for arr_ndim in range(1, LEGATE_MAX_DIM + 1):
+        for idx_ndim in range(1, arr_ndim + 1):
+            for zero_dim in range(arr_ndim):
+                arr_shape = tuple(
+                    0 if dim == zero_dim else 3 for dim in range(arr_ndim)
+                )
+                np_arr = mk_deferred_array(np, arr_shape)
+                print("IRINA DEBUG shape numpy", np_arr.shape)
+                num_arr = mk_deferred_array(num, arr_shape)
+                idx_shape = arr_shape[:idx_ndim]
+                val_shape = (
+                    arr_shape
+                    if idx_ndim == 1
+                    else (np.prod(idx_shape),) + arr_shape[idx_ndim:]
+                )
+                np_idx = np.ones(idx_shape, dtype=np.bool_)
+                num_idx = num.ones(idx_shape, dtype=np.bool_)
+                assert np.array_equal(np_arr[np_idx], num_arr[num_idx])
+                np_val = np.random.random(val_shape)
+                num_val = num.array(np_val)
+                np_arr[np_idx] = np_val
+                num_arr[num_idx] = num_val
+                assert np.array_equal(np_arr, num_arr)
+
+
 def test_empty_bool():
     # empty arrays and indices
     arr_np = np.array([[]])

From dc961027468c7953841c8c34499d5e9ac496b6d7 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Thu, 18 Aug 2022 13:34:24 -0600
Subject: [PATCH 07/14] fixing advanced indexing operation for empty arrays

---
 cunumeric/deferred.py                         | 61 +++++++++++++------
 .../index/advanced_indexing_template.inl      |  5 +-
 src/cunumeric/nullary/fill.cc                 | 20 +++++-
 src/cunumeric/nullary/fill.cu                 | 47 ++++++++++++--
 src/cunumeric/nullary/fill.h                  |  1 +
 src/cunumeric/nullary/fill_omp.cc             | 24 +++++++-
 src/cunumeric/nullary/fill_template.inl       | 42 +++++++++++--
 tests/integration/test_advanced_indexing.py   | 11 +++-
 8 files changed, 174 insertions(+), 37 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 3a17e4238..3626c4ae3 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -482,6 +482,14 @@ def _copy_store(self, store: Any) -> NumPyThunk:
         store_copy.copy(store_to_copy, deep=True)
         return store_copy
 
+    def _fill_with_zero(self):
+        task = self.context.create_auto_task(CuNumericOpCode.FILL)
+        task.add_output(self.base)
+        task.add_scalar_arg(False, bool)
+        task.add_scalar_arg(True, bool)  # fill with 0
+
+        task.execute()
+
     def _create_indexing_array(
         self, key: Any, is_set: bool = False
     ) -> tuple[bool, Any, Any, Any]:
@@ -504,22 +512,29 @@ def _create_indexing_array(
                         f"index array which is {rhs.shape[i]}"
                     )
 
-            if key.size == 0:
-                rhs_store = rhs.base
-                for i in range(key.ndim - 1):
-                    rhs_store = rhs_store.project(0, 0)
+            # if key or rhs are empty, return an empty array with correct shape
+            if key.size == 0 or rhs.size == 0:
+                if rhs.size == 0 and key.size != 0:
+                    # we need to calculate shape of the 0 dim of output region
+                    # even though the size of it is 0
+                    # this can potentially be replaced with COUNT_NONZERO
+                    s = key.nonzero()[0].size
+                else:
+                    s = 0
 
+                out_shape = (s,) + tuple(
+                    rhs.shape[i] for i in range(key.ndim, rhs.ndim)
+                )
                 out = cast(
                     DeferredArray,
                     self.runtime.create_empty_thunk(
-                        rhs_store.shape,
+                        out_shape,
                         rhs.dtype,
                         inputs=[rhs],
                     ),
                 )
 
-                out = cast(DeferredArray, out._copy_store(rhs_store))
-
+                out._fill_with_zero()
                 return False, rhs, out, self
 
             key_store = key.base
@@ -651,8 +666,12 @@ def _create_indexing_array(
                     # in case of the mixed indises we all nonzero
                     # for the bool array
                     k = k.nonzero()
-                    shift += len(k) - 1
-                    tuple_of_arrays += k
+                    if k[0].size == 0:
+                        store = store.project(dim + shift, 0)
+                        store = store.promote(dim + shift, 0)
+                    else:
+                        shift += len(k) - 1
+                        tuple_of_arrays += k
                 else:
                     tuple_of_arrays += (k,)
             else:
@@ -671,6 +690,8 @@ def _create_indexing_array(
             # the store with transformation
             rhs = cast(DeferredArray, self._copy_store(store))
 
+        if len(tuple_of_arrays) == 0:
+            return False, rhs, rhs, self
         if len(tuple_of_arrays) <= rhs.ndim:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
             return True, rhs, output_arr, self
@@ -758,12 +779,10 @@ def get_item(self, key: Any) -> NumPyThunk:
                 self,
             ) = self._create_indexing_array(key)
 
-            store = rhs.base
-
             if copy_needed:
+
                 if rhs.base.kind == Future:
                     rhs = self._convert_future_to_store(rhs)
-                store = rhs.base
                 result: NumPyThunk
                 if index_array.base.kind == Future:
                     index_array = self._convert_future_to_store(index_array)
@@ -777,6 +796,7 @@ def get_item(self, key: Any) -> NumPyThunk:
                         base=result_store,
                         dtype=self.dtype,
                     )
+
                 else:
                     result = self.runtime.create_empty_thunk(
                         index_array.base.shape,
@@ -786,7 +806,7 @@ def get_item(self, key: Any) -> NumPyThunk:
 
                 copy = self.context.create_copy()
                 copy.set_source_indirect_out_of_range(False)
-                copy.add_input(store)
+                copy.add_input(rhs.base)
                 copy.add_source_indirect(index_array.base)
                 copy.add_output(result.base)  # type: ignore
                 copy.execute()
@@ -850,13 +870,15 @@ def set_item(self, key: Any, rhs: Any) -> None:
             if lhs.base.kind == Future:
                 lhs = self._convert_future_to_store(lhs)
 
-            copy = self.context.create_copy()
-            copy.set_target_indirect_out_of_range(False)
+            if index_array.size != 0:
 
-            copy.add_input(rhs_store)
-            copy.add_target_indirect(index_array.base)
-            copy.add_output(lhs.base)
-            copy.execute()
+                copy = self.context.create_copy()
+                copy.set_target_indirect_out_of_range(False)
+
+                copy.add_input(rhs_store)
+                copy.add_target_indirect(index_array.base)
+                copy.add_output(lhs.base)
+                copy.execute()
 
             # TODO this copy will be removed when affine copies are
             # supported in Legion/Realm
@@ -1250,6 +1272,7 @@ def _fill(self, value: Any) -> None:
             task.add_output(self.base)
             task.add_input(value)
             task.add_scalar_arg(argval, bool)
+            task.add_scalar_arg(False, bool)  # fill with zero
 
             task.execute()
 
diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index cbc90b4d4..a632d25bc 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -51,12 +51,13 @@ struct AdvancedIndexingImpl {
 #endif
 
     if (volume == 0) {
+      auto extents = Point<DIM>::ZEROES();
       if (args.is_set) {
         auto empty = create_buffer<Point<DIM>>(0);
-        args.output.return_data(empty, Point<1>(0));
+        args.output.return_data(empty, extents);
       } else {
         auto empty = create_buffer<VAL>(0);
-        args.output.return_data(empty, Point<1>(0));
+        args.output.return_data(empty, extents);
       }
       return;
     }
diff --git a/src/cunumeric/nullary/fill.cc b/src/cunumeric/nullary/fill.cc
index 785028f63..7dfde984c 100644
--- a/src/cunumeric/nullary/fill.cc
+++ b/src/cunumeric/nullary/fill.cc
@@ -31,7 +31,25 @@ struct FillImplBody<VariantKind::CPU, VAL, DIM> {
                   bool dense) const
   {
     auto fill_value = in[0];
-    size_t volume   = rect.volume();
+    fill(out, fill_value, pitches, rect, dense);
+  }
+
+  void operator()(AccessorWO<VAL, DIM> out,
+                  const Pitches<DIM - 1>& pitches,
+                  const Rect<DIM>& rect,
+                  bool dense) const
+  {
+    VAL fill_value = VAL(0);
+    fill(out, fill_value, pitches, rect, dense);
+  }
+
+  void fill(AccessorWO<VAL, DIM> out,
+            VAL& fill_value,
+            const Pitches<DIM - 1>& pitches,
+            const Rect<DIM>& rect,
+            bool dense) const
+  {
+    size_t volume = rect.volume();
     if (dense) {
       auto outptr = out.ptr(rect);
       for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = fill_value;
diff --git a/src/cunumeric/nullary/fill.cu b/src/cunumeric/nullary/fill.cu
index bdbba46d6..7126708f3 100644
--- a/src/cunumeric/nullary/fill.cu
+++ b/src/cunumeric/nullary/fill.cu
@@ -23,18 +23,18 @@ namespace cunumeric {
 
 using namespace Legion;
 
-template <typename ARG, typename ReadAcc>
+template <typename ARG, typename VAL>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  dense_kernel(size_t volume, ARG* out, ReadAcc fill_value)
+  dense_kernel(size_t volume, ARG* out, AccessorRO<VAL, 1> fill_value)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
   out[idx] = fill_value[0];
 }
 
-template <typename WriteAcc, typename ReadAcc, typename Pitches, typename Rect>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  generic_kernel(size_t volume, WriteAcc out, ReadAcc fill_value, Pitches pitches, Rect rect)
+template <typename WriteAcc, typename VAL, typename Pitches, typename Rect>
+static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) generic_kernel(
+  size_t volume, WriteAcc out, AccessorRO<VAL, 1> fill_value, Pitches pitches, Rect rect)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
@@ -42,6 +42,25 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   out[point] = fill_value[0];
 }
 
+template <typename VAL>
+static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+  dense_kernel(size_t volume, VAL* out)
+{
+  const size_t idx = global_tid_1d();
+  if (idx >= volume) return;
+  out[idx] = VAL(0);
+}
+
+template <typename VAL, int32_t DIM, typename Pitches, typename Rect>
+static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+  generic_kernel(size_t volume, AccessorWO<VAL, DIM> out, Pitches pitches, Rect rect)
+{
+  const size_t idx = global_tid_1d();
+  if (idx >= volume) return;
+  auto point = pitches.unflatten(idx, rect.lo);
+  out[point] = VAL(0);
+}
+
 template <typename VAL, int32_t DIM>
 struct FillImplBody<VariantKind::GPU, VAL, DIM> {
   void operator()(AccessorWO<VAL, DIM> out,
@@ -61,6 +80,24 @@ struct FillImplBody<VariantKind::GPU, VAL, DIM> {
     }
     CHECK_CUDA_STREAM(stream);
   }
+
+  // the case when we fill with 0
+  void operator()(AccessorWO<VAL, DIM> out,
+                  const Pitches<DIM - 1>& pitches,
+                  const Rect<DIM>& rect,
+                  bool dense) const
+  {
+    size_t volume       = rect.volume();
+    const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    auto stream         = get_cached_stream();
+    if (dense) {
+      auto outptr = out.ptr(rect);
+      dense_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(volume, outptr);
+    } else {
+      generic_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(volume, out, pitches, rect);
+    }
+    CHECK_CUDA_STREAM(stream);
+  }
 };
 
 /*static*/ void FillTask::gpu_variant(TaskContext& context)
diff --git a/src/cunumeric/nullary/fill.h b/src/cunumeric/nullary/fill.h
index 0330edfd4..caa0f280d 100644
--- a/src/cunumeric/nullary/fill.h
+++ b/src/cunumeric/nullary/fill.h
@@ -24,6 +24,7 @@ struct FillArgs {
   const Array& out;
   const Array& fill_value;
   bool is_argval;
+  bool is_zero;
 };
 
 class FillTask : public CuNumericTask<FillTask> {
diff --git a/src/cunumeric/nullary/fill_omp.cc b/src/cunumeric/nullary/fill_omp.cc
index 6e206f01c..fc079c89a 100644
--- a/src/cunumeric/nullary/fill_omp.cc
+++ b/src/cunumeric/nullary/fill_omp.cc
@@ -36,7 +36,25 @@ struct FillImplBody<VariantKind::OMP, VAL, DIM> {
                   bool dense) const
   {
     auto fill_value = in[0];
-    size_t volume   = rect.volume();
+    fill(out, fill_value, pitches, rect, dense);
+  }
+
+  void operator()(AccessorWO<VAL, DIM> out,
+                  const Pitches<DIM - 1>& pitches,
+                  const Rect<DIM>& rect,
+                  bool dense) const
+  {
+    VAL fill_value = VAL(0);
+    fill(out, fill_value, pitches, rect, dense);
+  }
+
+  void fill(AccessorWO<VAL, DIM> out,
+            VAL& fill_value,
+            const Pitches<DIM - 1>& pitches,
+            const Rect<DIM>& rect,
+            bool dense) const
+  {
+    size_t volume = rect.volume();
     if (dense) {
       auto outptr = out.ptr(rect);
 #pragma omp parallel for schedule(static)
@@ -44,8 +62,8 @@ struct FillImplBody<VariantKind::OMP, VAL, DIM> {
     } else {
 #pragma omp parallel for schedule(static)
       for (size_t idx = 0; idx < volume; ++idx) {
-        auto p = pitches.unflatten(idx, rect.lo);
-        out[p] = fill_value;
+        const auto point = pitches.unflatten(idx, rect.lo);
+        out[point]       = fill_value;
       }
     }
   }
diff --git a/src/cunumeric/nullary/fill_template.inl b/src/cunumeric/nullary/fill_template.inl
index aac10a61d..003e5d12d 100644
--- a/src/cunumeric/nullary/fill_template.inl
+++ b/src/cunumeric/nullary/fill_template.inl
@@ -42,9 +42,29 @@ struct FillImpl {
 
     if (volume == 0) return;
 
-    auto out        = args.out.write_accessor<VAL, DIM>(rect);
+    auto out = args.out.write_accessor<VAL, DIM>(rect);
+#ifndef LEGION_BOUNDS_CHECKS
+    // Check to see if this is dense or not
+    bool dense = out.accessor.is_dense_row_major(rect);
+#else
+    // No dense execution if we're doing bounds checks
+    bool dense = false;
+#endif
+
     auto fill_value = args.fill_value.read_accessor<VAL, 1>();
+    FillImplBody<KIND, VAL, DIM>{}(out, fill_value, pitches, rect, dense);
+  }
+  template <typename VAL, int DIM>
+  void fill_zero(FillArgs& args) const
+  {
+    auto rect = args.out.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(rect);
 
+    if (volume == 0) return;
+
+    auto out = args.out.write_accessor<VAL, DIM>(rect);
 #ifndef LEGION_BOUNDS_CHECKS
     // Check to see if this is dense or not
     bool dense = out.accessor.is_dense_row_major(rect);
@@ -52,7 +72,8 @@ struct FillImpl {
     // No dense execution if we're doing bounds checks
     bool dense = false;
 #endif
-    FillImplBody<KIND, VAL, DIM>{}(out, fill_value, pitches, rect, dense);
+
+    FillImplBody<KIND, VAL, DIM>{}(out, pitches, rect, dense);
   }
 
   template <LegateTypeCode CODE, int DIM>
@@ -60,10 +81,14 @@ struct FillImpl {
   {
     if (args.is_argval) {
       using VAL = Argval<legate_type_of<CODE>>;
+      assert(args.is_zero == false);
       fill<VAL, DIM>(args);
     } else {
       using VAL = legate_type_of<CODE>;
-      fill<VAL, DIM>(args);
+      if (args.is_zero)
+        fill_zero<VAL, DIM>(args);
+      else
+        fill<VAL, DIM>(args);
     }
   }
 };
@@ -71,8 +96,15 @@ struct FillImpl {
 template <VariantKind KIND>
 static void fill_template(TaskContext& context)
 {
-  FillArgs args{context.outputs()[0], context.inputs()[0], context.scalars()[0].value<bool>()};
-  double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
+  bool is_zero = context.scalars()[1].value<bool>();
+  if (is_zero) {
+    FillArgs args{context.outputs()[0], Array(), context.scalars()[0].value<bool>(), true};
+    double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
+  } else {
+    FillArgs args{
+      context.outputs()[0], context.inputs()[0], context.scalars()[0].value<bool>(), false};
+    double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
+  }
 }
 
 }  // namespace cunumeric
diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py
index fc6895ef5..e2925ccff 100644
--- a/tests/integration/test_advanced_indexing.py
+++ b/tests/integration/test_advanced_indexing.py
@@ -116,7 +116,7 @@ def mk_deferred_array(lib, shape):
     good_shape = tuple(max(1, dim) for dim in shape)
     # for shape (2,0,3,4): key = [:,[False],:,:]
     key = tuple([False] if dim == 0 else slice(None) for dim in shape)
-    print("IRINA DEBUG ", key, good_shape)
+    print("IRINA DEBUG good_shape , key", good_shape, key)
     return lib.ones(good_shape)[key]
 
 
@@ -128,7 +128,6 @@ def test_zero_size():
                     0 if dim == zero_dim else 3 for dim in range(arr_ndim)
                 )
                 np_arr = mk_deferred_array(np, arr_shape)
-                print("IRINA DEBUG shape numpy", np_arr.shape)
                 num_arr = mk_deferred_array(num, arr_shape)
                 idx_shape = arr_shape[:idx_ndim]
                 val_shape = (
@@ -138,7 +137,15 @@ def test_zero_size():
                 )
                 np_idx = np.ones(idx_shape, dtype=np.bool_)
                 num_idx = num.ones(idx_shape, dtype=np.bool_)
+                print(
+                    "IRINA DEBUG shapes ",
+                    np_arr[np_idx],
+                    np_arr[np_idx].shape,
+                    num_arr[num_idx],
+                    num_arr[num_idx].shape,
+                )
                 assert np.array_equal(np_arr[np_idx], num_arr[num_idx])
+
                 np_val = np.random.random(val_shape)
                 num_val = num.array(np_val)
                 np_arr[np_idx] = np_val

From 0130713985f743df798ecdf61613e71914ac6e9e Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Thu, 18 Aug 2022 15:30:33 -0600
Subject: [PATCH 08/14] fixing compile-time error

---
 src/cunumeric/index/advanced_indexing_template.inl | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index a632d25bc..bc69636fe 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -51,14 +51,7 @@ struct AdvancedIndexingImpl {
 #endif
 
     if (volume == 0) {
-      auto extents = Point<DIM>::ZEROES();
-      if (args.is_set) {
-        auto empty = create_buffer<Point<DIM>>(0);
-        args.output.return_data(empty, extents);
-      } else {
-        auto empty = create_buffer<VAL>(0);
-        args.output.return_data(empty, extents);
-      }
+      args.output.make_empty();
       return;
     }
 

From 99b57291d535e5910bce87c98e46b55b76838694 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Thu, 18 Aug 2022 21:11:29 -0600
Subject: [PATCH 09/14] removing unnecessary _fill_with_zero

---
 cunumeric/deferred.py                   | 11 +-----
 src/cunumeric/nullary/fill.cc           | 20 +----------
 src/cunumeric/nullary/fill.cu           | 47 +++----------------------
 src/cunumeric/nullary/fill.h            |  1 -
 src/cunumeric/nullary/fill_omp.cc       | 24 ++-----------
 src/cunumeric/nullary/fill_template.inl | 42 +++-------------------
 6 files changed, 15 insertions(+), 130 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 3626c4ae3..7d475755e 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -482,14 +482,6 @@ def _copy_store(self, store: Any) -> NumPyThunk:
         store_copy.copy(store_to_copy, deep=True)
         return store_copy
 
-    def _fill_with_zero(self):
-        task = self.context.create_auto_task(CuNumericOpCode.FILL)
-        task.add_output(self.base)
-        task.add_scalar_arg(False, bool)
-        task.add_scalar_arg(True, bool)  # fill with 0
-
-        task.execute()
-
     def _create_indexing_array(
         self, key: Any, is_set: bool = False
     ) -> tuple[bool, Any, Any, Any]:
@@ -533,8 +525,7 @@ def _create_indexing_array(
                         inputs=[rhs],
                     ),
                 )
-
-                out._fill_with_zero()
+                out.fill(np.zeros((), dtype=out.dtype))
                 return False, rhs, out, self
 
             key_store = key.base
diff --git a/src/cunumeric/nullary/fill.cc b/src/cunumeric/nullary/fill.cc
index 7dfde984c..785028f63 100644
--- a/src/cunumeric/nullary/fill.cc
+++ b/src/cunumeric/nullary/fill.cc
@@ -31,25 +31,7 @@ struct FillImplBody<VariantKind::CPU, VAL, DIM> {
                   bool dense) const
   {
     auto fill_value = in[0];
-    fill(out, fill_value, pitches, rect, dense);
-  }
-
-  void operator()(AccessorWO<VAL, DIM> out,
-                  const Pitches<DIM - 1>& pitches,
-                  const Rect<DIM>& rect,
-                  bool dense) const
-  {
-    VAL fill_value = VAL(0);
-    fill(out, fill_value, pitches, rect, dense);
-  }
-
-  void fill(AccessorWO<VAL, DIM> out,
-            VAL& fill_value,
-            const Pitches<DIM - 1>& pitches,
-            const Rect<DIM>& rect,
-            bool dense) const
-  {
-    size_t volume = rect.volume();
+    size_t volume   = rect.volume();
     if (dense) {
       auto outptr = out.ptr(rect);
       for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = fill_value;
diff --git a/src/cunumeric/nullary/fill.cu b/src/cunumeric/nullary/fill.cu
index 7126708f3..bdbba46d6 100644
--- a/src/cunumeric/nullary/fill.cu
+++ b/src/cunumeric/nullary/fill.cu
@@ -23,42 +23,23 @@ namespace cunumeric {
 
 using namespace Legion;
 
-template <typename ARG, typename VAL>
+template <typename ARG, typename ReadAcc>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  dense_kernel(size_t volume, ARG* out, AccessorRO<VAL, 1> fill_value)
+  dense_kernel(size_t volume, ARG* out, ReadAcc fill_value)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
   out[idx] = fill_value[0];
 }
 
-template <typename WriteAcc, typename VAL, typename Pitches, typename Rect>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) generic_kernel(
-  size_t volume, WriteAcc out, AccessorRO<VAL, 1> fill_value, Pitches pitches, Rect rect)
-{
-  const size_t idx = global_tid_1d();
-  if (idx >= volume) return;
-  auto point = pitches.unflatten(idx, rect.lo);
-  out[point] = fill_value[0];
-}
-
-template <typename VAL>
+template <typename WriteAcc, typename ReadAcc, typename Pitches, typename Rect>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  dense_kernel(size_t volume, VAL* out)
-{
-  const size_t idx = global_tid_1d();
-  if (idx >= volume) return;
-  out[idx] = VAL(0);
-}
-
-template <typename VAL, int32_t DIM, typename Pitches, typename Rect>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  generic_kernel(size_t volume, AccessorWO<VAL, DIM> out, Pitches pitches, Rect rect)
+  generic_kernel(size_t volume, WriteAcc out, ReadAcc fill_value, Pitches pitches, Rect rect)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
   auto point = pitches.unflatten(idx, rect.lo);
-  out[point] = VAL(0);
+  out[point] = fill_value[0];
 }
 
 template <typename VAL, int32_t DIM>
@@ -80,24 +61,6 @@ struct FillImplBody<VariantKind::GPU, VAL, DIM> {
     }
     CHECK_CUDA_STREAM(stream);
   }
-
-  // the case when we fill with 0
-  void operator()(AccessorWO<VAL, DIM> out,
-                  const Pitches<DIM - 1>& pitches,
-                  const Rect<DIM>& rect,
-                  bool dense) const
-  {
-    size_t volume       = rect.volume();
-    const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    auto stream         = get_cached_stream();
-    if (dense) {
-      auto outptr = out.ptr(rect);
-      dense_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(volume, outptr);
-    } else {
-      generic_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(volume, out, pitches, rect);
-    }
-    CHECK_CUDA_STREAM(stream);
-  }
 };
 
 /*static*/ void FillTask::gpu_variant(TaskContext& context)
diff --git a/src/cunumeric/nullary/fill.h b/src/cunumeric/nullary/fill.h
index caa0f280d..0330edfd4 100644
--- a/src/cunumeric/nullary/fill.h
+++ b/src/cunumeric/nullary/fill.h
@@ -24,7 +24,6 @@ struct FillArgs {
   const Array& out;
   const Array& fill_value;
   bool is_argval;
-  bool is_zero;
 };
 
 class FillTask : public CuNumericTask<FillTask> {
diff --git a/src/cunumeric/nullary/fill_omp.cc b/src/cunumeric/nullary/fill_omp.cc
index fc079c89a..6e206f01c 100644
--- a/src/cunumeric/nullary/fill_omp.cc
+++ b/src/cunumeric/nullary/fill_omp.cc
@@ -36,25 +36,7 @@ struct FillImplBody<VariantKind::OMP, VAL, DIM> {
                   bool dense) const
   {
     auto fill_value = in[0];
-    fill(out, fill_value, pitches, rect, dense);
-  }
-
-  void operator()(AccessorWO<VAL, DIM> out,
-                  const Pitches<DIM - 1>& pitches,
-                  const Rect<DIM>& rect,
-                  bool dense) const
-  {
-    VAL fill_value = VAL(0);
-    fill(out, fill_value, pitches, rect, dense);
-  }
-
-  void fill(AccessorWO<VAL, DIM> out,
-            VAL& fill_value,
-            const Pitches<DIM - 1>& pitches,
-            const Rect<DIM>& rect,
-            bool dense) const
-  {
-    size_t volume = rect.volume();
+    size_t volume   = rect.volume();
     if (dense) {
       auto outptr = out.ptr(rect);
 #pragma omp parallel for schedule(static)
@@ -62,8 +44,8 @@ struct FillImplBody<VariantKind::OMP, VAL, DIM> {
     } else {
 #pragma omp parallel for schedule(static)
       for (size_t idx = 0; idx < volume; ++idx) {
-        const auto point = pitches.unflatten(idx, rect.lo);
-        out[point]       = fill_value;
+        auto p = pitches.unflatten(idx, rect.lo);
+        out[p] = fill_value;
       }
     }
   }
diff --git a/src/cunumeric/nullary/fill_template.inl b/src/cunumeric/nullary/fill_template.inl
index 003e5d12d..aac10a61d 100644
--- a/src/cunumeric/nullary/fill_template.inl
+++ b/src/cunumeric/nullary/fill_template.inl
@@ -42,29 +42,9 @@ struct FillImpl {
 
     if (volume == 0) return;
 
-    auto out = args.out.write_accessor<VAL, DIM>(rect);
-#ifndef LEGION_BOUNDS_CHECKS
-    // Check to see if this is dense or not
-    bool dense = out.accessor.is_dense_row_major(rect);
-#else
-    // No dense execution if we're doing bounds checks
-    bool dense = false;
-#endif
-
+    auto out        = args.out.write_accessor<VAL, DIM>(rect);
     auto fill_value = args.fill_value.read_accessor<VAL, 1>();
-    FillImplBody<KIND, VAL, DIM>{}(out, fill_value, pitches, rect, dense);
-  }
-  template <typename VAL, int DIM>
-  void fill_zero(FillArgs& args) const
-  {
-    auto rect = args.out.shape<DIM>();
-
-    Pitches<DIM - 1> pitches;
-    size_t volume = pitches.flatten(rect);
 
-    if (volume == 0) return;
-
-    auto out = args.out.write_accessor<VAL, DIM>(rect);
 #ifndef LEGION_BOUNDS_CHECKS
     // Check to see if this is dense or not
     bool dense = out.accessor.is_dense_row_major(rect);
@@ -72,8 +52,7 @@ struct FillImpl {
     // No dense execution if we're doing bounds checks
     bool dense = false;
 #endif
-
-    FillImplBody<KIND, VAL, DIM>{}(out, pitches, rect, dense);
+    FillImplBody<KIND, VAL, DIM>{}(out, fill_value, pitches, rect, dense);
   }
 
   template <LegateTypeCode CODE, int DIM>
@@ -81,14 +60,10 @@ struct FillImpl {
   {
     if (args.is_argval) {
       using VAL = Argval<legate_type_of<CODE>>;
-      assert(args.is_zero == false);
       fill<VAL, DIM>(args);
     } else {
       using VAL = legate_type_of<CODE>;
-      if (args.is_zero)
-        fill_zero<VAL, DIM>(args);
-      else
-        fill<VAL, DIM>(args);
+      fill<VAL, DIM>(args);
     }
   }
 };
@@ -96,15 +71,8 @@ struct FillImpl {
 template <VariantKind KIND>
 static void fill_template(TaskContext& context)
 {
-  bool is_zero = context.scalars()[1].value<bool>();
-  if (is_zero) {
-    FillArgs args{context.outputs()[0], Array(), context.scalars()[0].value<bool>(), true};
-    double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
-  } else {
-    FillArgs args{
-      context.outputs()[0], context.inputs()[0], context.scalars()[0].value<bool>(), false};
-    double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
-  }
+  FillArgs args{context.outputs()[0], context.inputs()[0], context.scalars()[0].value<bool>()};
+  double_dispatch(args.out.dim(), args.out.code(), FillImpl<KIND>{}, args);
 }
 
 }  // namespace cunumeric

From 1708a4b3dadf10e68733bee8e08dbe6b0d18d736 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 19 Aug 2022 09:13:22 -0600
Subject: [PATCH 10/14] addressing some of the PR comments

---
 cunumeric/deferred.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 7d475755e..bf7564745 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -657,12 +657,8 @@ def _create_indexing_array(
                     # in case of the mixed indises we all nonzero
                     # for the bool array
                     k = k.nonzero()
-                    if k[0].size == 0:
-                        store = store.project(dim + shift, 0)
-                        store = store.promote(dim + shift, 0)
-                    else:
-                        shift += len(k) - 1
-                        tuple_of_arrays += k
+                    shift += len(k) - 1
+                    tuple_of_arrays += k
                 else:
                     tuple_of_arrays += (k,)
             else:
@@ -1263,7 +1259,6 @@ def _fill(self, value: Any) -> None:
             task.add_output(self.base)
             task.add_input(value)
             task.add_scalar_arg(argval, bool)
-            task.add_scalar_arg(False, bool)  # fill with zero
 
             task.execute()
 

From a515c7d716b6422dab0525c2375a0d14b5722f05 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 19 Aug 2022 09:48:14 -0600
Subject: [PATCH 11/14] adding test for reshape with empty size and fixing some
 logic

---
 cunumeric/array.py                | 27 +++++++++++--------
 tests/integration/test_reshape.py | 44 +++++++++++++++++++++++++++----
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 7ca3766af..cb1221e9b 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -51,6 +51,8 @@
 if TYPE_CHECKING:
     from .types import NdShapeLike
 
+from math import prod
+
 FALLBACK_WARNING = (
     "cuNumeric has not fully implemented {name} "
     + "and is falling back to canonical numpy. "
@@ -3019,17 +3021,6 @@ def ravel(self, order="C") -> ndarray:
         Multiple GPUs, Multiple CPUs
 
         """
-        if self.size == 0 and self.ndim > 1:
-            idx = tuple()
-            for i in range(self.ndim):
-                if self.shape[i] != 0:
-                    idx += (0,)
-                else:
-                    idx += (slice(None),)
-            idx = tuple(idx)
-            self = self[idx].copy()
-            return self
-
         return self.reshape(-1, order=order)
 
     def reshape(self, *args, order="C") -> ndarray:
@@ -3056,6 +3047,20 @@ def reshape(self, *args, order="C") -> ndarray:
         else:
             shape = args
 
+        if self.size == 0 and self.ndim > 1:
+            if shape == (-1,):
+                shape = (0,)
+            new_size = prod(shape)
+            if new_size > 0:
+                raise ValueError("new shape has bigger size than original")
+            result = ndarray(
+                shape=shape,
+                dtype=self.dtype,
+                inputs=(self,),
+            )
+            result.fill(0)
+            return result
+
         computed_shape = tuple(operator.index(extent) for extent in shape)
 
         num_unknowns = sum(extent < 0 for extent in computed_shape)
diff --git a/tests/integration/test_reshape.py b/tests/integration/test_reshape.py
index edfb2f327..1e4c7f8c5 100644
--- a/tests/integration/test_reshape.py
+++ b/tests/integration/test_reshape.py
@@ -59,14 +59,48 @@ def test_ravel(self):
             np.ravel(self.anp),
         )
 
-        a = num.full((1, 0), 1, dtype=int)
-        anp = np.full((1, 0), 1, dtype=int)
-        assert np.array_equal(num.ravel(a), np.ravel(anp))
+        i = num.array(
+            [
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+            ]
+        )
+        inp = np.array(
+            [
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+            ]
+        )
+        b = a[i, :]
+        bnp = self.anp[inp, :]
+        assert np.array_equal(b.ravel(), bnp.ravel())
 
-        a = num.full((0, 1), 1, dtype=int)
-        anp = np.full((0, 1), 1, dtype=int)
+        assert np.array_equal(b.reshape((0,)), bnp.reshape((0,)))
+
+        a = num.full((3, 0), 1, dtype=int)
+        anp = np.full((3, 0), 1, dtype=int)
         assert np.array_equal(num.ravel(a), np.ravel(anp))
 
+        a = num.full((0, 3), 1, dtype=int)
+        anp = np.full((0, 3), 1, dtype=int)
+        assert np.array_equal(a.ravel(), anp.ravel())
+
 
 RECT_CASES = [
     (10, 2, 10),

From 36dc812ec4dfadbf4aad98f3835e1e52b66753d1 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 19 Aug 2022 10:57:38 -0600
Subject: [PATCH 12/14] improving the advanced_indexing test

---
 tests/integration/test_advanced_indexing.py | 55 ++++++++++-----------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py
index e2925ccff..42fabf5e7 100644
--- a/tests/integration/test_advanced_indexing.py
+++ b/tests/integration/test_advanced_indexing.py
@@ -116,41 +116,38 @@ def mk_deferred_array(lib, shape):
     good_shape = tuple(max(1, dim) for dim in shape)
     # for shape (2,0,3,4): key = [:,[False],:,:]
     key = tuple([False] if dim == 0 else slice(None) for dim in shape)
-    print("IRINA DEBUG good_shape , key", good_shape, key)
     return lib.ones(good_shape)[key]
 
 
-def test_zero_size():
+def gen_args():
+    result = []
     for arr_ndim in range(1, LEGATE_MAX_DIM + 1):
         for idx_ndim in range(1, arr_ndim + 1):
             for zero_dim in range(arr_ndim):
-                arr_shape = tuple(
-                    0 if dim == zero_dim else 3 for dim in range(arr_ndim)
-                )
-                np_arr = mk_deferred_array(np, arr_shape)
-                num_arr = mk_deferred_array(num, arr_shape)
-                idx_shape = arr_shape[:idx_ndim]
-                val_shape = (
-                    arr_shape
-                    if idx_ndim == 1
-                    else (np.prod(idx_shape),) + arr_shape[idx_ndim:]
-                )
-                np_idx = np.ones(idx_shape, dtype=np.bool_)
-                num_idx = num.ones(idx_shape, dtype=np.bool_)
-                print(
-                    "IRINA DEBUG shapes ",
-                    np_arr[np_idx],
-                    np_arr[np_idx].shape,
-                    num_arr[num_idx],
-                    num_arr[num_idx].shape,
-                )
-                assert np.array_equal(np_arr[np_idx], num_arr[num_idx])
-
-                np_val = np.random.random(val_shape)
-                num_val = num.array(np_val)
-                np_arr[np_idx] = np_val
-                num_arr[num_idx] = num_val
-                assert np.array_equal(np_arr, num_arr)
+                result += [[arr_ndim, idx_ndim, zero_dim]]
+    return result
+
+
+@pytest.mark.parametrize("arr_ndim,idx_ndim,zero_dim", gen_args())
+def test_zero_size(arr_ndim, idx_ndim, zero_dim):
+    arr_shape = tuple(0 if dim == zero_dim else 3 for dim in range(arr_ndim))
+    np_arr = mk_deferred_array(np, arr_shape)
+    num_arr = mk_deferred_array(num, arr_shape)
+    idx_shape = arr_shape[:idx_ndim]
+    val_shape = (
+        arr_shape
+        if idx_ndim == 1
+        else (np.prod(idx_shape),) + arr_shape[idx_ndim:]
+    )
+    np_idx = np.ones(idx_shape, dtype=np.bool_)
+    num_idx = num.ones(idx_shape, dtype=np.bool_)
+    assert np.array_equal(np_arr[np_idx], num_arr[num_idx])
+
+    np_val = np.random.random(val_shape)
+    num_val = num.array(np_val)
+    np_arr[np_idx] = np_val
+    num_arr[num_idx] = num_val
+    assert np.array_equal(np_arr, num_arr)
 
 
 def test_empty_bool():

From 9367f2808fa28e1b4f35eff2507987373559b298 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 19 Aug 2022 11:02:03 -0600
Subject: [PATCH 13/14] removing unused case

---
 cunumeric/deferred.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index bf7564745..fc72a850a 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -677,8 +677,6 @@ def _create_indexing_array(
             # the store with transformation
             rhs = cast(DeferredArray, self._copy_store(store))
 
-        if len(tuple_of_arrays) == 0:
-            return False, rhs, rhs, self
         if len(tuple_of_arrays) <= rhs.ndim:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
             return True, rhs, output_arr, self

From a408152840f46d1e6fa30a624bd2a65a16739e44 Mon Sep 17 00:00:00 2001
From: Irina Demeshko <idemeshko@nvidia.com>
Date: Fri, 19 Aug 2022 11:03:41 -0600
Subject: [PATCH 14/14] improving the advanced_indexing test

---
 tests/integration/test_advanced_indexing.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py
index 42fabf5e7..33d3b0a78 100644
--- a/tests/integration/test_advanced_indexing.py
+++ b/tests/integration/test_advanced_indexing.py
@@ -120,12 +120,10 @@ def mk_deferred_array(lib, shape):
 
 
 def gen_args():
-    result = []
     for arr_ndim in range(1, LEGATE_MAX_DIM + 1):
         for idx_ndim in range(1, arr_ndim + 1):
             for zero_dim in range(arr_ndim):
-                result += [[arr_ndim, idx_ndim, zero_dim]]
-    return result
+                yield arr_ndim, idx_ndim, zero_dim
 
 
 @pytest.mark.parametrize("arr_ndim,idx_ndim,zero_dim", gen_args())