From 867f7a318379d0b20bb3cf6ed4c409a3cad83aba Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 17 Feb 2022 16:10:37 -0700
Subject: [PATCH 01/33] adding support for Advanced Indexing case when several
 index arrays are passed

---
 cunumeric/config.py                  |   1 +
 cunumeric/deferred.py                | 136 ++++++++++++++++++++++-----
 cunumeric/runtime.py                 |   9 ++
 src/cunumeric.mk                     |   7 +-
 src/cunumeric/cunumeric_c.h          |   1 +
 src/cunumeric/index/zip.cc           |  64 +++++++++++++
 src/cunumeric/index/zip.cu           |  95 +++++++++++++++++++
 src/cunumeric/index/zip.h            |  42 +++++++++
 src/cunumeric/index/zip_omp.cc       |  59 ++++++++++++
 src/cunumeric/index/zip_template.inl |  71 ++++++++++++++
 tests/index_routines.py              |  43 +++++++++
 11 files changed, 502 insertions(+), 26 deletions(-)
 create mode 100644 src/cunumeric/index/zip.cc
 create mode 100644 src/cunumeric/index/zip.cu
 create mode 100644 src/cunumeric/index/zip.h
 create mode 100644 src/cunumeric/index/zip_omp.cc
 create mode 100644 src/cunumeric/index/zip_template.inl

diff --git a/cunumeric/config.py b/cunumeric/config.py
index 6c7303f56..e1462fbd3 100644
--- a/cunumeric/config.py
+++ b/cunumeric/config.py
@@ -114,6 +114,7 @@ class CuNumericOpCode(IntEnum):
     UNLOAD_CUDALIBS = _cunumeric.CUNUMERIC_UNLOAD_CUDALIBS
     WHERE = _cunumeric.CUNUMERIC_WHERE
     WRITE = _cunumeric.CUNUMERIC_WRITE
+    ZIP = _cunumeric.CUNUMERIC_ZIP
 
 
 # Match these to CuNumericUnaryOpCode in cunumeric_c.h
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index cc8779692..bc9927747 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -305,37 +305,129 @@ def get_scalar_array(self):
         result = np.frombuffer(buf, dtype=self.dtype, count=1)
         return result.reshape(())
 
+    def _zip_indices(self, arrays):
+        if not isinstance(arrays, tuple):
+            raise TypeError("zip_indices expects a tuple of arrays")
+        arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays)
+        # all arrays should have the same shape and type
+        shape = arrays[0].shape
+        data_type = arrays[0].dtype
+        if not np.issubdtype(data_type, np.integer):
+            raise TypeError("a array should be integer type")
+        for a in arrays:
+            if a.shape != shape:
+                raise TypeError(
+                    "shape of all index arrrays should be the same"
+                )
+            if data_type != a.dtype:
+                raise TypeError("type of all index arrrays should be the same")
+        # create output array which will store Point field where
+        # N is number of index arrays
+        # shape of the output array should be the same as the shape of each
+        # index array
+        # NOTE: We need to instantiate a RegionField of non-primitive
+        # dtype, to store N-dimensional index points, to be used as the
+        # indirection field in a copy.
+        # Such dtypes are technically not supported,
+        # but it should be safe to directly create a DeferredArray
+        # of that dtype, so long as we don't try to convert it to a
+        # NumPy array.
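+        # For intuition, the ZIP task constructed below matches this NumPy
+        # sketch (illustrative only; np.stack is not what actually runs):
+        #     out = np.stack(index_arrays, axis=-1)
+        # so out[i, j] holds the N-d coordinate
+        # (arrays[0][i, j], ..., arrays[N-1][i, j]) that later serves as
+        # the indirection point in a gather copy.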
+ N = len(arrays) + pointN_dtype = self.runtime.add_point_type(N) + store = self.context.create_store( + pointN_dtype, shape=shape, optimize_scalar=True + ) + output_arr = DeferredArray( + self.runtime, base=store, dtype=pointN_dtype + ) + # call ZIP function to combine index arrays into a singe array + task = self.context.create_task(CuNumericOpCode.ZIP) + task.add_output(output_arr.base) + for index_arr in arrays: + task.add_input(index_arr.base) + task.add_alignment(output_arr.base, index_arr.base) + task.execute() + + return output_arr + def _create_indexing_array(self, key): # Convert everything into deferred arrays of int64 + store = self.base + shift = 0 if isinstance(key, tuple): tuple_of_arrays = () - for k in key: - if not isinstance(k, NumPyThunk): - raise NotImplementedError( - "need support for mixed advanced indexing" + # for k in key: + for dim, k in enumerate(key): + if np.isscalar(k): + if k < 0: + k += store.shape[dim + shift] + store = store.project(dim + shift, k) + shift -= 1 + elif isinstance(k, slice): + store = store.slice(dim + shift, k) + elif isinstance(k, NumPyThunk): + if k.dtype == np.bool: + k = k.nonzero() + else: + raise TypeError( + "Unsupported entry type passed to advanced", + "indexing operation", ) - tuple_of_arrays += (k,) + tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: assert isinstance(key, NumPyThunk) # Handle the boolean array case - if key.dtype == bool: + if key.dtype == np.bool: + # irina fixme if key.ndim != self.ndim: raise TypeError( "Boolean advanced indexing dimension mismatch" ) - # For boolean arrays do the non-zero operation to make - # them into a normal indexing array + # IRINA fixme: replace `nonzero` case with the task with + # output regions tuple_of_arrays = key.nonzero() else: - tuple_of_arrays = (key,) - if len(tuple_of_arrays) != self.ndim: + tuple_of_arrays = (self.runtime.to_deferred_array(key),) + + if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if self.ndim > 1: - # Check that all the arrays can be broadcast together - # Concatenate all the arrays into a single array - raise NotImplementedError("need support for concatenating arrays") + + if len(tuple_of_arrays) > 1: + # shape = tuple_of_arrays[0].shape + # for i in range(1, len(tuple_of_arrays)): + # if shape != tuple_of_arrays[i].shape: + # raise ValueError("index arrays should be the same shape") + + # create output array which will store Point field where + # N is number of index arrays + # shape of the output array should be the same as the shape of each + # index array + # NOTE: We need to instantiate a RegionField of non-primitive + # dtype, to store N-dimensional index points, to be used as the + # indirection field in a copy. + # Such dtypes are technically not supported, + # but it should be safe to directly create a DeferredArray + # of that dtype, so long as we don't try to convert it to a + # NumPy array. 
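+            # The indirect copy that later consumes this point array behaves
+            # like the following NumPy gather (a sketch; `src` and `pts` are
+            # illustrative names, not variables in this code):
+            #     pts = np.stack(tuple_of_arrays, axis=-1)
+            #     result = src[tuple(np.moveaxis(pts, -1, 0))]
+            # i.e. result[i, j] = src[tuple(pts[i, j])].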
+ # out_dtype = np.dtype((np.int64, (len(tuple_of_arrays),))) + # output_arr = DeferredArray( + # self.runtime, + # base=tuple_of_arrays[0].base, + # dtype=out_dtype, + # ) + + # # call ZIP function to combine index arrays into a singe array + # task = self.context.create_task(CuNumericOpCode.ZIP) + # task.add_output(output_arr.base) + # for index_arr in tuple_of_arrays: + # task.add_input(index_arr.base) + # task.add_alignment(index_arr.base, output_arr.base) + # task.execute() + + output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) + return store, output_arr else: - return self.runtime.to_deferred_array(tuple_of_arrays[0]) + return store, tuple_of_arrays[0] @staticmethod def _unpack_ellipsis(key, ndim): @@ -397,28 +489,24 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - index_array = self._create_indexing_array(key) + store, index_array = self._create_indexing_array(key) + # Create a new array to be the result result = self.runtime.create_empty_thunk( index_array.base.shape, self.dtype, inputs=[self], ) - - if self.ndim != index_array.ndim: - raise NotImplementedError( - "need support for indirect partitioning" - ) - copy = self.context.create_copy() - copy.add_input(self.base) + copy.add_input(store) copy.add_source_indirect(index_array.base) copy.add_output(result.base) - copy.add_alignment(index_array.base, result.base) + # copy.add_alignment(index_array.base, result.base) copy.execute() + else: result = self._get_view(key) diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py index a4c224fe7..a6d840356 100644 --- a/cunumeric/runtime.py +++ b/cunumeric/runtime.py @@ -175,6 +175,15 @@ def get_arg_dtype(self, value_dtype): dtype.register_reduction_op(redop, redop_id) return arg_dtype + def add_point_type(self, n): + type_system = self.legate_context.type_system + point_type = "point" + str(n) + if point_type not in type_system: + code = type_system[ty.int64].code + size_in_bytes = 8 * n + type_system.add_type(point_type, size_in_bytes, code) + return point_type + def _report_coverage(self): total = len(self.api_calls) implemented = sum(int(impl) for (_, _, impl) in self.api_calls) diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 1adecd008..9778d5dd1 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -27,6 +27,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/nullary/fill.cc \ cunumeric/index/choose.cc \ cunumeric/index/repeat.cc \ + cunumeric/index/zip.cc \ cunumeric/item/read.cc \ cunumeric/item/write.cc \ cunumeric/matrix/contract.cc \ @@ -66,6 +67,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/nullary/fill_omp.cc \ cunumeric/index/choose_omp.cc \ cunumeric/index/repeat_omp.cc \ + cunumeric/index/zip_omp.cc \ cunumeric/matrix/contract_omp.cc \ cunumeric/matrix/diag_omp.cc \ cunumeric/matrix/gemm_omp.cc \ @@ -102,10 +104,11 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/nullary/arange.cu \ cunumeric/nullary/eye.cu \ cunumeric/nullary/fill.cu \ + cunumeric/index/choose.cu \ + cunumeric/index/repeat.cu \ + cunumeric/index/zip.cu \ cunumeric/item/read.cu \ cunumeric/item/write.cu \ - cunumeric/index/choose.cu \ - cunumeric/index/repeat.cu \ cunumeric/matrix/contract.cu \ cunumeric/matrix/diag.cu \ cunumeric/matrix/gemm.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 1c4c5d84c..68f4f56fd 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -59,6 +59,7 @@ enum CuNumericOpCode { 
CUNUMERIC_UNLOAD_CUDALIBS, CUNUMERIC_WHERE, CUNUMERIC_WRITE, + CUNUMERIC_ZIP, }; // Match these to UnaryOpCode in config.py diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc new file mode 100644 index 000000000..364d7e973 --- /dev/null +++ b/src/cunumeric/index/zip.cc @@ -0,0 +1,64 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + // std::cout<<"IRINA DEBUG dense out = "<(index_arrays[Is][p]...); + // std::cout<<"IRINA DEBUG out = "<(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { ZipTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu new file mode 100644 index 000000000..1a68e62b1 --- /dev/null +++ b/src/cunumeric/index/zip.cu @@ -0,0 +1,95 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
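+ *
+ * For reference, the CPU variant in zip.cc above writes, in the dense
+ * (row-major) case, for each flat index idx,
+ *     outptr[idx] = Point(index_arrays[0].ptr(rect)[idx], ...,
+ *                         index_arrays[N-1].ptr(rect)[idx]);
+ * in the general case it first unflattens idx into a DIM-dimensional point
+ * via the pitches, with the pack expanded through std::index_sequence.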
+ * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel(const AccessorWO, DIM> out, + const DeferredBuffer, 1> index_arrays, + const Rect rect, + const Pitches pitches, + int volume, + std::index_sequence) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + printf("IRINA DEBUG point = %d %d %d \n", out[p][0], out[p][1], out[p][2]); +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel_dense(Point* out, + const DeferredBuffer index_arrays, + const Rect rect, + int volume, + std::index_sequence) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + out[idx] = Legion::Point(index_arrays[Is][idx]...); + printf("IRINA DEBUG dense point = %d %d %d \n", out[idx][0], out[idx][1], out[idx][2]); + printf("IRINA DEBUG dense index_arr = %d %d %d \n", + index_arrays[0][idx], + index_arrays[1][idx], + index_arrays[2][idx]); +} + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + if (dense) { + DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { + idx_arr[idx] = index_arrays[idx].ptr(rect); + } + zip_kernel_dense<<>>( + out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); + } else { + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + zip_kernel<<>>( + out, idx_arr, rect, pitches, volume, std::make_index_sequence()); + } + } +}; + +/*static*/ void ZipTask::gpu_variant(TaskContext& context) +{ + zip_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h new file mode 100644 index 000000000..ae7476b05 --- /dev/null +++ b/src/cunumeric/index/zip.h @@ -0,0 +1,42 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct ZipArgs { + const Array& out; + const std::vector& inputs; +}; + +class ZipTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_ZIP; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc new file mode 100644 index 000000000..8cfebb32d --- /dev/null +++ b/src/cunumeric/index/zip_omp.cc @@ -0,0 +1,59 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } + } // else + } +}; + +/*static*/ void ZipTask::omp_variant(TaskContext& context) +{ + zip_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl new file mode 100644 index 000000000..bf8d95394 --- /dev/null +++ b/src/cunumeric/index/zip_template.inl @@ -0,0 +1,71 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody; + +template +struct ZipImpl { + template + void operator()(ZipArgs& args) const + { + using VAL = int64_t; + auto out_rect = args.out.shape(); + auto out = args.out.write_accessor, DIM>(out_rect); + auto index_rect = args.inputs[0].shape(); + Pitches pitches; + size_t volume = pitches.flatten(index_rect); + if (volume == 0) return; + + std::cout << "IRINA DEBUG N=" << N << " , D = " << DIM << std::endl; + + std::cout << "IRINA DEBUG idex_rect = " << index_rect << "out_rect = " << out_rect << std::endl; +#ifndef LEGION_BOUNDS_CHECKS + bool dense = out.accessor.is_dense_row_major(out_rect); +#endif + std::vector> index_arrays; + for (int i = 0; i < args.inputs.size(); i++) { +#ifdef CUNUMERIC_DEBUG + assert(index_rect == args.inputs[i].shape()); +#endif + index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); + dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); + } + +#ifdef LEGION_BOUNDS_CHECKS + bool dense = false; +#endif + + ZipImplBody()( + out, index_arrays, index_rect, pitches, dense, std::make_index_sequence()); + } +}; + +template +static void zip_template(TaskContext& context) +{ + ZipArgs args{context.outputs()[0], context.inputs()}; + double_dispatch(args.inputs[0].dim(), args.inputs.size(), ZipImpl{}, args); +} + +} // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 3906643b0..d6e598712 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -24,6 +24,45 @@ from legate.core import LEGATE_MAX_DIM +def advanced_indexing(): + + arr = num.array([1, 2, 3, 4, 5, 6, 7]) + indx = num.array([1, 3, 5]) + res = arr[indx] + z = np.array( + [ + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], + ] + ) + # ind0 = np.array([True, False]) + z_num = num.array(z) + # ind0_num = np.array(ind0) + # res = z_num[-1, :] + # print(res) + # print(z[-1, :]) + + # indx0_num = num.array([0, 0]) + # indx1_num = num.array([1, 1]) + # indx2_num = num.array([2, 2]) + + # indx0_num._thunk._zip_indices( + # (indx0_num._thunk, indx1_num._thunk, indx2_num._thunk,)) + + indx0_num = num.array([[0, 0], [0, 0], [0, 0]]) + indx1_num = num.array([[1, 1], [1, 1], [1, 1]]) + indx2_num = num.array([[2, 2], [2, 2], [2, 2]]) + + # indx0_num._thunk._zip_indices((indx0_num._thunk, + # indx1_num._thunk, indx2_num._thunk,)) + + res = z_num[indx0_num, indx1_num, indx2_num] + print(res) + + # res = z_num[ind0_num, :, indx] + return + + def test(): # -------------------------------------------------------------- # choose operator @@ -192,6 +231,10 @@ def test(): fn = np.diag(en, k=k) assert np.array_equal(f, fn) + advanced_indexing() + + return + if __name__ == "__main__": test() From 235716691e004454f88ca235d7d9449c94e4708c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 10 Mar 2022 20:17:12 -0800 Subject: [PATCH 02/33] adding more tests for advanced indexing --- cunumeric/deferred.py | 46 ++--------- src/cunumeric/index/zip.cu | 6 -- src/cunumeric/index/zip_template.inl | 3 - tests/index_routines.py | 113 +++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 64 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index bc9927747..16a15bcfa 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -340,6 +340,7 @@ def _zip_indices(self, arrays): output_arr = DeferredArray( self.runtime, 
base=store, dtype=pointN_dtype ) + # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) @@ -368,21 +369,21 @@ def _create_indexing_array(self, key): elif isinstance(k, NumPyThunk): if k.dtype == np.bool: k = k.nonzero() + tuple_of_arrays += k + else: + tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: raise TypeError( "Unsupported entry type passed to advanced", "indexing operation", ) - tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: assert isinstance(key, NumPyThunk) + # irina fixme + if key.ndim != self.ndim: + raise TypeError("Advanced indexing dimension mismatch") # Handle the boolean array case if key.dtype == np.bool: - # irina fixme - if key.ndim != self.ndim: - raise TypeError( - "Boolean advanced indexing dimension mismatch" - ) # IRINA fixme: replace `nonzero` case with the task with # output regions tuple_of_arrays = key.nonzero() @@ -393,37 +394,6 @@ def _create_indexing_array(self, key): raise TypeError("Advanced indexing dimension mismatch") if len(tuple_of_arrays) > 1: - # shape = tuple_of_arrays[0].shape - # for i in range(1, len(tuple_of_arrays)): - # if shape != tuple_of_arrays[i].shape: - # raise ValueError("index arrays should be the same shape") - - # create output array which will store Point field where - # N is number of index arrays - # shape of the output array should be the same as the shape of each - # index array - # NOTE: We need to instantiate a RegionField of non-primitive - # dtype, to store N-dimensional index points, to be used as the - # indirection field in a copy. - # Such dtypes are technically not supported, - # but it should be safe to directly create a DeferredArray - # of that dtype, so long as we don't try to convert it to a - # NumPy array. 
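+            # Recap of the mixed-key handling added above (sketch): scalar
+            # keys are folded into `store` via project(), slices via
+            # slice(), and a boolean thunk k is expanded with k.nonzero(),
+            # using the NumPy identity
+            #     a[mask] == a[mask.nonzero()] == a[rows, cols]
+            # so only integer index arrays remain to be zipped.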
- # out_dtype = np.dtype((np.int64, (len(tuple_of_arrays),))) - # output_arr = DeferredArray( - # self.runtime, - # base=tuple_of_arrays[0].base, - # dtype=out_dtype, - # ) - - # # call ZIP function to combine index arrays into a singe array - # task = self.context.create_task(CuNumericOpCode.ZIP) - # task.add_output(output_arr.base) - # for index_arr in tuple_of_arrays: - # task.add_input(index_arr.base) - # task.add_alignment(index_arr.base, output_arr.base) - # task.execute() - output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) return store, output_arr else: @@ -503,8 +473,6 @@ def get_item(self, key): copy.add_source_indirect(index_array.base) copy.add_output(result.base) - # copy.add_alignment(index_array.base, result.base) - copy.execute() else: diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 1a68e62b1..1bd8b6aef 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -35,7 +35,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); out[p] = Legion::Point(index_arrays[Is][p]...); - printf("IRINA DEBUG point = %d %d %d \n", out[p][0], out[p][1], out[p][2]); } template @@ -49,11 +48,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; out[idx] = Legion::Point(index_arrays[Is][idx]...); - printf("IRINA DEBUG dense point = %d %d %d \n", out[idx][0], out[idx][1], out[idx][2]); - printf("IRINA DEBUG dense index_arr = %d %d %d \n", - index_arrays[0][idx], - index_arrays[1][idx], - index_arrays[2][idx]); } template diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index bf8d95394..f16b89474 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,9 +37,6 @@ struct ZipImpl { size_t volume = pitches.flatten(index_rect); if (volume == 0) return; - std::cout << "IRINA DEBUG N=" << N << " , D = " << DIM << std::endl; - - std::cout << "IRINA DEBUG idex_rect = " << index_rect << "out_rect = " << out_rect << std::endl; #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); #endif diff --git a/tests/index_routines.py b/tests/index_routines.py index d6e598712..599d7a045 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -35,31 +35,112 @@ def advanced_indexing(): [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], ] ) - # ind0 = np.array([True, False]) z_num = num.array(z) - # ind0_num = np.array(ind0) - # res = z_num[-1, :] - # print(res) - # print(z[-1, :]) - - # indx0_num = num.array([0, 0]) - # indx1_num = num.array([1, 1]) - # indx2_num = num.array([2, 2]) - # indx0_num._thunk._zip_indices( - # (indx0_num._thunk, indx1_num._thunk, indx2_num._thunk,)) + # simple advance indexing: + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) + + # simple 2D case + # fixme dimension mismatch case + # index_2d = np.array([[ 1, 2, 0], + # [ 5, 5, 5], + # [ 2, 3, 4]]) + # index_2d_num = num.array(index_2d) + # assert np.array_equal(y[index_2d], y_num[index_2d_num]) + + # mismatch dimesion case: + # indx_bool = np.array([True, True]) + # indx_bool_num = num.array(indx_bool) + # res = z[indx_bool] + # res_num = z_num[indx_bool_num] + # print ("bool array as indx np:") + # print(res) + # 
print ("cunumeric:") + # print (res_num) - indx0_num = num.array([[0, 0], [0, 0], [0, 0]]) - indx1_num = num.array([[1, 1], [1, 1], [1, 1]]) - indx2_num = num.array([[2, 2], [2, 2], [2, 2]]) + # test for bool array of the same dimension + indx_bool = np.array( + [ + [ + [False, True, False, False], + [True, True, False, False], + [True, False, True, False], + ], + [ + [False, True, False, False], + [True, True, False, False], + [True, False, True, False], + ], + ] + ) + indx_bool_num = num.array(indx_bool) + res = z[indx_bool] + res_num = z_num[indx_bool_num] + print("bool array as indx np:") + print(res) + print(z[indx_bool.nonzero()]) + print("cunumeric:") + print(res_num) + # fixme unomment when nonzero is fixed + # assert np.array_equal(res, res_num) + + # test mixed data + res = z[-1, :] + res_num = z_num[-1, :] + assert np.array_equal(res, res_num) + + # case when multiple number of arays is send + indx0 = np.array([[0, 1], [1, 0], [0, 0]]) + indx1 = np.array([[0, 1], [2, 0], [1, 2]]) + indx2 = np.array([[3, 2], [1, 0], [3, 2]]) + + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + indx2_num = num.array(indx2) # indx0_num._thunk._zip_indices((indx0_num._thunk, # indx1_num._thunk, indx2_num._thunk,)) res = z_num[indx0_num, indx1_num, indx2_num] - print(res) + res_np = z[indx0, indx1, indx2] + assert np.array_equal(res, res_np) + + # FIXME: Combining Basic and Advanced Indexing Schemes: + # ind0 = np.array([True, True]) + # ind0_num=num.array(ind0) + # res = z[ind0, :, -1] + # res_num = z_num[ind0_num, :, -1] + # print (res) + # fixme error + # print(res_num) + # assert np.array_equal(res, res_num) + + # In-Place & Augmented Assignments via Advanced Indexing + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + indx0 = np.array([0, 2]) + indx1 = np.array([2, 4]) + # x_num = num.array(x) + # indx0_num = num.array(indx0) + # indx1_num = num.array(indx1) + print(x[indx0, indx1]) + # FIXME 0: + # print (x_num[indx0_num,indx1_num]) + # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) + # print (x_num[indx0_num, indx1_num]) + x[indx0, indx1] = 0.0 + print(x) + # x_num[indx0_num, indx1_num] =0.0 - # res = z_num[ind0_num, :, indx] return From abc583ee563ecf8ea66a6755581e77182f4dce37 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 11 Mar 2022 14:27:09 -0800 Subject: [PATCH 03/33] addressing dimension mismatch case --- cunumeric/deferred.py | 49 ++++++++++++------ tests/index_routines.py | 112 ++++++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 60 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 16a15bcfa..3bd950e3b 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -379,9 +379,21 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) - # irina fixme - if key.ndim != self.ndim: - raise TypeError("Advanced indexing dimension mismatch") + if key.ndim < store.ndim: + raise TypeError("Unimplimented") + # FIXME advance indexing task + # diff = store.ndim - key.ndim + # print ("IRINA DEBUG store ndim = " , store) + # for i in range(diff): + # store = store.slice((store.ndim - i - 1), slice(None)) + # print ("IRINA DEBUG store ndim = " , store) + elif key.ndim > store.ndim: + if store.ndim != 1: + raise ValueError("Advance indexing dimention mismatch") + diff = store.ndim - key.ndim + for i in range(diff): + store = store.promote(i + 1, store.shape[0]) + # Handle the boolean array 
case if key.dtype == np.bool: # IRINA fixme: replace `nonzero` case with the task with @@ -460,7 +472,6 @@ def get_item(self, key): if self._is_advanced_indexing(key): # Create the indexing array store, index_array = self._create_indexing_array(key) - # Create a new array to be the result result = self.runtime.create_empty_thunk( index_array.base.shape, @@ -498,23 +509,31 @@ def set_item(self, key, rhs): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - index_array = self._create_indexing_array(key) - if index_array.shape != rhs.shape: - raise ValueError( - "Advanced indexing array does not match source shape" - ) - if self.ndim != index_array.ndim: - raise NotImplementedError( - "need support for indirect partitioning" + store, index_array = self._create_indexing_array(key) + # if index_array.shape != rhs.shape: + # raise ValueError( + # "Advanced indexing array does not match source shape" + # ) + # if self.ndim != index_array.ndim: + # raise NotImplementedError( + # "need support for indirect partitioning" + # ) + if rhs.ndim == 0: + shape = store.shape + val = rhs + rhs = self.runtime.create_empty_thunk( + shape, + self.dtype, + inputs=[self], ) - + rhs.fill(val) copy = self.context.create_copy() - copy.add_input(rhs.base) + copy.add_input(store) copy.add_target_indirect(index_array.base) copy.add_output(self.base) - copy.add_alignment(index_array.base, rhs.base) + # copy.add_alignment(index_array.base, rhs.base) copy.execute() diff --git a/tests/index_routines.py b/tests/index_routines.py index 599d7a045..fcef0aefe 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -25,10 +25,24 @@ def advanced_indexing(): + # simple advance indexing: + print("advance indexing test 1") + x = np.array([1, 2, 3, 4, 5, 6, 7]) + indx = np.array([1, 3, 5]) + res = x[indx] + x_num = num.array(x) + indx_num = num.array(indx) + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # advance indexing test when a.ndim ==1 , indx.ndim >1 + print("advance indexing test 2") + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) - arr = num.array([1, 2, 3, 4, 5, 6, 7]) - indx = num.array([1, 3, 5]) - res = arr[indx] z = np.array( [ [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], @@ -37,32 +51,28 @@ def advanced_indexing(): ) z_num = num.array(z) - # simple advance indexing: - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([2, 4, 0, 4, 4, 4]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - # simple 2D case - # fixme dimension mismatch case - # index_2d = np.array([[ 1, 2, 0], - # [ 5, 5, 5], - # [ 2, 3, 4]]) - # index_2d_num = num.array(index_2d) - # assert np.array_equal(y[index_2d], y_num[index_2d_num]) + print("advance indexing test 3") + index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) + index_2d_num = num.array(index_2d) + assert np.array_equal(y[index_2d], y_num[index_2d_num]) # mismatch dimesion case: - # indx_bool = np.array([True, True]) + # print ("advance indexing test 4") + # indx_bool = np.array([True, False]) # indx_bool_num = num.array(indx_bool) # res = z[indx_bool] - # res_num = z_num[indx_bool_num] - # print ("bool array as indx np:") + # print("IRINA DEBUG") + # assert np.array_equal(indx_bool.nonzero(), indx_bool_num.nonzero()) + # print("bool array as indx np:") # print(res) 
- # print ("cunumeric:") - # print (res_num) + # print("cunumeric:") + # res_num = z_num[indx_bool_num] + # print(res_num) + # assert np.array_equal(res, res_num) # test for bool array of the same dimension + print("advance indexing test 5") indx_bool = np.array( [ [ @@ -80,20 +90,20 @@ def advanced_indexing(): indx_bool_num = num.array(indx_bool) res = z[indx_bool] res_num = z_num[indx_bool_num] - print("bool array as indx np:") - print(res) - print(z[indx_bool.nonzero()]) - print("cunumeric:") - print(res_num) - # fixme unomment when nonzero is fixed - # assert np.array_equal(res, res_num) + # print("bool array as indx np:") + # print(res) + # print("cunumeric:") + # print(res_num) + assert np.array_equal(res, res_num) # test mixed data + print("advance indexing test 6") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) - # case when multiple number of arays is send + # case when multiple number of arays is passed + print("advance indexing test 7") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -110,35 +120,47 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # FIXME: Combining Basic and Advanced Indexing Schemes: - # ind0 = np.array([True, True]) - # ind0_num=num.array(ind0) + # print ("advance indexing test 8") + # ind0 = np.array([1, 1]) + # ind0_num = num.array(ind0) # res = z[ind0, :, -1] # res_num = z_num[ind0_num, :, -1] - # print (res) - # fixme error + # print(res) # print(res_num) # assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing - x = np.array( - [ - [0.38, -0.16, 0.38, -0.41, -0.04], - [-0.47, -0.01, -0.18, -0.5, -0.49], - [0.02, 0.4, 0.33, 0.33, -0.13], - ] - ) - indx0 = np.array([0, 2]) - indx1 = np.array([2, 4]) + # simple 1d case + # y = np.array([0, -1, -2, -3, -4, -5]) + # y_num = num.array(y) + # index = np.array([2, 4, 0, 4, 4, 4]) + # index_num = num.array(index) + # print (y[index]) + # print(y_num[index]) + # y[index] = 0 + # y_num[index_num] =0 + # print (y_num) + + # 2D test + # x = np.array( + # [ + # [0.38, -0.16, 0.38, -0.41, -0.04], + # [-0.47, -0.01, -0.18, -0.5, -0.49], + # [0.02, 0.4, 0.33, 0.33, -0.13], + # ] + # ) + # indx0 = np.array([0, 1]) + # indx1 = np.array([1, 2]) # x_num = num.array(x) # indx0_num = num.array(indx0) # indx1_num = num.array(indx1) - print(x[indx0, indx1]) + # print(x[indx0, indx1]) # FIXME 0: # print (x_num[indx0_num,indx1_num]) # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) # print (x_num[indx0_num, indx1_num]) - x[indx0, indx1] = 0.0 - print(x) + # x[indx0, indx1] = 0.0 + # print(x) # x_num[indx0_num, indx1_num] =0.0 return From 3925e52ddf2ccac51202297d6a5ab358db9bc59b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 14 Mar 2022 14:24:58 -0700 Subject: [PATCH 04/33] adding broadcasting for index arrays --- cunumeric/deferred.py | 46 ++++++++++++++++++++++++++--------------- tests/index_routines.py | 30 +++++++++++++++++++-------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3bd950e3b..37f5d5390 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -314,13 +314,16 @@ def _zip_indices(self, arrays): data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") + new_arrays = tuple() for a in arrays: - if a.shape != shape: - raise TypeError( - "shape of all index arrrays should be the same" - ) 
if data_type != a.dtype: raise TypeError("type of all index arrrays should be the same") + if a.shape != shape: + a = a._broadcast(shape) + else: + a = a.base + new_arrays = new_arrays + (a,) + arrays = new_arrays # create output array which will store Point field where # N is number of index arrays # shape of the output array should be the same as the shape of each @@ -345,8 +348,8 @@ def _zip_indices(self, arrays): task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) for index_arr in arrays: - task.add_input(index_arr.base) - task.add_alignment(output_arr.base, index_arr.base) + task.add_input(index_arr) + task.add_alignment(output_arr.base, index_arr) task.execute() return output_arr @@ -365,6 +368,7 @@ def _create_indexing_array(self, key): store = store.project(dim + shift, k) shift -= 1 elif isinstance(k, slice): + # FIXME do we need to transform the store here? store = store.slice(dim + shift, k) elif isinstance(k, NumPyThunk): if k.dtype == np.bool: @@ -379,15 +383,7 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) - if key.ndim < store.ndim: - raise TypeError("Unimplimented") - # FIXME advance indexing task - # diff = store.ndim - key.ndim - # print ("IRINA DEBUG store ndim = " , store) - # for i in range(diff): - # store = store.slice((store.ndim - i - 1), slice(None)) - # print ("IRINA DEBUG store ndim = " , store) - elif key.ndim > store.ndim: + if key.ndim > store.ndim: if store.ndim != 1: raise ValueError("Advance indexing dimention mismatch") diff = store.ndim - key.ndim @@ -399,17 +395,33 @@ def _create_indexing_array(self, key): # IRINA fixme: replace `nonzero` case with the task with # output regions tuple_of_arrays = key.nonzero() + elif key.ndim < store.ndim: + # FIXME test and see if it works for 2D + diff = store.ndim - key.ndim + indx = np.expand_dims(key, list(range(diff, self.ndim))) + tuple_of_arrays = (indx,) + for dim in range(diff, self.ndim): + indx = np.expand_dims( + np.arrange( + self.shape[dim], + list(i for i in range(self.ndim) if i != dim), + ) + ) + tuple_of_arrays = tuple_of_arrays + (indx,) else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if len(tuple_of_arrays) > 1: + if len(tuple_of_arrays) == self.ndim and self.ndim > 1: + output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) return store, output_arr - else: + elif len(tuple_of_arrays) == 1 and self.ndim == 1: return store, tuple_of_arrays[0] + else: + raise ValueError("Advance indexing dimention mismatch") @staticmethod def _unpack_ellipsis(key, ndim): diff --git a/tests/index_routines.py b/tests/index_routines.py index fcef0aefe..7303e09c7 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -25,8 +25,8 @@ def advanced_indexing(): - # simple advance indexing: - print("advance indexing test 1") + # simple advanced indexing: + print("advanced indexing test 1") x = np.array([1, 2, 3, 4, 5, 6, 7]) indx = np.array([1, 3, 5]) res = x[indx] @@ -35,8 +35,8 @@ def advanced_indexing(): res_num = x_num[indx_num] assert np.array_equal(res, res_num) - # advance indexing test when a.ndim ==1 , indx.ndim >1 - print("advance indexing test 2") + # advanced indexing test when a.ndim ==1 , indx.ndim >1 + print("advanced indexing test 2") y = np.array([0, -1, -2, -3, -4, -5]) y_num = num.array(y) index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) @@ -52,7 +52,7 @@ def advanced_indexing(): z_num = 
num.array(z) # simple 2D case - print("advance indexing test 3") + print("advanced indexing test 3") index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) @@ -72,7 +72,7 @@ def advanced_indexing(): # assert np.array_equal(res, res_num) # test for bool array of the same dimension - print("advance indexing test 5") + print("advanced indexing test 5") indx_bool = np.array( [ [ @@ -97,13 +97,13 @@ def advanced_indexing(): assert np.array_equal(res, res_num) # test mixed data - print("advance indexing test 6") + print("advanced indexing test 6") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) # case when multiple number of arays is passed - print("advance indexing test 7") + print("advanced indexing test 7") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -119,8 +119,20 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) + # indices with broadcast: + indx0 = np.array([[0, 1], [1, 0], [0, 0]]) + indx1 = np.array([[0, 1]]) + indx2 = np.array([[3, 2], [1, 0], [3, 2]]) + + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + indx2_num = num.array(indx2) + res = z_num[indx0_num, indx1_num, indx2_num] + res_np = z[indx0, indx1, indx2] + assert np.array_equal(res, res_np) + # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advance indexing test 8") + # print ("advanced indexing test 8") # ind0 = np.array([1, 1]) # ind0_num = num.array(ind0) # res = z[ind0, :, -1] From e7708c7447870c71f1f051c5a07c28a7787c9dab Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 15 Mar 2022 11:22:13 -0700 Subject: [PATCH 05/33] adding advanced_indexing task --- cunumeric/config.py | 1 + cunumeric/deferred.py | 73 +++++---- src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/index/advanced_indexing.cc | 76 +++++++++ src/cunumeric/index/advanced_indexing.cu | 148 ++++++++++++++++++ src/cunumeric/index/advanced_indexing.h | 43 +++++ src/cunumeric/index/advanced_indexing_omp.cc | 92 +++++++++++ .../index/advanced_indexing_template.inl | 79 ++++++++++ tests/index_routines.py | 29 +++- tests/nonzero.py | 4 + 11 files changed, 515 insertions(+), 34 deletions(-) create mode 100644 src/cunumeric/index/advanced_indexing.cc create mode 100644 src/cunumeric/index/advanced_indexing.cu create mode 100644 src/cunumeric/index/advanced_indexing.h create mode 100644 src/cunumeric/index/advanced_indexing_omp.cc create mode 100644 src/cunumeric/index/advanced_indexing_template.inl diff --git a/cunumeric/config.py b/cunumeric/config.py index e1462fbd3..2ec560d50 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -78,6 +78,7 @@ def destroy(self): # Match these to CuNumericOpCode in cunumeric_c.h @unique class CuNumericOpCode(IntEnum): + ADVANCED_INDX = _cunumeric.CUNUMERIC_ADVANCED_INDEXING ARANGE = _cunumeric.CUNUMERIC_ARANGE BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 37f5d5390..91cad64b7 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -372,6 +372,8 @@ def _create_indexing_array(self, key): store = store.slice(dim + shift, k) elif isinstance(k, NumPyThunk): if k.dtype == np.bool: + # in case of the mixed indises we all nonzero + # for the bool array k = k.nonzero() tuple_of_arrays += k else: @@ -392,22 +394,36 @@ 
def _create_indexing_array(self, key): # Handle the boolean array case if key.dtype == np.bool: + if key.shape == self.shape: + out = self.runtime.create_unbound_thunk(self.dtype) + task = self.context.create_task( + CuNumericOpCode.ADVANCED_INDX + ) + task.add_output(out.base) + task.add_input(self.base) + task.add_input(key.base) + task.add_alignment(self.base, key.base) + task.execute() + return False, store, out # IRINA fixme: replace `nonzero` case with the task with - # output regions + # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - # FIXME test and see if it works for 2D - diff = store.ndim - key.ndim - indx = np.expand_dims(key, list(range(diff, self.ndim))) - tuple_of_arrays = (indx,) - for dim in range(diff, self.ndim): - indx = np.expand_dims( - np.arrange( - self.shape[dim], - list(i for i in range(self.ndim) if i != dim), - ) - ) - tuple_of_arrays = tuple_of_arrays + (indx,) + raise ValueError("Advance indexing dimention mismatch") + # FIXME add extensions to ZIP taskD + # ndim_out = store.ndim + key.ndim-1 + # indx = key._expand_dims(list(range(key.ndim, ndim_out))) + # np.expand_dims(key, list(range(key.ndim, ndim_out))) + # print("IRINA DEBUG shape key " , indx.shape) + # tuple_of_arrays = (indx,) + # for dim in range(1, store.ndim): + # dims= list(i for i in range(ndim_out) if i + # not in range(dim+key.ndim-1,dim+2*key.ndim-1)) + # print("IRINA DEBUG dims = ", dims) + + # indx = np.arrange( + # self.shape[dim])._expand_dims(dims) + # tuple_of_arrays = tuple_of_arrays + (indx,) else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -417,9 +433,9 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) == self.ndim and self.ndim > 1: output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) - return store, output_arr + return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: - return store, tuple_of_arrays[0] + return True, store, tuple_of_arrays[0] else: raise ValueError("Advance indexing dimention mismatch") @@ -483,20 +499,23 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - store, index_array = self._create_indexing_array(key) - # Create a new array to be the result - result = self.runtime.create_empty_thunk( - index_array.base.shape, - self.dtype, - inputs=[self], - ) - copy = self.context.create_copy() + copy_needed, store, index_array = self._create_indexing_array(key) + if copy_needed: + # Create a new array to be the result + result = self.runtime.create_empty_thunk( + index_array.base.shape, + self.dtype, + inputs=[self], + ) + copy = self.context.create_copy() - copy.add_input(store) - copy.add_source_indirect(index_array.base) - copy.add_output(result.base) + copy.add_input(store) + copy.add_source_indirect(index_array.base) + copy.add_output(result.base) - copy.execute() + copy.execute() + else: + return index_array else: result = self._get_view(key) diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 9778d5dd1..896c6ca4f 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -25,6 +25,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/nullary/arange.cc \ cunumeric/nullary/eye.cc \ cunumeric/nullary/fill.cc \ + cunumeric/index/advanced_indexing.cc \ cunumeric/index/choose.cc \ cunumeric/index/repeat.cc \ cunumeric/index/zip.cc \ @@ -65,6 +66,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/nullary/arange_omp.cc \ 
cunumeric/nullary/eye_omp.cc \ cunumeric/nullary/fill_omp.cc \ + cunumeric/index/advanced_indexing_omp.cc\ cunumeric/index/choose_omp.cc \ cunumeric/index/repeat_omp.cc \ cunumeric/index/zip_omp.cc \ @@ -104,6 +106,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/nullary/arange.cu \ cunumeric/nullary/eye.cu \ cunumeric/nullary/fill.cu \ + cunumeric/index/advanced_indexing.cu \ cunumeric/index/choose.cu \ cunumeric/index/repeat.cu \ cunumeric/index/zip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 68f4f56fd..0e8106ff1 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -23,6 +23,7 @@ // Also, sort these alphabetically except the first one for easy lookup later enum CuNumericOpCode { _CUNUMERIC_OP_CODE_BASE = 0, + CUNUMERIC_ADVANCED_INDEXING, CUNUMERIC_ARANGE, CUNUMERIC_BINARY_OP, CUNUMERIC_BINARY_RED, diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc new file mode 100644 index 000000000..6b9224338 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.cc @@ -0,0 +1,76 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + const size_t volume = rect_index.volume(); + size_t size = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + if (index[p] == true) { size++; } + } + + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::cpu_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) +{ + AdvancedIndexingTask::register_variants(); +} +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu new file mode 100644 index 000000000..f818579ed --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.cu @@ -0,0 +1,148 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" +#include "cunumeric/cuda_help.h" + +#include +#include + +namespace cunumeric { + +using namespace Legion; + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + count_nonzero_kernel(size_t volume, + Output out, + AccessorRO index, + Pitches pitches, + Point origin, + size_t iters, + Buffer offsets) +{ + int64_t value = 0; + for (size_t idx = 0; idx < iters; idx++) { + const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (offset < volume) { + auto point = pitches.unflatten(offset, origin); + auto val = static_cast(index[point]); + offsets[offset] = val; + SumReduction::fold(value, val); + } + } + // Every thread in the thread block must participate in the exchange to get correct results + reduce_output(out, value); +} + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + advanced_indexing_kernel(size_t volume, + AccessorRO in, + AccessorRO index, + Buffer out, + Pitches pitches_input, + Point origin_input, + Pitches pitches_index, + Point origin_index, + Buffer offsets) +{ + // FIXME works only when DIM1==DIM2 + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= volume) return; + auto point = pitches_index.unflatten(tid, origin_index); + auto point_input = pitches_input.unflatten(tid, origin_input); + if (index[point] == true) { + int64_t offset = offsets[tid]; + out[offset] = in[point_input]; + } +} +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + int64_t compute_size(const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + cudaStream_t stream, + Buffer& offsets) const + { + DeferredReduction> size; + + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t); + + if (blocks >= MAX_REDUCTION_CTAS) { + const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; + count_nonzero_kernel<<>>( + volume, size, in, pitches, rect.lo, iters, offsets); + } else + count_nonzero_kernel<<>>( + volume, size, in, pitches, rect.lo, 1, offsets); + + cudaStreamSynchronize(stream); + + auto off_ptr = offsets.ptr(0); + thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr); + + return size.read(); + } + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + int64_t size = 0; + const bool* index_ptr = index.ptr(rect_index); + const size_t volume = rect_index.volume(); + cudaStream_t stream; + cudaStreamCreate(&stream); + auto offsets = create_buffer(volume, Memory::Kind::GPU_FB_MEM); + size = compute_size(index, 
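+                        // compute_size is a two-phase stream-compaction
+                        // setup: count_nonzero_kernel both reduces the number
+                        // of true entries and records a 0/1 flag per element,
+                        // then a thrust exclusive scan turns the flags into
+                        // dense write offsets (offsets[i] = output slot of i).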
pitches_index, rect_index, volume, stream, offsets); + + out = create_buffer(size, Memory::Kind::GPU_FB_MEM); + // populate output + if (size > 0) { + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + advanced_indexing_kernel<<>>(volume, + input, + index, + out, + pitches_input, + rect_input.lo, + pitches_index, + rect_index.lo, + offsets); + } + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::gpu_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing.h b/src/cunumeric/index/advanced_indexing.h new file mode 100644 index 000000000..ec0c92681 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.h @@ -0,0 +1,43 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct AdvancedIndexingArgs { + Array& output; + const Array& input_array; + const Array& indexing_array; +}; + +class AdvancedIndexingTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_ADVANCED_INDEXING; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc new file mode 100644 index 000000000..5128ac75d --- /dev/null +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -0,0 +1,92 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
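+ *
+ * Design note on the GPU variant in advanced_indexing.cu above: once the
+ * offsets buffer holds the exclusive scan, advanced_indexing_kernel writes
+ * out[offsets[tid]] = in[point] for every element whose index flag is true;
+ * the scan guarantees each selected element a unique, densely packed slot.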
+ * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" +#include "cunumeric/omp_help.h" +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + const size_t volume = rect_index.volume(); + const auto max_threads = omp_get_max_threads(); + int64_t size = 0; + ThreadLocalStorage offsets(max_threads); + + { + ThreadLocalStorage sizes(max_threads); + for (auto idx = 0; idx < max_threads; ++idx) sizes[idx] = 0; +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto point = pitches_index.unflatten(idx, rect_index.lo); + if (index[point] == true) sizes[tid] += 1; + } + } + + for (auto idx = 0; idx < max_threads; ++idx) size += sizes[idx]; + + offsets[0] = 0; + for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; + } + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + int64_t out_idx = offsets[tid]; +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto point = pitches_index.unflatten(idx, rect_index.lo); + auto point_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[point] == true) { + out[out_idx] = input[point_input]; + ++out_idx; + } + } + } + + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::omp_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl new file mode 100644 index 000000000..ed88ac996 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing_template.inl @@ -0,0 +1,79 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
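For reference, the CPU, OpenMP, and CUDA variants of the advanced-indexing task shown above all implement the same stream-compaction scheme: count the selected elements, exclusive-scan the per-element flags into output offsets, then gather. A minimal NumPy sketch of that scheme (illustrative only — not the cuNumeric API, and helper names here are invented):

```python
import numpy as np

def advanced_indexing_bool(inp, mask):
    # flags: 1 where the boolean index is True, 0 elsewhere
    flags = mask.ravel().astype(np.int64)
    size = int(flags.sum())             # the size reduction
    offsets = np.cumsum(flags) - flags  # exclusive prefix sum, as in thrust::exclusive_scan
    out = np.empty(size, dtype=inp.dtype)
    flat = inp.ravel()
    for i in range(flags.size):         # the "populate output" pass
        if flags[i]:
            out[offsets[i]] = flat[i]
    return out

x = np.arange(12).reshape(3, 4)
mask = x % 3 == 0
assert np.array_equal(advanced_indexing_bool(x, mask), x[mask])
```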
+ * + */ + +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody; + +template +struct AdvancedIndexingImpl { + template + void operator()(AdvancedIndexingArgs& args) const + { + using VAL = legate_type_of; + auto input_rect = args.input_array.shape(); + auto input_arr = args.input_array.read_accessor(input_rect); + Pitches input_pitches; + Buffer output_arr; + size_t volume1 = input_pitches.flatten(input_rect); + + auto index_rect = args.indexing_array.shape(); + auto index_arr = args.indexing_array.read_accessor(index_rect); + Pitches index_pitches; + size_t volume2 = index_pitches.flatten(index_rect); + + if (volume1 == 0 || volume2 == 0) { + auto empty = create_buffer(0); + args.output.return_data(empty, 0); + return; + } + + int64_t size = 0; + if (DIM1 == DIM2) { + size = AdvancedIndexingImplBody{}( + output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect); + } else { + // should never go here, not implemented + assert(false); + } + args.output.return_data(output_arr, size); + } +}; + +template +struct AdvancedIndexingHelper { + template + void operator()(AdvancedIndexingArgs& args) const + { + dim_dispatch(args.indexing_array.dim(), AdvancedIndexingImpl{}, args); + } +}; + +template +static void advanced_indexing_template(TaskContext& context) +{ + AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1]}; + double_dispatch( + args.input_array.dim(), args.input_array.code(), AdvancedIndexingHelper{}, args); +} + +} // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 7303e09c7..1681adf7f 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -57,6 +57,18 @@ def advanced_indexing(): index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) + # mismatch dimesion case integers: + # print ("advance indexing test 4") + # indx = np.array([1, 1]) + # indx_num = num.array(indx) + # res = z[indx] + # print("bool array as indx np:") + # print(res) + # print("cunumeric:") + # res_num = z_num[indx_num] + # print(res_num) + # assert np.array_equal(res, res_num) + # mismatch dimesion case: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) @@ -73,6 +85,12 @@ def advanced_indexing(): # test for bool array of the same dimension print("advanced indexing test 5") + index = np.array([True, False, False, True, True, False]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) + + # test for bool array of the same dimension 2D + print("advanced indexing test 6") indx_bool = np.array( [ [ @@ -90,20 +108,16 @@ def advanced_indexing(): indx_bool_num = num.array(indx_bool) res = z[indx_bool] res_num = z_num[indx_bool_num] - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # print(res_num) assert np.array_equal(res, res_num) # test mixed data - print("advanced indexing test 6") + print("advanced indexing test 7") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) # case when multiple number of arays is passed - print("advanced indexing test 7") + print("advanced indexing test 8") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -120,6 +134,7 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # indices with broadcast: + print("advanced indexing test 9") indx0 = 
np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -132,7 +147,7 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advanced indexing test 8") + # print ("advanced indexing test 10") # ind0 = np.array([1, 1]) # ind0_num = num.array(ind0) # res = z[ind0, :, -1] diff --git a/tests/nonzero.py b/tests/nonzero.py index 109825f0f..6cd3d1472 100644 --- a/tests/nonzero.py +++ b/tests/nonzero.py @@ -101,6 +101,10 @@ def test(): np_nonzero = np.nonzero(x_np) assert_equal(lg_nonzero, np_nonzero) + x_np = np.array([True, True]) + x = num.array(x_np) + assert np.array_equal(x_np.nonzero(), x.nonzero()) + if __name__ == "__main__": test() From 6b4acf1ae65246d991e54670bbadc859570ef6b4 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 10:51:04 -0700 Subject: [PATCH 06/33] extended ZIP task to support case when index.ndim < self.ndim --- cunumeric/deferred.py | 66 ++++++++++++++++------------ src/cunumeric/index/zip.cc | 28 ++++++++---- src/cunumeric/index/zip.cu | 50 +++++++++++++++------ src/cunumeric/index/zip.h | 2 + src/cunumeric/index/zip_omp.cc | 28 ++++++++---- src/cunumeric/index/zip_template.inl | 15 ++++--- tests/index_routines.py | 25 ++++++----- 7 files changed, 140 insertions(+), 74 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 91cad64b7..7d7b0e53d 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -315,14 +315,28 @@ def _zip_indices(self, arrays): if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError("type of all index arrrays should be the same") - if a.shape != shape: - a = a._broadcast(shape) - else: - a = a.base - new_arrays = new_arrays + (a,) + key_dim = len(arrays[0].shape) + + if len(arrays) == 1: + # special case when a single index array is passed and it's dim < + # self.ndims + shape = shape + tuple(self.shape[i] for i in range(1, self.ndim)) + array = arrays[0].base + start = key_dim - 1 + for i in range(1, self.ndim): + array = array.promote(start + i, self.shape[i]) + new_arrays += (array,) + else: + for a in arrays: + if data_type != a.dtype: + raise TypeError( + "type of all index arrrays should be the same" + ) + if a.shape != shape: + a = a._broadcast(shape) + else: + a = a.base + new_arrays = new_arrays + (a,) arrays = new_arrays # create output array which will store Point field where # N is number of index arrays @@ -335,7 +349,7 @@ def _zip_indices(self, arrays): # but it should be safe to directly create a DeferredArray # of that dtype, so long as we don't try to convert it to a # NumPy array. 
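The nonzero() lowering used for boolean keys (exercised by the nonzero test above) relies on a standard NumPy identity: indexing with a boolean mask is equivalent to indexing with the tuple of integer arrays returned by nonzero(). A quick NumPy check of that identity (illustrative only):

```python
import numpy as np

x = np.arange(12).reshape(3, 4)
mask = x % 5 == 0
assert np.array_equal(x[mask], x[mask.nonzero()])

# a 1-d boolean mask along the first axis lowers the same way
row_mask = np.array([True, False, True])
assert np.array_equal(x[row_mask], x[row_mask.nonzero()])
```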
- N = len(arrays) + N = self.ndim pointN_dtype = self.runtime.add_point_type(N) store = self.context.create_store( pointN_dtype, shape=shape, optimize_scalar=True @@ -347,9 +361,18 @@ def _zip_indices(self, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - for index_arr in arrays: - task.add_input(index_arr) - task.add_alignment(output_arr.base, index_arr) + if len(arrays) == 1: + task.add_input(arrays[0]) + task.add_alignment(arrays[0], output_arr.base) + task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(key_dim, ty.int64) + task.add_broadcast(arrays[0], axes=range(1, len(shape))) + else: + task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(self.ndim, ty.int64) + for index_arr in arrays: + task.add_input(index_arr) + task.add_alignment(output_arr.base, index_arr) task.execute() return output_arr @@ -409,21 +432,8 @@ def _create_indexing_array(self, key): # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - raise ValueError("Advance indexing dimention mismatch") - # FIXME add extensions to ZIP taskD - # ndim_out = store.ndim + key.ndim-1 - # indx = key._expand_dims(list(range(key.ndim, ndim_out))) - # np.expand_dims(key, list(range(key.ndim, ndim_out))) - # print("IRINA DEBUG shape key " , indx.shape) - # tuple_of_arrays = (indx,) - # for dim in range(1, store.ndim): - # dims= list(i for i in range(ndim_out) if i - # not in range(dim+key.ndim-1,dim+2*key.ndim-1)) - # print("IRINA DEBUG dims = ", dims) - - # indx = np.arrange( - # self.shape[dim])._expand_dims(dims) - # tuple_of_arrays = tuple_of_arrays + (indx,) + output_arr = self._zip_indices((key,)) + return True, store, output_arr else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -432,7 +442,7 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) == self.ndim and self.ndim > 1: - output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) + output_arr = self._zip_indices(tuple_of_arrays) return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: return True, store, tuple_of_arrays[0] diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 364d7e973..e0bb67e48 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -32,20 +32,30 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { - const size_t volume = rect.volume(); - if (dense) { - auto outptr = out.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); - // std::cout<<"IRINA DEBUG dense out = "< 1) { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } } - } else { + } else if (index_arrays.size() == 1) { + const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); - // std::cout<<"IRINA DEBUG out = "< new_point; + new_point[0] = index_arrays[0][p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; 
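In NumPy terms, the ZIP task fuses k broadcast-compatible integer index arrays into a single field of N-dimensional points, which then drives one gather copy. A sketch of that equivalence (function names here are illustrative, not the real API):

```python
import numpy as np

def zip_indices(*index_arrays):
    # broadcast the index arrays together, then store one point per element
    # as a trailing axis of length N (the analogue of the Point<N> field)
    return np.stack(np.broadcast_arrays(*index_arrays), axis=-1)

def gather(src, points):
    out = np.empty(points.shape[:-1], dtype=src.dtype)
    for pos in np.ndindex(out.shape):
        out[pos] = src[tuple(points[pos])]
    return out

z = np.arange(24).reshape(2, 3, 4)
i0 = np.array([[0, 1], [1, 0], [0, 0]])
i1 = np.array([[0, 1], [2, 0], [1, 2]])
i2 = np.array([[3, 2], [1, 0], [3, 2]])
assert np.array_equal(gather(z, zip_indices(i0, i1, i2)), z[i0, i1, i2])
```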
} } } diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 1bd8b6aef..bbfdbbb07 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -50,6 +50,24 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) out[idx] = Legion::Point(index_arrays[Is][idx]...); } +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel(const AccessorWO, DIM> out, + const AccessorRO index_array, + const Rect rect, + const Pitches pitches, + int volume, + const int64_t key_dim) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + auto p = pitches.unflatten(idx, rect.lo); + Legion::Point new_point; + new_point[0] = index_array[p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; +} + template struct ZipImplBody { using VAL = int64_t; @@ -60,24 +78,30 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (dense) { - DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, - Rect<1>(0, index_arrays.size() - 1)); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { - idx_arr[idx] = index_arrays[idx].ptr(rect); + if (index_arrays.size() > 1) { + if (dense) { + DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { + idx_arr[idx] = index_arrays[idx].ptr(rect); + } + zip_kernel_dense<<>>( + out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); + } else { + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + zip_kernel<<>>( + out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - zip_kernel_dense<<>>( - out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); - } else { - DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, - Rect<1>(0, index_arrays.size() - 1)); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; - zip_kernel<<>>( - out, idx_arr, rect, pitches, volume, std::make_index_sequence()); + } else if (index_arrays.size() == 1) { + zip_kernel + <<>>(out, index_arrays[0], rect, pitches, volume, key_dim); } } }; diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index ae7476b05..bedad8a7a 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -23,6 +23,8 @@ namespace cunumeric { struct ZipArgs { const Array& out; const std::vector& inputs; + const int64_t N; + const int64_t key_dim; }; class ZipTask : public CuNumericTask { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 8cfebb32d..30a51a48c 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -32,22 +32,34 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { const size_t volume = rect.volume(); - if (dense) { - auto outptr = out.ptr(rect); + if (index_arrays.size() > 1) { + if (dense) { + auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); - } - } else { + for (size_t 
idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } + } // else + } else if (index_arrays.size() == 1) { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + new_point[0] = index_arrays[0][p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; } - } // else + } } }; diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index f16b89474..e1c99771e 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,8 +37,12 @@ struct ZipImpl { size_t volume = pitches.flatten(index_rect); if (volume == 0) return; +#ifdef CUNUMERIC_DEBUG + assert(out_rect == index_rect) +#endif + #ifndef LEGION_BOUNDS_CHECKS - bool dense = out.accessor.is_dense_row_major(out_rect); + bool dense = out.accessor.is_dense_row_major(out_rect); #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { @@ -52,17 +56,18 @@ struct ZipImpl { #ifdef LEGION_BOUNDS_CHECKS bool dense = false; #endif - ZipImplBody()( - out, index_arrays, index_rect, pitches, dense, std::make_index_sequence()); + out, index_arrays, index_rect, pitches, dense, args.key_dim, std::make_index_sequence()); } }; template static void zip_template(TaskContext& context) { - ZipArgs args{context.outputs()[0], context.inputs()}; - double_dispatch(args.inputs[0].dim(), args.inputs.size(), ZipImpl{}, args); + int64_t N = context.scalars()[0].value(); + int64_t key_dim = context.scalars()[1].value(); + ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim}; + double_dispatch(args.inputs[0].dim(), N, ZipImpl{}, args); } } // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 1681adf7f..79db6de6d 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -58,18 +58,21 @@ def advanced_indexing(): assert np.array_equal(y[index_2d], y_num[index_2d_num]) # mismatch dimesion case integers: - # print ("advance indexing test 4") - # indx = np.array([1, 1]) - # indx_num = num.array(indx) - # res = z[indx] - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # res_num = z_num[indx_num] - # print(res_num) - # assert np.array_equal(res, res_num) + print("advanced indexing test 4") + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) + + # 2d: + indx = np.array([[1, 1], [1, 0]]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) - # mismatch dimesion case: + # mismatch dimesion case bool: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) # indx_bool_num = num.array(indx_bool) From 51c5f50c1cd9e6f1fc5dc2d9e0db7c460060cc07 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 14:05:24 -0700 Subject: [PATCH 07/33] adding support for the mixed type of the arguments --- cunumeric/deferred.py | 20 +++++++++++++++++--- tests/index_routines.py | 20 +++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 
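The case this patch extends ZIP for — fewer index arrays than input dimensions — follows NumPy's rule that the remaining axes are kept whole: for a 3-d z, z[idx] is z[idx, :, :]. The promoted index array plus the p[key_dim + i - 1] coordinates realize exactly that mapping. A NumPy model (illustrative only):

```python
import numpy as np

z = np.arange(24).reshape(2, 3, 4)
idx = np.array([1, 1])

# output shape: the index array's shape followed by the untouched axes
out_shape = idx.shape + z.shape[1:]
out = np.empty(out_shape, dtype=z.dtype)
for p in np.ndindex(out_shape):
    # first point coordinate comes from the index array, the rest from p
    out[p] = z[(idx[p[0]],) + p[1:]]

assert np.array_equal(out, z[idx])
assert np.array_equal(out, z[idx, :, :])
```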
7d7b0e53d..3c2a0f0a7 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -389,10 +389,22 @@ def _create_indexing_array(self, key): if k < 0: k += store.shape[dim + shift] store = store.project(dim + shift, k) + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + self = store_copy + store = store_copy.base shift -= 1 elif isinstance(k, slice): - # FIXME do we need to transform the store here? - store = store.slice(dim + shift, k) + store = store elif isinstance(k, NumPyThunk): if k.dtype == np.bool: # in case of the mixed indises we all nonzero @@ -440,7 +452,9 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if len(tuple_of_arrays) == self.ndim and self.ndim > 1: + if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or ( + len(tuple_of_arrays) < self.ndim > 1 + ): output_arr = self._zip_indices(tuple_of_arrays) return True, store, output_arr diff --git a/tests/index_routines.py b/tests/index_routines.py index 79db6de6d..72b281ef1 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -115,8 +115,8 @@ def advanced_indexing(): # test mixed data print("advanced indexing test 7") - res = z[-1, :] - res_num = z_num[-1, :] + res = z[:, -1] + res_num = z_num[:, -1] assert np.array_equal(res, res_num) # case when multiple number of arays is passed @@ -149,15 +149,17 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) - # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advanced indexing test 10") - # ind0 = np.array([1, 1]) - # ind0_num = num.array(ind0) - # res = z[ind0, :, -1] - # res_num = z_num[ind0_num, :, -1] + # Combining Basic and Advanced Indexing Schemes: + print("advanced indexing test 10") + ind0 = np.array([1, 1]) + ind0_num = num.array(ind0) + res = z[ind0, :, -1] + res_num = z_num[ind0_num, :, -1] + # res = z[ind0,-1] # print(res) + # res_num = z_num[ind0,-1] # print(res_num) - # assert np.array_equal(res, res_num) + assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case From 2d6a67132ae0bd9c32eeffc5b41a601bc82d9956 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 19:56:46 -0700 Subject: [PATCH 08/33] adding support for number of arrays passed as indices< self.ndim --- cunumeric/deferred.py | 68 ++++++++++++++++++++++++++++++-------- src/cunumeric/index/zip.cc | 12 ++++--- tests/index_routines.py | 18 ++++++++++ 3 files changed, 81 insertions(+), 17 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3c2a0f0a7..776fb4fea 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -305,35 +305,76 @@ def get_scalar_array(self): result = np.frombuffer(buf, dtype=self.dtype, count=1) return result.reshape(()) + def broadcast_shapes(self, shapes): + arrays = [np.empty(x, dtype=[]) for x in shapes] + return np.broadcast(*arrays).shape + def _zip_indices(self, arrays): if not isinstance(arrays, tuple): raise TypeError("zip_indices expect tuple of arrays") arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type - shape = arrays[0].shape data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") - 
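The broadcast_shapes helper introduced above computes a common broadcast shape without materializing any data, by broadcasting zero-itemsize arrays. Its behavior checked against plain NumPy (note that NumPy >= 1.20 ships an equivalent np.broadcast_shapes):

```python
import numpy as np

def broadcast_shapes(shapes):
    # np.empty(shape, dtype=[]) allocates no payload: the empty structured
    # dtype has itemsize 0, so only the shape metadata gets broadcast
    arrays = [np.empty(s, dtype=[]) for s in shapes]
    return np.broadcast(*arrays).shape

assert broadcast_shapes([(2,), (2, 2)]) == (2, 2)
assert broadcast_shapes([(3, 1), (1, 4)]) == (3, 4)
```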
new_arrays = tuple() - key_dim = len(arrays[0].shape) + + shapes = tuple(a.shape for a in arrays) + if len(arrays) > 1: + b_shape = self.broadcast_shapes(shapes) + else: + b_shape = arrays[0].shape + key_dim = len(b_shape) + print("IRINA DEBUG key_dim", key_dim, b_shape) + out_shape = b_shape if len(arrays) == 1: # special case when a single index array is passed and it's dim < # self.ndims - shape = shape + tuple(self.shape[i] for i in range(1, self.ndim)) + out_shape = b_shape + tuple( + self.shape[i] for i in range(1, self.ndim) + ) array = arrays[0].base start = key_dim - 1 + new_arrays = tuple() for i in range(1, self.ndim): array = array.promote(start + i, self.shape[i]) new_arrays += (array,) + elif len(arrays) < self.ndim: + N = len(arrays) + # broadcast shapes + new_arrays = tuple() + for a in arrays: + if data_type != a.dtype: + raise TypeError( + "type of all index arrrays should be the same" + ) + if a.shape != b_shape: + new_arrays += (a._broadcast(b_shape),) + else: + new_arrays += (a.base,) + arrays = new_arrays + # output shape + out_shape = b_shape + tuple( + self.shape[i] for i in range(N, self.ndim) + ) + print("IRINA DEBUG out_shape = ", out_shape) + new_arrays = tuple() + start = key_dim - 1 + for a in arrays: + for i in range(N, self.ndim): + a = a.promote(key_dim + i - N, self.shape[i]) + new_arrays += (a,) + arrays = new_arrays + else: + new_arrays = tuple() for a in arrays: if data_type != a.dtype: raise TypeError( "type of all index arrrays should be the same" ) - if a.shape != shape: - a = a._broadcast(shape) + if a.shape != b_shape: + a = a._broadcast(b_shape) else: a = a.base new_arrays = new_arrays + (a,) @@ -352,7 +393,7 @@ def _zip_indices(self, arrays): N = self.ndim pointN_dtype = self.runtime.add_point_type(N) store = self.context.create_store( - pointN_dtype, shape=shape, optimize_scalar=True + pointN_dtype, shape=out_shape, optimize_scalar=True ) output_arr = DeferredArray( self.runtime, base=store, dtype=pointN_dtype @@ -361,12 +402,13 @@ def _zip_indices(self, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - if len(arrays) == 1: - task.add_input(arrays[0]) - task.add_alignment(arrays[0], output_arr.base) - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(key_dim, ty.int64) - task.add_broadcast(arrays[0], axes=range(1, len(shape))) + if len(arrays) < self.ndim: + task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point + task.add_scalar_arg(key_dim, ty.int64) # key_dim + for a in arrays: + task.add_input(a) + task.add_alignment(a, output_arr.base) + task.add_broadcast(a, axes=range(1, len(out_shape))) else: task.add_scalar_arg(self.ndim, ty.int64) task.add_scalar_arg(self.ndim, ty.int64) diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index e0bb67e48..8b446da5f 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -35,7 +35,7 @@ struct ZipImplBody { const int64_t key_dim, std::index_sequence) const { - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { auto outptr = out.ptr(rect); @@ -48,13 +48,17 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = 
index_arrays[0][p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + for (size_t i = 0; i < index_arrays.size(); i++) new_point[i] = index_arrays[i][p]; + for (size_t i = index_arrays.size(); i < N; i++) { + int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + new_point[i] = p[j]; + } + std::cout << "IRINA DEBUG" << new_point << std::endl; out[p] = new_point; } } diff --git a/tests/index_routines.py b/tests/index_routines.py index 72b281ef1..bc2a374d2 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -72,6 +72,24 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + # 2 arrays passed do 3d array + indx0 = np.array([1, 1]) + indx1 = np.array([1, 0]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = z[indx0, indx1] + res_num = z_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + # 2 arrays with broadcasting + indx0 = np.array([1, 1]) + indx1 = np.array([[1, 0], [1, 0]]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = z[indx0, indx1] + res_num = z_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + # mismatch dimesion case bool: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) From 31189691057b7f6797def7d98cc284ff8506b196 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 18 Mar 2022 12:18:54 -0700 Subject: [PATCH 09/33] adding support for the use case arr[:, indx, :] --- cunumeric/deferred.py | 64 +++++++++++++++++++++------- src/cunumeric/index/zip.cc | 9 ++-- src/cunumeric/index/zip.cu | 23 +++++++--- src/cunumeric/index/zip.h | 1 + src/cunumeric/index/zip_omp.cc | 13 +++++- src/cunumeric/index/zip_template.inl | 21 ++++++--- tests/index_routines.py | 46 +++++++++++--------- 7 files changed, 123 insertions(+), 54 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 776fb4fea..8a37b6803 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -309,9 +309,12 @@ def broadcast_shapes(self, shapes): arrays = [np.empty(x, dtype=[]) for x in shapes] return np.broadcast(*arrays).shape - def _zip_indices(self, arrays): + def _zip_indices(self, start_index, arrays): + if not isinstance(arrays, tuple): raise TypeError("zip_indices expect tuple of arrays") + if start_index == -1: + start_index = 0 arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type data_type = arrays[0].dtype @@ -324,20 +327,27 @@ def _zip_indices(self, arrays): else: b_shape = arrays[0].shape key_dim = len(b_shape) - print("IRINA DEBUG key_dim", key_dim, b_shape) out_shape = b_shape if len(arrays) == 1: # special case when a single index array is passed and it's dim < # self.ndims - out_shape = b_shape + tuple( - self.shape[i] for i in range(1, self.ndim) + out_shape = ( + tuple(self.shape[i] for i in range(0, start_index)) + + b_shape + + tuple( + self.shape[i] for i in range(start_index + 1, self.ndim) + ) ) array = arrays[0].base start = key_dim - 1 new_arrays = tuple() - for i in range(1, self.ndim): + for i in range(0, start_index): + array = array.promote(i, self.shape[i]) + for i in range(start_index + 1, self.ndim): array = array.promote(start + i, self.shape[i]) + if array.shape != out_shape: + raise ValueError("Wrong shape calculation") new_arrays += (array,) elif len(arrays) < self.ndim: N = len(arrays) @@ -354,16 +364,21 @@ def _zip_indices(self, arrays): new_arrays += (a.base,) arrays = new_arrays # output shape - out_shape = b_shape + 
tuple( - self.shape[i] for i in range(N, self.ndim) + out_shape = ( + tuple(self.shape[i] for i in range(0, start_index)) + + b_shape + + tuple( + self.shape[i] for i in range(start_index + N, self.ndim) + ) ) - print("IRINA DEBUG out_shape = ", out_shape) new_arrays = tuple() start = key_dim - 1 for a in arrays: - for i in range(N, self.ndim): + for i in range(0, start_index): + a = a.promote(i, self.shape[i]) + for i in range(start_index + N, self.ndim): a = a.promote(key_dim + i - N, self.shape[i]) - new_arrays += (a,) + new_arrays += (a,) arrays = new_arrays else: @@ -405,13 +420,18 @@ def _zip_indices(self, arrays): if len(arrays) < self.ndim: task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point task.add_scalar_arg(key_dim, ty.int64) # key_dim + task.add_scalar_arg(start_index, ty.int64) # start_index for a in arrays: task.add_input(a) task.add_alignment(a, output_arr.base) - task.add_broadcast(a, axes=range(1, len(out_shape))) + task.add_broadcast(a, axes=tuple(range(1, len(out_shape)))) + task.add_broadcast( + output_arr.base, axes=tuple(range(1, len(out_shape))) + ) else: task.add_scalar_arg(self.ndim, ty.int64) task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(start_index, ty.int64) for index_arr in arrays: task.add_input(index_arr) task.add_alignment(output_arr.base, index_arr) @@ -423,6 +443,7 @@ def _create_indexing_array(self, key): # Convert everything into deferred arrays of int64 store = self.base shift = 0 + start_index = -1 if isinstance(key, tuple): tuple_of_arrays = () # for k in key: @@ -448,6 +469,10 @@ def _create_indexing_array(self, key): elif isinstance(k, slice): store = store elif isinstance(k, NumPyThunk): + # the very first time we get cunumeric array, record + # start_index + if start_index == -1: + start_index = dim if k.dtype == np.bool: # in case of the mixed indises we all nonzero # for the bool array @@ -480,13 +505,20 @@ def _create_indexing_array(self, key): task.add_input(self.base) task.add_input(key.base) task.add_alignment(self.base, key.base) + task.add_broadcast( + self.base, axes=tuple(range(1, len(self.shape))) + ) + task.add_broadcast( + key.base, axes=tuple(range(1, len(key.shape))) + ) task.execute() return False, store, out - # IRINA fixme: replace `nonzero` case with the task with - # output regions when ND output regions are available - tuple_of_arrays = key.nonzero() + else: + # IRINA fixme: replace `nonzero` case with the task with + # output regions when ND output regions are available + tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - output_arr = self._zip_indices((key,)) + output_arr = self._zip_indices(start_index, (key,)) return True, store, output_arr else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -498,7 +530,7 @@ def _create_indexing_array(self, key): len(tuple_of_arrays) < self.ndim > 1 ): - output_arr = self._zip_indices(tuple_of_arrays) + output_arr = self._zip_indices(start_index, tuple_of_arrays) return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: return True, store, tuple_of_arrays[0] diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 8b446da5f..8696694d8 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -33,6 +33,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { if (index_arrays.size() == N) { @@ -53,12 +54,14 @@ struct ZipImplBody { for (size_t idx = 0; idx < volume; ++idx) { auto p = 
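The start_index plumbing can be read as a coordinate mapping: output point coordinates before start_index pass through unchanged, the next key_dim coordinates select values from the index arrays, and the remaining ones are shifted past the key (index-array) dimensions. A NumPy model for a[:, idx, :] (illustrative only):

```python
import numpy as np

z = np.arange(24).reshape(2, 3, 4)
idx = np.array([1, 0])           # one 1-d index array => key_dim = 1
start, key_dim = 1, idx.ndim     # the index array sits at dimension 1

out_shape = z.shape[:start] + idx.shape + z.shape[start + 1:]
out = np.empty(out_shape, dtype=z.dtype)
for p in np.ndindex(out_shape):
    key = p[start:start + key_dim]                   # coordinates into idx
    pt = p[:start] + (idx[key],) + p[start + key_dim:]
    out[p] = z[pt]

assert np.array_equal(out, z[:, idx, :])
```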
pitches.unflatten(idx, rect.lo); Legion::Point new_point; - for (size_t i = 0; i < index_arrays.size(); i++) new_point[i] = index_arrays[i][p]; - for (size_t i = index_arrays.size(); i < N; i++) { + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < index_arrays.size(); i++) { + new_point[start_index + i] = index_arrays[i][p]; + } + for (size_t i = (start_index + index_arrays.size()); i < N; i++) { int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); new_point[i] = p[j]; } - std::cout << "IRINA DEBUG" << new_point << std::endl; out[p] = new_point; } } diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index bbfdbbb07..28d97ed2f 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -53,18 +53,24 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) zip_kernel(const AccessorWO, DIM> out, - const AccessorRO index_array, + const DeferredBuffer, 1> index_arrays, const Rect rect, const Pitches pitches, int volume, - const int64_t key_dim) + const int64_t key_dim, + const int64_t start_index, + int num_arrays) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = index_array[p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < num_arrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } + for (size_t i = (start_index + num_arrays); i < N; i++) { + int64_t j = key_dim + i - 1 - (num_arrays); + new_point[i] = p[j]; + } out[p] = new_point; } @@ -79,6 +85,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { const size_t volume = rect.volume(); @@ -100,8 +107,12 @@ struct ZipImplBody { out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } } else if (index_arrays.size() == 1) { - zip_kernel - <<>>(out, index_arrays[0], rect, pitches, volume, key_dim); + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + int num_arrays = index_arrays.size(); + zip_kernel<<>>( + out, idx_arr, rect, pitches, num_arrays, key_dim, start_index, num_arrays); } } }; diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index bedad8a7a..cd6100cc8 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -25,6 +25,7 @@ struct ZipArgs { const std::vector& inputs; const int64_t N; const int64_t key_dim; + const int64_t start_index; }; class ZipTask : public CuNumericTask { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 30a51a48c..0848a0f83 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -33,6 +33,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { const size_t volume = rect.volume(); @@ -55,9 +56,17 @@ struct ZipImplBody { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = index_arrays[0][p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + std::cout << "IRINA DEBUG 2" << 
std::endl; + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < index_arrays.size(); i++) { + new_point[start_index + i] = index_arrays[i][p]; + } + for (size_t i = (start_index + index_arrays.size()); i < N; i++) { + int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + new_point[i] = p[j]; + } out[p] = new_point; + std::cout << "IRINA DEBUG 3 " << out[p] << std::endl; } } } diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index e1c99771e..b9729949c 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -34,9 +34,11 @@ struct ZipImpl { auto out = args.out.write_accessor, DIM>(out_rect); auto index_rect = args.inputs[0].shape(); Pitches pitches; - size_t volume = pitches.flatten(index_rect); + size_t volume = pitches.flatten(out_rect); if (volume == 0) return; + std::cout << "IRINA DEBUG out rect = " << out_rect << ", index rect = " << index_rect + << std::endl; #ifdef CUNUMERIC_DEBUG assert(out_rect == index_rect) #endif @@ -56,17 +58,24 @@ struct ZipImpl { #ifdef LEGION_BOUNDS_CHECKS bool dense = false; #endif - ZipImplBody()( - out, index_arrays, index_rect, pitches, dense, args.key_dim, std::make_index_sequence()); + ZipImplBody()(out, + index_arrays, + index_rect, + pitches, + dense, + args.key_dim, + args.start_index, + std::make_index_sequence()); } }; template static void zip_template(TaskContext& context) { - int64_t N = context.scalars()[0].value(); - int64_t key_dim = context.scalars()[1].value(); - ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim}; + int64_t N = context.scalars()[0].value(); + int64_t key_dim = context.scalars()[1].value(); + int64_t start_index = context.scalars()[2].value(); + ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim, start_index}; double_dispatch(args.inputs[0].dim(), N, ZipImpl{}, args); } diff --git a/tests/index_routines.py b/tests/index_routines.py index bc2a374d2..90d4a8f1b 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -57,12 +57,22 @@ def advanced_indexing(): index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) - # mismatch dimesion case integers: + # mismatch dimesion case: print("advanced indexing test 4") indx = np.array([1, 1]) indx_num = num.array(indx) res = z[indx] res_num = z_num[indx_num] + print(res) + print(res_num) + assert np.array_equal(res, res_num) + + res = z[:, :, indx] + res_num = z_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = z[:, indx, :] + res_num = z_num[:, indx_num, :] assert np.array_equal(res, res_num) # 2d: @@ -72,7 +82,11 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) - # 2 arrays passed do 3d array + res = z[:, indx] + res_num = z_num[:, indx_num] + assert np.array_equal(res, res_num) + + # 2 arrays passed to 3d array indx0 = np.array([1, 1]) indx1 = np.array([1, 0]) indx0_num = num.array(indx0) @@ -81,6 +95,10 @@ def advanced_indexing(): res_num = z_num[indx0_num, indx1_num] assert np.array_equal(res, res_num) + res = z[:, indx0, indx1] + res_num = z_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + # 2 arrays with broadcasting indx0 = np.array([1, 1]) indx1 = np.array([[1, 0], [1, 0]]) @@ -91,18 +109,11 @@ def advanced_indexing(): assert np.array_equal(res, res_num) # mismatch dimesion case bool: - # print ("advance indexing test 4") - # indx_bool = np.array([True, False]) - # indx_bool_num = num.array(indx_bool) - # 
res = z[indx_bool] - # print("IRINA DEBUG") - # assert np.array_equal(indx_bool.nonzero(), indx_bool_num.nonzero()) - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # res_num = z_num[indx_bool_num] - # print(res_num) - # assert np.array_equal(res, res_num) + indx_bool = np.array([True, False]) + indx_bool_num = num.array(indx_bool) + res = z[indx_bool] + res_num = z_num[indx_bool_num] + assert np.array_equal(res, res_num) # test for bool array of the same dimension print("advanced indexing test 5") @@ -147,9 +158,6 @@ def advanced_indexing(): indx1_num = num.array(indx1) indx2_num = num.array(indx2) - # indx0_num._thunk._zip_indices((indx0_num._thunk, - # indx1_num._thunk, indx2_num._thunk,)) - res = z_num[indx0_num, indx1_num, indx2_num] res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) @@ -173,10 +181,6 @@ def advanced_indexing(): ind0_num = num.array(ind0) res = z[ind0, :, -1] res_num = z_num[ind0_num, :, -1] - # res = z[ind0,-1] - # print(res) - # res_num = z_num[ind0,-1] - # print(res_num) assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing From b87c51b8f436bf42ba48f741f7cc2c4889cb2f21 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 18 Mar 2022 13:36:00 -0700 Subject: [PATCH 10/33] some clean-up --- cunumeric/deferred.py | 45 +++++++++----------- src/cunumeric/index/advanced_indexing_omp.cc | 5 ++- src/cunumeric/index/zip.cu | 18 ++++---- src/cunumeric/index/zip_omp.cc | 6 +-- src/cunumeric/index/zip_template.inl | 2 - tests/index_routines.py | 26 ++++++++++- 6 files changed, 58 insertions(+), 44 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 8a37b6803..5904001af 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -310,46 +310,33 @@ def broadcast_shapes(self, shapes): return np.broadcast(*arrays).shape def _zip_indices(self, start_index, arrays): - if not isinstance(arrays, tuple): - raise TypeError("zip_indices expect tuple of arrays") + raise TypeError("zip_indices expects tuple of arrays") + # start_index is the index from witch indices arrays are passed + # for example of arr[:, indx, :], start_index =1 if start_index == -1: start_index = 0 + arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") + # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) if len(arrays) > 1: b_shape = self.broadcast_shapes(shapes) else: b_shape = arrays[0].shape + + # key dim - dimension of indices arrays key_dim = len(b_shape) out_shape = b_shape - if len(arrays) == 1: - # special case when a single index array is passed and it's dim < - # self.ndims - out_shape = ( - tuple(self.shape[i] for i in range(0, start_index)) - + b_shape - + tuple( - self.shape[i] for i in range(start_index + 1, self.ndim) - ) - ) - array = arrays[0].base - start = key_dim - 1 - new_arrays = tuple() - for i in range(0, start_index): - array = array.promote(i, self.shape[i]) - for i in range(start_index + 1, self.ndim): - array = array.promote(start + i, self.shape[i]) - if array.shape != out_shape: - raise ValueError("Wrong shape calculation") - new_arrays += (array,) - elif len(arrays) < self.ndim: + if len(arrays) < self.ndim: + # the case when # of arrays passed is smaller than dimension of + # the input array N = len(arrays) # broadcast shapes new_arrays = 
tuple() @@ -372,7 +359,7 @@ def _zip_indices(self, start_index, arrays): ) ) new_arrays = tuple() - start = key_dim - 1 + # promote all index arrays to have the same shape as output for a in arrays: for i in range(0, start_index): a = a.promote(i, self.shape[i]) @@ -382,6 +369,10 @@ def _zip_indices(self, start_index, arrays): arrays = new_arrays else: + # the use case when # of arrays passed is equal to the dimension + # of the input array + if len(arrays) > self.ndim: + raise ValueError("wrong number of index arrays passed") new_arrays = tuple() for a in arrays: if data_type != a.dtype: @@ -394,6 +385,7 @@ def _zip_indices(self, start_index, arrays): a = a.base new_arrays = new_arrays + (a,) arrays = new_arrays + # create output array which will store Point field where # N is number of index arrays # shape of the output array should be the same as the shape of each @@ -487,6 +479,7 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) + # the use case when index array ndim >1 and input array ndim ==1 if key.ndim > store.ndim: if store.ndim != 1: raise ValueError("Advance indexing dimention mismatch") @@ -514,7 +507,7 @@ def _create_indexing_array(self, key): task.execute() return False, store, out else: - # IRINA fixme: replace `nonzero` case with the task with + # FIXME: replace `nonzero` case with the task with # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: @@ -527,7 +520,7 @@ def _create_indexing_array(self, key): raise TypeError("Advanced indexing dimension mismatch") if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or ( - len(tuple_of_arrays) < self.ndim > 1 + len(tuple_of_arrays) < self.ndim and self.ndim > 1 ): output_arr = self._zip_indices(start_index, tuple_of_arrays) diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 5128ac75d..192b898b1 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -63,7 +63,10 @@ struct AdvancedIndexingImplBody { offsets[0] = 0; for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; } - out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + + Memory::Kind kind = + CuNumeric::has_numamem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; + out = create_buffer(size, kind); #pragma omp parallel { diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 28d97ed2f..88b999776 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -56,19 +56,19 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const DeferredBuffer, 1> index_arrays, const Rect rect, const Pitches pitches, + int narrays, int volume, - const int64_t key_dim, - const int64_t start_index, - int num_arrays) + int key_dim, + int start_index) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } - for (size_t i = 0; i < num_arrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } - for (size_t i = (start_index + num_arrays); i < N; i++) { - int64_t j = key_dim + i - 1 - (num_arrays); + for (size_t i = 0; i < narrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } + for (size_t i = (start_index + narrays); i < N; i++) { + int64_t j = key_dim + i - 1 - (narrays - 1); new_point[i] = p[j]; } out[p] = new_point; @@ -90,7 +90,7 @@ struct ZipImplBody { { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { if (dense) { DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); @@ -106,13 +106,13 @@ struct ZipImplBody { zip_kernel<<>>( out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; int num_arrays = index_arrays.size(); zip_kernel<<>>( - out, idx_arr, rect, pitches, num_arrays, key_dim, start_index, num_arrays); + out, idx_arr, rect, pitches, num_arrays, volume, key_dim, start_index); } } }; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 0848a0f83..4547f64d1 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -37,7 +37,7 @@ struct ZipImplBody { std::index_sequence) const { const size_t volume = rect.volume(); - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { if (dense) { auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) @@ -51,12 +51,11 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } // else - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - std::cout << "IRINA DEBUG 2" << std::endl; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < index_arrays.size(); i++) { new_point[start_index + i] = index_arrays[i][p]; @@ -66,7 +65,6 @@ struct ZipImplBody { new_point[i] = p[j]; } out[p] = new_point; - std::cout << "IRINA DEBUG 3 " << out[p] << std::endl; } } } diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index b9729949c..79476e1e2 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,8 +37,6 @@ struct ZipImpl { size_t 
volume = pitches.flatten(out_rect);
 if (volume == 0) return;
 
- std::cout << "IRINA DEBUG out rect = " << out_rect << ", index rect = " << index_rect
- << std::endl;
 #ifdef CUNUMERIC_DEBUG
 assert(out_rect == index_rect)
 #endif
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 90d4a8f1b..1e2669659 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -63,8 +63,6 @@ def advanced_indexing():
 indx_num = num.array(indx)
 res = z[indx]
 res_num = z_num[indx_num]
- print(res)
- print(res_num)
 assert np.array_equal(res, res_num)
 
 res = z[:, :, indx]
@@ -147,9 +145,6 @@ def advanced_indexing():
 indx1_num = num.array(indx1)
 indx2_num = num.array(indx2)
 
- # indx0_num._thunk._zip_indices((indx0_num._thunk,
- # indx1_num._thunk, indx2_num._thunk,))
-
 res = z_num[indx0_num, indx1_num, indx2_num]
 res_np = z[indx0, indx1, indx2]
 assert np.array_equal(res, res_np)
@@ -173,10 +168,6 @@ def advanced_indexing():
 ind0_num = num.array(ind0)
 res = z[ind0, :, -1]
 res_num = z_num[ind0_num, :, -1]
- # res = z[ind0,-1]
- # print(res)
- # res_num = z_num[ind0,-1]
- # print(res_num)
 assert np.array_equal(res, res_num)
 
 # In-Place & Augmented Assignments via Advanced Indexing
@@ -217,6 +215,30 @@ def advanced_indexing():
 # print(x)
 # x_num[indx0_num, indx1_num] =0.0
 
+ # we stay below LEGATE_MAX_DIM because the dimension will be increased by
+ # 1 when passing a 2d index array
+ for ndim in range(2, LEGATE_MAX_DIM):
+ a_shape = tuple(random.randint(2, 9) for i in range(ndim))
+ np_array = mk_seq_array(np, a_shape)
+ num_array = mk_seq_array(num, a_shape)
+ # check when N of index arrays == N of dims
+ num_tuple_of_indices = tuple()
+ np_tuple_of_indices = tuple()
+ for i in range(ndim):
+ i_shape = (2, 4)
+ idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i]
+ idx_arr_num = num.array(idx_arr_np)
+ np_tuple_of_indices += (idx_arr_np,)
+ num_tuple_of_indices += (idx_arr_num,)
+ assert np.array_equal(
+ np_array[np_tuple_of_indices], num_array[num_tuple_of_indices]
+ )
+ # check when N of index arrays < N of dims (a single index array)
+ i_shape = (2, 2)
+ idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0]
+ idx_arr_num = num.array(idx_arr_np)
+ assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num])
+
 return
 

From d06e03d48e974871763e355db507c905252396e1 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 24 Mar 2022 09:32:07 -0700
Subject: [PATCH 11/33] adding support for some corner cases in advanced
 indexing

---
 cunumeric/deferred.py | 84 +++++++++++++++++++++-------
 src/cunumeric/index/zip.cc | 4 +-
 src/cunumeric/index/zip_omp.cc | 4 +-
 src/cunumeric/index/zip_template.inl | 7 +--
 tests/index_routines.py | 59 +++++++++++++++++++
 5 files changed, 133 insertions(+), 25 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 5904001af..364d7a92a 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -415,7 +415,7 @@ def _zip_indices(self, start_index, arrays):
 task.add_scalar_arg(start_index, ty.int64) # start_index
 for a in arrays:
 task.add_input(a)
- task.add_alignment(a, output_arr.base)
+ task.add_alignment(output_arr.base, a)
 task.add_broadcast(a, axes=tuple(range(1, len(out_shape))))
 task.add_broadcast(
 output_arr.base, axes=tuple(range(1, len(out_shape)))
@@ -432,40 +432,59 @@ def _zip_indices(self, start_index, arrays):
 return output_arr
 
 def _create_indexing_array(self, key):
 store = self.base
+ # the index where the first index_array is passed to the [] operator
 start_index = -1
 if isinstance(key, tuple):
+ key = self._unpack_ellipsis(key, self.ndim)
 shift = 0
+ last_index = self.ndim
+ # in case when index arrays are passed in a scattered way,
+ # we need to transpose the original array so all index arrays
+ # are close to each other
+ transpose_needed = False
+ transpose_indices = tuple()
+ # since we can't call Copy operation on transformed Store, after
+ # the transformation, we need to return a copy
+ copy_needed = False
 tuple_of_arrays = ()
 
 for dim, k in enumerate(key):
 if np.isscalar(k):
 if k < 0:
 k += store.shape[dim + shift]
 store = store.project(dim 
+ shift, k) - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - self = store_copy - store = store_copy.base shift -= 1 + copy_needed = True + last_index = dim + shift + elif k is np.newaxis: + store = store.promote(dim + shift, 1) + copy_needed = True elif isinstance(k, slice): - store = store + store = store.slice(dim + shift, k) + if k != slice(None): + copy_needed = True elif isinstance(k, NumPyThunk): # the very first time we get cunumeric array, record # start_index if start_index == -1: - start_index = dim + start_index = dim + shift + if (start_index - last_index) > 1: + transpose_needed = True + last_index = dim + shift + transpose_indices += (dim + shift,) + else: + transpose_needed = transpose_needed or ( + (dim + shift - last_index) > 1 + ) + transpose_indices += (dim + shift,) + last_index = dim + shift if k.dtype == np.bool: + if k.shape[0] != self.shape[dim]: + raise ValueError( + "boolean index did not match " + "indexed array along dimension " + ) # in case of the mixed indises we all nonzero # for the bool array k = k.nonzero() @@ -477,6 +496,33 @@ def _create_indexing_array(self, key): "Unsupported entry type passed to advanced", "indexing operation", ) + # if len(tuple_of_arrays) == 1: + # transpose_needed = False + if transpose_needed: + copy_needed = True + start_index = 0 + post_indices = tuple( + i for i in range(store.ndim) if i not in transpose_indices + ) + transpose_indices += post_indices + store = store.transpose(transpose_indices) + if copy_needed: + # after store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + self = store_copy + store = store_copy.base else: assert isinstance(key, NumPyThunk) # the use case when index array ndim >1 and input array ndim ==1 diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 8696694d8..b167d2f62 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -39,9 +39,11 @@ struct ZipImplBody { if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { + std::vector indx_ptrs; + for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } } else { for (size_t idx = 0; idx < volume; ++idx) { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 4547f64d1..9276c3450 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -39,10 +39,12 @@ struct ZipImplBody { const size_t volume = rect.volume(); if (index_arrays.size() == N) { if (dense) { + std::vector indx_ptrs; + for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } } else { #pragma omp parallel for 
schedule(static) diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index 79476e1e2..d4b34a787 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -43,6 +43,8 @@ struct ZipImpl { #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); +#else + bool dense = false; #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { @@ -53,12 +55,9 @@ struct ZipImpl { dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); } -#ifdef LEGION_BOUNDS_CHECKS - bool dense = false; -#endif ZipImplBody()(out, index_arrays, - index_rect, + out_rect, pitches, dense, args.key_dim, diff --git a/tests/index_routines.py b/tests/index_routines.py index 1e2669659..aa1275534 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -97,6 +97,21 @@ def advanced_indexing(): res_num = z_num[:, indx0_num, indx1_num] assert np.array_equal(res, res_num) + # 2 index arrays passed in a sparse way: + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[:, [0, 1], :, [0, 1]] + res_num = x_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = x[[0, 1], :, [0, 1], 1:] + res_num = x_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + # 2 arrays with broadcasting indx0 = np.array([1, 1]) indx1 = np.array([[1, 0], [1, 0]]) @@ -181,6 +196,32 @@ def advanced_indexing(): res_num = z_num[ind0_num, :, -1] assert np.array_equal(res, res_num) + res = z[ind0, :, [False, True, False, True]] + res_num = z_num[ind0_num, :, [False, True, False, True]] + assert np.array_equal(res, res_num) + + res = z[ind0, :, ind0] + res_num = z_num[ind0_num, :, ind0_num] + assert np.array_equal(res, res_num) + + res = z[ind0, :, 1:3] + res_num = z_num[ind0_num, :, 1:3] + assert np.array_equal(res, res_num) + + res = z[1, :, ind0] + res_num = z_num[1, :, ind0_num] + assert np.array_equal(res, res_num) + + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[[0, 1], [0, 1], :, 2] + res_num = x_num[[0, 1], [0, 1], :, 2] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], 2] + res_num = x_num[..., [0, 1], 2] + assert np.array_equal(res, res_num) + # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case # y = np.array([0, -1, -2, -3, -4, -5]) @@ -238,6 +279,24 @@ def advanced_indexing(): idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] idx_arr_num = num.array(idx_arr_np) assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal( + np_array[:, idx_arr_np], num_array[:, idx_arr_num] + ) + if ndim > 2: + assert np.array_equal( + np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] + ) + assert np.array_equal( + np_array[:, idx_arr_np, idx_arr_np], + num_array[:, idx_arr_num, idx_arr_num], + ) + if ndim > 3: + assert np.array_equal( + np_array[:, idx_arr_np, :, idx_arr_np], + num_array[:, idx_arr_num, :, idx_arr_num], + ) return From 7672be165d40d5b005c45579ca74026073033dea Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 28 Mar 2022 13:08:11 -0700 Subject: [PATCH 12/33] adding support for advanced indexing in-place assignment with views to original data --- cunumeric/deferred.py | 99 ++++++++++++------- 
src/cunumeric/index/advanced_indexing.cc | 59 ++++++++---
src/cunumeric/index/advanced_indexing.cu | 33 ++++++-
src/cunumeric/index/advanced_indexing.h | 1 +
src/cunumeric/index/advanced_indexing_omp.cc | 62 +++++++++---
.../index/advanced_indexing_template.inl | 27 ++++-
tests/index_routines.py | 85 ++++++++++------
7 files changed, 269 insertions(+), 97 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 364d7a92a..526292887 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -431,8 +431,9 @@ def _zip_indices(self, start_index, arrays):
return output_arr

- def _create_indexing_array(self, key):
+ def _create_indexing_array(self, key, is_set=False):
store = self.base
+ rhs = self
# the index where the first index_array is passed to the [] operator
start_index = -1
if isinstance(key, tuple):
@@ -496,8 +497,6 @@ def _create_indexing_array(self, key):
"Unsupported entry type passed to advanced",
"indexing operation",
)
- # if len(tuple_of_arrays) == 1:
- # transpose_needed = False
if transpose_needed:
copy_needed = True
start_index = 0
@@ -521,7 +520,7 @@ def _create_indexing_array(self, key):
inputs=[store_to_copy],
)
store_copy.copy(store_to_copy, deep=True)
- self = store_copy
+ rhs = store_copy
store = store_copy.base
else:
assert isinstance(key, NumPyThunk)
@@ -536,13 +535,19 @@ def _create_indexing_array(self, key):
# Handle the boolean array case
if key.dtype == np.bool:
if key.shape == self.shape:
- out = self.runtime.create_unbound_thunk(self.dtype)
+ out_dtype = self.dtype
+ if is_set:
+ N = self.ndim
+ out_dtype = self.runtime.add_point_type(N)
+
+ out = self.runtime.create_unbound_thunk(out_dtype)
task = self.context.create_task(
CuNumericOpCode.ADVANCED_INDX
)
task.add_output(out.base)
task.add_input(self.base)
task.add_input(key.base)
+ task.add_scalar_arg(is_set, bool)
task.add_alignment(self.base, key.base)
task.add_broadcast(
self.base, axes=tuple(range(1, len(self.shape)))
@@ -562,16 +567,13 @@ def _create_indexing_array(self, key):
else:
tuple_of_arrays = (self.runtime.to_deferred_array(key),)
- if len(tuple_of_arrays) > self.ndim:
+ if len(tuple_of_arrays) > rhs.ndim:
raise TypeError("Advanced indexing dimension mismatch")
- if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or (
- len(tuple_of_arrays) < self.ndim and self.ndim > 1
- ):
-
- output_arr = self._zip_indices(start_index, tuple_of_arrays)
+ if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
+ output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
return True, store, output_arr
- elif len(tuple_of_arrays) == 1 and self.ndim == 1:
+ elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
return True, store, tuple_of_arrays[0]
else:
raise ValueError("Advanced indexing dimension mismatch")
@@ -676,35 +678,66 @@ def set_item(self, key, rhs):
assert self.dtype == rhs.dtype
# Check to see if this is advanced indexing or not
if self._is_advanced_indexing(key):
+ view_copy = False
# Create the indexing array
- store, index_array = self._create_indexing_array(key)
- # if index_array.shape != rhs.shape:
- # raise ValueError(
- # "Advanced indexing array does not match source shape"
- # )
- # if self.ndim != index_array.ndim:
- # raise NotImplementedError(
- # "need support for indirect partitioning"
- # )
+ copy_needed, store, index_array = self._create_indexing_array(
+ key, True
+ )
+ if copy_needed:
+ if self.base.transform.bottom:
+ lhs = self
+ else:
+ # if store is transformed we need to return a copy of
+ # the store since Copy operation can't be done on
+ # the 
store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + + lhs = store_copy + view_copy = True + else: + lhs = self + view_copy = False + if rhs.ndim == 0: - shape = store.shape - val = rhs - rhs = self.runtime.create_empty_thunk( - shape, + rhs_tmp = self.runtime.create_empty_thunk( + index_array.base.shape, self.dtype, - inputs=[self], + inputs=[index_array], ) - rhs.fill(val) - copy = self.context.create_copy() + task = self.context.create_task(CuNumericOpCode.FILL) + task.add_output(rhs_tmp.base) + task.add_input(rhs.base) + task.add_scalar_arg(False, bool) + task.execute() + rhs = rhs_tmp.base + else: + if rhs.shape != index_array.shape: + rhs = rhs._broadcast(index_array.base.shape) + else: + rhs = rhs.base - copy.add_input(store) + copy = self.context.create_copy() + copy.add_input(rhs) copy.add_target_indirect(index_array.base) - copy.add_output(self.base) - - # copy.add_alignment(index_array.base, rhs.base) - + copy.add_output(lhs.base) copy.execute() + if view_copy: + print("IRINA DEBUG", self.shape, lhs.shape) + print(self.base.transform.bottom) + print(self) + self.copy(lhs, deep=True) + else: view = self._get_view(key) diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc index 6b9224338..74882e0e3 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -22,11 +22,52 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; - size_t operator()(Buffer& out, + void compute_output(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume) const + { + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + } + + void compute_output(Buffer>& out, + const AccessorRO&, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume) const + { + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = p_input; + out_idx++; + } + } + } + + template + size_t operator()(Buffer& out, const AccessorRO& input, const AccessorRO& index, const Pitches& pitches_input, @@ -45,17 +86,9 @@ struct AdvancedIndexingImplBody { if (index[p] == true) { size++; } } - out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); - int64_t out_idx = 0; - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); - if (index[p] == true) { - out[out_idx] = input[p_input]; - out_idx++; - } - } + compute_output(out, input, index, pitches_input, rect_input, pitches_index, rect_index, volume); return size; } }; diff --git 
a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index f818579ed..fdce0f2e1 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -71,8 +71,32 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) out[offset] = in[point_input]; } } -template -struct AdvancedIndexingImplBody { + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + advanced_indexing_kernel(size_t volume, + AccessorRO in, + AccessorRO index, + Buffer> out, + Pitches pitches_input, + Point origin_input, + Pitches pitches_index, + Point origin_index, + Buffer offsets) +{ + // FIXME works only when DIM1==DIM2 + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= volume) return; + auto point = pitches_index.unflatten(tid, origin_index); + auto point_input = pitches_input.unflatten(tid, origin_input); + if (index[point] == true) { + int64_t offset = offsets[tid]; + out[offset] = point_input; + } +} + +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; int64_t compute_size(const AccessorRO& in, @@ -103,7 +127,8 @@ struct AdvancedIndexingImplBody { return size.read(); } - size_t operator()(Buffer& out, + template + size_t operator()(Buffer& out, const AccessorRO& input, const AccessorRO& index, const Pitches& pitches_input, @@ -123,7 +148,7 @@ struct AdvancedIndexingImplBody { auto offsets = create_buffer(volume, Memory::Kind::GPU_FB_MEM); size = compute_size(index, pitches_index, rect_index, volume, stream, offsets); - out = create_buffer(size, Memory::Kind::GPU_FB_MEM); + out = create_buffer(size, Memory::Kind::GPU_FB_MEM); // populate output if (size > 0) { const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; diff --git a/src/cunumeric/index/advanced_indexing.h b/src/cunumeric/index/advanced_indexing.h index ec0c92681..e375d2a72 100644 --- a/src/cunumeric/index/advanced_indexing.h +++ b/src/cunumeric/index/advanced_indexing.h @@ -24,6 +24,7 @@ struct AdvancedIndexingArgs { Array& output; const Array& input_array; const Array& indexing_array; + const bool is_set; }; class AdvancedIndexingTask : public CuNumericTask { diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 192b898b1..0568b3fd1 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -24,11 +24,54 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; - size_t operator()(Buffer& out, + void compute_output(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume, + int64_t out_idx) const + { +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + } + + void compute_output(Buffer>& out, + const AccessorRO&, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume, + int64_t out_idx) const + { +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; 
++idx) {
+ auto p = pitches_index.unflatten(idx, rect_index.lo);
+ auto p_input = pitches_input.unflatten(idx, rect_input.lo);
+ if (index[p] == true) {
+ out[out_idx] = p_input;
+ out_idx++;
+ }
+ }
+ }
+
+ template
+ size_t operator()(Buffer& out,
+ const AccessorRO& input,
+ const AccessorRO& index,
+ const Pitches& pitches_input,
@@ -66,21 +109,14 @@ struct AdvancedIndexingImplBody {
Memory::Kind kind =
CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM;
- out = create_buffer(size, kind);
+ out = create_buffer(size, kind);
#pragma omp parallel
{
const int tid = omp_get_thread_num();
int64_t out_idx = offsets[tid];
-#pragma omp for schedule(static)
- for (size_t idx = 0; idx < volume; ++idx) {
- auto point = pitches_index.unflatten(idx, rect_index.lo);
- auto point_input = pitches_input.unflatten(idx, rect_input.lo);
- if (index[point] == true) {
- out[out_idx] = input[point_input];
- ++out_idx;
- }
- }
+ compute_output(
+ out, input, index, pitches_input, rect_input, pitches_index, rect_index, volume, out_idx);
}
return size;
diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index ed88ac996..d4869e07b 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -21,7 +21,7 @@ namespace cunumeric {
using namespace Legion;
using namespace legate;
-template
+template
struct AdvancedIndexingImplBody;
template
@@ -34,6 +34,7 @@ struct AdvancedIndexingImpl {
auto input_arr = args.input_array.read_accessor(input_rect);
Pitches input_pitches;
Buffer output_arr;
+ Buffer> output_arr_set;
size_t volume1 = input_pitches.flatten(input_rect);
auto index_rect = args.indexing_array.shape();
@@ -49,13 +50,27 @@ struct AdvancedIndexingImpl {
int64_t size = 0;
if (DIM1 == DIM2) {
- size = AdvancedIndexingImplBody{}(
- output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect);
+ if (args.is_set) {
+ size = AdvancedIndexingImplBody{}(output_arr_set,
+ input_arr,
+ index_arr,
+ input_pitches,
+ input_rect,
+ index_pitches,
+ index_rect);
+ } else {
+ size = AdvancedIndexingImplBody{}(
+ output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect);
+ }
} else {
// should never go here, not implemented
assert(false);
}
- args.output.return_data(output_arr, size);
+ if (args.is_set) {
+ args.output.return_data(output_arr_set, size);
+ } else {
+ args.output.return_data(output_arr, size);
+ }
}
};
@@ -71,7 +86,9 @@ struct AdvancedIndexingHelper {
template
static void advanced_indexing_template(TaskContext& context)
{
- AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1]};
+ // is_set flag is used to fill Point field for in-place assignment operation
+ bool is_set = context.scalars()[0].value();
+ AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1], is_set};
double_dispatch(
args.input_array.dim(), args.input_array.code(), AdvancedIndexingHelper{}, args);
}
diff --git a/tests/index_routines.py b/tests/index_routines.py
index aa1275534..3ba1f2fa1 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -134,6 +134,12 @@ def advanced_indexing():
index_num = num.array(index)
assert np.array_equal(y[index], y_num[index_num])
+ # test in-place assignment for the case when idx arr
+ # is a 1d bool array:
+ y[index] = 3
+ y_num[index_num] = 3
+ assert np.array_equal(y, y_num)
+
# test for bool array of the same dimension 2D 
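# a boolean mask with the same shape as the indexed array behaves as
# in NumPy: it selects the positions where the mask is True and
# flattens them to 1-D, so z[mask].shape == (int(mask.sum()),)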
print("advanced indexing test 6") indx_bool = np.array( @@ -155,6 +161,12 @@ def advanced_indexing(): res_num = z_num[indx_bool_num] assert np.array_equal(res, res_num) + # test in-place assignment fir the case when idx arr + # is 2d bool array: + z[indx_bool] = 1 + z_num[indx_bool] = 1 + assert np.array_equal(z, z_num) + # test mixed data print("advanced indexing test 7") res = z[:, -1] @@ -175,6 +187,12 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) + # test in-place assignment fir the case when + # several index arrays passed + z_num[indx0_num, indx1_num, indx2_num] = -2 + z[indx0, indx1, indx2] = -2 + assert np.array_equal(z, z_num) + # indices with broadcast: print("advanced indexing test 9") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) @@ -224,37 +242,38 @@ def advanced_indexing(): # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case - # y = np.array([0, -1, -2, -3, -4, -5]) - # y_num = num.array(y) - # index = np.array([2, 4, 0, 4, 4, 4]) - # index_num = num.array(index) - # print (y[index]) - # print(y_num[index]) - # y[index] = 0 - # y_num[index_num] =0 - # print (y_num) + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + y[index] = 0 + y_num[index_num] = 0 + assert np.array_equal(y, y_num) + + y[index] = np.array([1, 2, 3, 4, 5, 6]) + y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) + print(y) + print(y_num) + # Order on which data is updated in case when indexing array points to the + # same daya in the original array is not guaranteed, so we can't call + # assert np.array_equal(y, y_num) here # 2D test - # x = np.array( - # [ - # [0.38, -0.16, 0.38, -0.41, -0.04], - # [-0.47, -0.01, -0.18, -0.5, -0.49], - # [0.02, 0.4, 0.33, 0.33, -0.13], - # ] - # ) - # indx0 = np.array([0, 1]) - # indx1 = np.array([1, 2]) - # x_num = num.array(x) - # indx0_num = num.array(indx0) - # indx1_num = num.array(indx1) - # print(x[indx0, indx1]) - # FIXME 0: - # print (x_num[indx0_num,indx1_num]) - # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) - # print (x_num[indx0_num, indx1_num]) - # x[indx0, indx1] = 0.0 - # print(x) - # x_num[indx0_num, indx1_num] =0.0 + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + indx0 = np.array([0, 1]) + indx1 = np.array([1, 2]) + x_num = num.array(x) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + x[indx0, indx1] = 2.0 + x_num[indx0_num, indx1_num] = 2.0 + assert np.array_equal(x, x_num) # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array @@ -279,11 +298,19 @@ def advanced_indexing(): idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] idx_arr_num = num.array(idx_arr_np) assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + # test in-place assignment + np_array[idx_arr_np] = 2 + num_array[idx_arr_num] = 2 + assert np.array_equal(num_array, np_array) idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) idx_arr_num = num.array(idx_arr_np) assert np.array_equal( np_array[:, idx_arr_np], num_array[:, idx_arr_num] ) + # test in-place assignment + np_array[:, idx_arr_np] = 3 + num_array[:, idx_arr_num] = 3 + assert np.array_equal(num_array, np_array) if ndim > 2: assert np.array_equal( np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] From d5e044687df16c1def17d400e1cddcfeb96015a2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko 
Date: Tue, 5 Apr 2022 12:57:51 -0600 Subject: [PATCH 13/33] registering all PointN types during the Runtime initialization --- cunumeric/config.py | 16 +++++++++++++++- cunumeric/deferred.py | 6 +++--- cunumeric/runtime.py | 28 +++++++++++++++++++--------- src/cunumeric/cunumeric_c.h | 12 ++++++++++++ 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/cunumeric/config.py b/cunumeric/config.py index 2ec560d50..f369bfb09 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -78,7 +78,7 @@ def destroy(self): # Match these to CuNumericOpCode in cunumeric_c.h @unique class CuNumericOpCode(IntEnum): - ADVANCED_INDX = _cunumeric.CUNUMERIC_ADVANCED_INDEXING + ADVANCED_INDEXING = _cunumeric.CUNUMERIC_ADVANCED_INDEXING ARANGE = _cunumeric.CUNUMERIC_ARANGE BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED @@ -244,3 +244,17 @@ class CuNumericTunable(IntEnum): NUM_PROCS = _cunumeric.CUNUMERIC_TUNABLE_NUM_PROCS MAX_EAGER_VOLUME = _cunumeric.CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM + + +# Match these to CuNumericTypeCOdes in cunumeric_c.h +@unique +class CuNumericTypeCodes(IntEnum): + CUNUMERIC_TYPE_POINT1 = _cunumeric.CUNUMERIC_TYPE_POINT1 + CUNUMERIC_TYPE_POINT2 = _cunumeric.CUNUMERIC_TYPE_POINT2 + CUNUMERIC_TYPE_POINT3 = _cunumeric.CUNUMERIC_TYPE_POINT3 + CUNUMERIC_TYPE_POINT4 = _cunumeric.CUNUMERIC_TYPE_POINT4 + CUNUMERIC_TYPE_POINT5 = _cunumeric.CUNUMERIC_TYPE_POINT5 + CUNUMERIC_TYPE_POINT6 = _cunumeric.CUNUMERIC_TYPE_POINT6 + CUNUMERIC_TYPE_POINT7 = _cunumeric.CUNUMERIC_TYPE_POINT7 + CUNUMERIC_TYPE_POINT8 = _cunumeric.CUNUMERIC_TYPE_POINT8 + CUNUMERIC_TYPE_POINT9 = _cunumeric.CUNUMERIC_TYPE_POINT9 diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 526292887..b57655715 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -398,7 +398,7 @@ def _zip_indices(self, start_index, arrays): # of that dtype, so long as we don't try to convert it to a # NumPy array. 
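# For example, for a 3-D input the store created below holds Point<3>
# values such as (i, j, k), one per element of the output shape; the
# copy operation later consumes these points as its indirection field.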
N = self.ndim
- pointN_dtype = self.runtime.add_point_type(N)
+ pointN_dtype = self.runtime.get_point_type(N)
store = self.context.create_store(
pointN_dtype, shape=out_shape, optimize_scalar=True
)
@@ -538,11 +538,11 @@ def _create_indexing_array(self, key, is_set=False):
out_dtype = self.dtype
if is_set:
N = self.ndim
- out_dtype = self.runtime.add_point_type(N)
+ out_dtype = self.runtime.get_point_type(N)

out = self.runtime.create_unbound_thunk(out_dtype)
task = self.context.create_task(
- CuNumericOpCode.ADVANCED_INDX
+ CuNumericOpCode.ADVANCED_INDEXING
)
task.add_output(out.base)
task.add_input(self.base)
diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py
index a6d840356..14e5c5d4e 100644
--- a/cunumeric/runtime.py
+++ b/cunumeric/runtime.py
@@ -27,6 +27,7 @@
CuNumericOpCode,
CuNumericRedopCode,
CuNumericTunable,
+ CuNumericTypeCodes,
cunumeric_context,
cunumeric_lib,
)
@@ -96,6 +97,24 @@ def _register_dtypes(self):
for numpy_type, core_type in _supported_dtypes.items():
type_system.make_alias(np.dtype(numpy_type), core_type)
+ for n in range(1, LEGATE_MAX_DIM + 1):
+ self._register_point_type(n)
+
+ def _register_point_type(self, n):
+ type_system = self.legate_context.type_system
+ point_type = "" + str(n)
+ if point_type not in type_system:
+ code = CuNumericTypeCodes.CUNUMERIC_TYPE_POINT1 + n - 1
+ size_in_bytes = 8 * n
+ type_system.add_type(point_type, size_in_bytes, code)
+
+ def get_point_type(self, n):
+ type_system = self.legate_context.type_system
+ point_type = "" + str(n)
+ if point_type not in type_system:
+ raise ValueError(f"there is no point type registered fro {n}")
+ return point_type
+
def _parse_command_args(self):
try:
# Prune it out so the application does not see it
@@ -175,15 +194,6 @@ def get_arg_dtype(self, value_dtype):
dtype.register_reduction_op(redop, redop_id)
return arg_dtype
- def add_point_type(self, n):
- type_system = self.legate_context.type_system
- point_type = "point" + str(n)
- if point_type not in type_system:
- code = type_system[ty.int64].code
- size_in_bytes = 8 * n
- type_system.add_type(point_type, size_in_bytes, code)
- return point_type
-
def _report_coverage(self):
total = len(self.api_calls)
implemented = sum(int(impl) for (_, _, impl) in self.api_calls)
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index 0e8106ff1..a73a69eb8 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -187,6 +187,18 @@ enum CuNumericBounds {
CUNUMERIC_MAX_TASKS = 1048576,
};
+enum CuNumericTypeCodes {
+ CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1,
+ CUNUMERIC_TYPE_POINT2,
+ CUNUMERIC_TYPE_POINT3,
+ CUNUMERIC_TYPE_POINT4,
+ CUNUMERIC_TYPE_POINT5,
+ CUNUMERIC_TYPE_POINT6,
+ CUNUMERIC_TYPE_POINT7,
+ CUNUMERIC_TYPE_POINT8,
+ CUNUMERIC_TYPE_POINT9,
+};
+
#ifdef __cplusplus
extern "C" {
#endif

From 6e1b4f17ffc70d25fb6f2e47cf26830457cee585 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 5 Apr 2022 14:22:51 -0600
Subject: [PATCH 14/33] fixing logic for transpose operation in advanced
 indexing

---
cunumeric/deferred.py | 57 +++++++++++++++++++++------------------
tests/index_routines.py | 8 ++++++
2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index b57655715..42676dfaa 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -449,15 +449,43 @@ def _create_indexing_array(self, key, is_set=False):
# the transformation, we need to return a copy
copy_needed = False
tuple_of_arrays = ()
+ index_map = []
+ # First, we need to check if transpose is needed
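+ # e.g. for x[ind0, :, ind1] the index arrays sit at dims 0 and 2;
+ # they have to be handled as adjacent dimensions, so the store is
+ # transposed with (0, 2, 1) before the indices are zipped together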
for dim, k in enumerate(key):
+ if np.isscalar(k) or isinstance(k, NumPyThunk):
+ if start_index == -1:
+ start_index = dim
+ transpose_indices += (dim,)
+ transpose_needed = transpose_needed or (
+ (dim - last_index) > 1
+ )
+ last_index = dim
+
+ if transpose_needed:
+ copy_needed = True
+ start_index = 0
+ post_indices = tuple(
+ i for i in range(store.ndim) if i not in transpose_indices
+ )
+ transpose_indices += post_indices
+ store = store.transpose(transpose_indices)
+ index_map = list(transpose_indices)
+ count = 0
+ for i in transpose_indices:
+ index_map[i] = count
+ count += 1
+ else:
+ index_map = tuple(range(len(key)))
+
+ for d, k in enumerate(key):
+ dim = index_map[d]
if np.isscalar(k):
if k < 0:
k += store.shape[dim + shift]
store = store.project(dim + shift, k)
shift -= 1
copy_needed = True
- last_index = dim + shift
elif k is np.newaxis:
store = store.promote(dim + shift, 1)
copy_needed = True
@@ -466,22 +494,8 @@ def _create_indexing_array(self, key, is_set=False):
if k != slice(None):
copy_needed = True
elif isinstance(k, NumPyThunk):
- # the very first time we get cunumeric array, record
- # start_index
- if start_index == -1:
- start_index = dim + shift
- if (start_index - last_index) > 1:
- transpose_needed = True
- last_index = dim + shift
- transpose_indices += (dim + shift,)
- else:
- transpose_needed = transpose_needed or (
- (dim + shift - last_index) > 1
- )
- transpose_indices += (dim + shift,)
- last_index = dim + shift
if k.dtype == np.bool:
- if k.shape[0] != self.shape[dim]:
+ if k.shape[0] != store.shape[dim]:
raise ValueError(
"boolean index did not match "
"indexed array along dimension "
)
@@ -497,14 +511,7 @@ def _create_indexing_array(self, key, is_set=False):
"Unsupported entry type passed to advanced",
"indexing operation",
)
- if transpose_needed:
- copy_needed = True
- start_index = 0
- post_indices = tuple(
- i for i in range(store.ndim) if i not in transpose_indices
- )
- transpose_indices += post_indices
- store = store.transpose(transpose_indices)
+
if copy_needed:
# after store is transformed we need to return a copy of
# the store since Copy operation can't be done on
# the store with transformation
@@ -562,7 +569,7 @@ def _create_indexing_array(self, key, is_set=False):
# output regions when ND output regions are available
tuple_of_arrays = key.nonzero()
elif key.ndim < store.ndim:
- output_arr = self._zip_indices(start_index, (key,))
+ output_arr = rhs._zip_indices(start_index, (key,))
return True, store, output_arr
else:
tuple_of_arrays = (self.runtime.to_deferred_array(key),)
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 3ba1f2fa1..4e4845fa9 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -240,6 +240,14 @@ def advanced_indexing():
res_num = x_num[..., [0, 1], 2]
assert np.array_equal(res, res_num)
+ res = x[:, [0, 1], :, -1]
+ res_num = x_num[:, [0, 1], :, -1]
+ assert np.array_equal(res, res_num)
+
+ res = x[:, [0, 1], :, 1:]
+ res_num = x_num[:, [0, 1], :, 1:]
+ assert np.array_equal(res, res_num)
+
# In-Place & Augmented Assignments via Advanced Indexing
# simple 1d case
y = np.array([0, -1, -2, -3, -4, -5])

From 76c1ae50e61716e299efa319a6cc0c36a6d78ee7 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 6 Apr 2022 11:20:57 -0600
Subject: [PATCH 15/33] fixing an issue when advanced indexing operation is
 performed on transformed store

---
cunumeric/deferred.py | 53 ++++++++++++++++-------------------
tests/index_routines.py | 10 ++++++++
2 files changed, 38 insertions(+), 25 
deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 42676dfaa..372443ad6 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -431,6 +431,20 @@ def _zip_indices(self, start_index, arrays):
return output_arr

+ def copy_store(self, store):
+ store_to_copy = DeferredArray(
+ self.runtime,
+ base=store,
+ dtype=self.dtype,
+ )
+ store_copy = self.runtime.create_empty_thunk(
+ store_to_copy.shape,
+ self.dtype,
+ inputs=[store_to_copy],
+ )
+ store_copy.copy(store_to_copy, deep=True)
+ return store_copy, store_copy.base
+
def _create_indexing_array(self, key, is_set=False):
store = self.base
rhs = self
@@ -511,26 +525,15 @@ def _create_indexing_array(self, key, is_set=False):
"Unsupported entry type passed to advanced",
"indexing operation",
)
-
- if copy_needed:
+ if copy_needed or (not store._transform.bottom):
# after store is transformed we need to return a copy of
# the store since Copy operation can't be done on
# the store with transformation
- store_to_copy = DeferredArray(
- self.runtime,
- base=store,
- dtype=self.dtype,
- )
- store_copy = self.runtime.create_empty_thunk(
- store_to_copy.shape,
- self.dtype,
- inputs=[store_to_copy],
- )
- store_copy.copy(store_to_copy, deep=True)
- rhs = store_copy
- store = store_copy.base
+ rhs, store = self.copy_store(store)
else:
assert isinstance(key, NumPyThunk)
+ if not store._transform.bottom:
+ rhs, store = self.copy_store(store)
# the use case when index array ndim > 1 and input array ndim == 1
if key.ndim > store.ndim:
if store.ndim != 1:
@@ -541,23 +544,23 @@ def _create_indexing_array(self, key, is_set=False):
# Handle the boolean array case
if key.dtype == np.bool:
- if key.shape == self.shape:
- out_dtype = self.dtype
+ if key.shape == rhs.shape:
+ out_dtype = rhs.dtype
if is_set:
- N = self.ndim
- out_dtype = self.runtime.get_point_type(N)
+ N = rhs.ndim
+ out_dtype = rhs.runtime.get_point_type(N)

- out = self.runtime.create_unbound_thunk(out_dtype)
- task = self.context.create_task(
+ out = rhs.runtime.create_unbound_thunk(out_dtype)
+ task = rhs.context.create_task(
CuNumericOpCode.ADVANCED_INDEXING
)
task.add_output(out.base)
- task.add_input(self.base)
+ task.add_input(rhs.base)
task.add_input(key.base)
task.add_scalar_arg(is_set, bool)
- task.add_alignment(self.base, key.base)
+ task.add_alignment(rhs.base, key.base)
task.add_broadcast(
- self.base, axes=tuple(range(1, len(self.shape)))
+ rhs.base, axes=tuple(range(1, len(rhs.shape)))
)
task.add_broadcast(
key.base, axes=tuple(range(1, len(key.shape)))
@@ -572,7 +575,7 @@ def _create_indexing_array(self, key, is_set=False):
output_arr = rhs._zip_indices(start_index, (key,))
return True, store, output_arr
else:
- tuple_of_arrays = (self.runtime.to_deferred_array(key),)
+ tuple_of_arrays = (rhs.runtime.to_deferred_array(key),)
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 4e4845fa9..3f187db84 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -283,6 +283,16 @@ def advanced_indexing():
x_num[indx0_num, indx1_num] = 2.0
assert np.array_equal(x, x_num)
+ # use case when advanced indexing is called on a transformed array:
+ print("advanced indexing test 11")
+ z = z[:, 1:]
+ z_num = z_num[:, 1:]
+ indx = np.array([1, 1])
+ indx_num = num.array(indx)
+ res = z[indx]
+ res_num = z_num[indx_num]
+ assert np.array_equal(res, res_num)
+
# we do less than LEGATE_MAX_DIM because the dimension will be increased by
# 1 when passing a 2d index array
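# (for instance, if LEGATE_MAX_DIM == 4, indexing a 3d array with a
# 2d index array already yields a 4d intermediate Point field)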
for ndim in range(2, LEGATE_MAX_DIM):

From acdae9da8ecb384feb01ae44eba0c009b700fee3 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 7 Apr 2022 10:53:54 -0600
Subject: [PATCH 16/33] adapting to the output region API change

---
src/cunumeric/index/advanced_indexing_template.inl | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index d4869e07b..d324f5ccc 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -44,7 +44,7 @@ struct AdvancedIndexingImpl {
if (volume1 == 0 || volume2 == 0) {
auto empty = create_buffer(0);
- args.output.return_data(empty, 0);
+ args.output.return_data(empty, Point<1>(0));
return;
}
@@ -67,9 +67,9 @@ struct AdvancedIndexingImpl {
assert(false);
}
if (args.is_set) {
- args.output.return_data(output_arr_set, size);
+ args.output.return_data(output_arr_set, Point<1>(size));
} else {
- args.output.return_data(output_arr, size);
+ args.output.return_data(output_arr, Point<1>(size));
}
}
};

From 6cab1ca1a70414083a654d1b0625d7644b06ddb2 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Mon, 11 Apr 2022 16:52:19 -0600
Subject: [PATCH 17/33] addressing PR comments

---
cunumeric/config.py | 2 +-
cunumeric/runtime.py | 6 +++---
src/cunumeric/cunumeric_c.h | 1 +
src/cunumeric/index/advanced_indexing.cc | 2 +-
src/cunumeric/index/advanced_indexing.cu | 2 +-
src/cunumeric/index/advanced_indexing_omp.cc | 2 +-
src/cunumeric/index/zip.cc | 10 ++++++----
src/cunumeric/index/zip.cu | 5 ++++-
src/cunumeric/index/zip_omp.cc | 10 ++++++----
src/cunumeric/index/zip_template.inl | 8 ++++----
10 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/cunumeric/config.py b/cunumeric/config.py
index f369bfb09..9c29bbe64 100644
--- a/cunumeric/config.py
+++ b/cunumeric/config.py
@@ -246,7 +246,7 @@ class CuNumericTunable(IntEnum):
HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM

-# Match these to CuNumericTypeCOdes in cunumeric_c.h
+# Match these to CuNumericTypeCodes in cunumeric_c.h
@unique
class CuNumericTypeCodes(IntEnum):
CUNUMERIC_TYPE_POINT1 = _cunumeric.CUNUMERIC_TYPE_POINT1
diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py
index 14e5c5d4e..fd3d2070b 100644
--- a/cunumeric/runtime.py
+++ b/cunumeric/runtime.py
@@ -102,7 +102,7 @@ def _register_dtypes(self):
def _register_point_type(self, n):
type_system = self.legate_context.type_system
- point_type = "" + str(n)
+ point_type = "Point" + str(n)
if point_type not in type_system:
code = CuNumericTypeCodes.CUNUMERIC_TYPE_POINT1 + n - 1
size_in_bytes = 8 * n
@@ -110,9 +110,9 @@ def _register_point_type(self, n):
def get_point_type(self, n):
type_system = self.legate_context.type_system
- point_type = "" + str(n)
+ point_type = "Point" + str(n)
if point_type not in type_system:
- raise ValueError(f"there is no point type registered fro {n}")
+ raise ValueError(f"there is no point type registered for {n}")
return point_type

def _parse_command_args(self):
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index a73a69eb8..f270cc247 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -187,6 +187,7 @@ enum CuNumericBounds {
CUNUMERIC_MAX_TASKS = 1048576,
};

+// Match these to CuNumericTypeCodes in config.py
enum CuNumericTypeCodes {
CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1,
CUNUMERIC_TYPE_POINT2,
diff --git a/src/cunumeric/index/advanced_indexing.cc 
b/src/cunumeric/index/advanced_indexing.cc index 74882e0e3..bc2b1870a 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -75,7 +75,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index fdce0f2e1..c454d0860 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -136,7 +136,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 0568b3fd1..aad31b6b4 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -79,7 +79,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index b167d2f62..9d055fef3 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -39,9 +39,8 @@ struct ZipImplBody { if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { - std::vector indx_ptrs; - for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); - auto outptr = out.ptr(rect); + std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; + auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } @@ -51,7 +50,10 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 88b999776..3a3ae0243 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -106,7 +106,10 @@ struct ZipImplBody { zip_kernel<<>>( out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 9276c3450..d4f961777 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -39,9 +39,8 @@ struct ZipImplBody { const size_t volume = rect.volume(); if (index_arrays.size() == N) { if (dense) { - std::vector indx_ptrs; - for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); - auto outptr = out.ptr(rect); + std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; + auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx 
= 0; idx < volume; ++idx) { outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); @@ -53,7 +52,10 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } // else - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index d4b34a787..fd536ff35 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,18 +37,18 @@ struct ZipImpl { size_t volume = pitches.flatten(out_rect); if (volume == 0) return; -#ifdef CUNUMERIC_DEBUG - assert(out_rect == index_rect) +#ifdef DEBUG_CUNUMERIC + assert(out_rect == index_rect); #endif #ifndef LEGION_BOUNDS_CHECKS - bool dense = out.accessor.is_dense_row_major(out_rect); + bool dense = out.accessor.is_dense_row_major(out_rect); #else bool dense = false; #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC assert(index_rect == args.inputs[i].shape()); #endif index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); From 9480cc0aee279128ea5bd74e25b0fb71e67c98f2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 12 Apr 2022 10:14:34 -0600 Subject: [PATCH 18/33] some code clean-up + more tests --- cunumeric/deferred.py | 3 -- src/cunumeric/index/zip.cc | 2 +- src/cunumeric/index/zip.cu | 2 +- src/cunumeric/index/zip_omp.cc | 2 +- src/cunumeric/index/zip_template.inl | 23 ++++++++++ tests/index_routines.py | 69 ++++++++++++++++++++++++++-- 6 files changed, 90 insertions(+), 11 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 372443ad6..5d5dfed5a 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -743,9 +743,6 @@ def set_item(self, key, rhs): copy.execute() if view_copy: - print("IRINA DEBUG", self.shape, lhs.shape) - print(self.base.transform.bottom) - print(self) self.copy(lhs, deep=True) else: diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 9d055fef3..a1bce3a5f 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -63,7 +63,7 @@ struct ZipImplBody { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + index_arrays.size()); i < N; i++) { - int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + int64_t j = key_dim + i - index_arrays.size(); new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 3a3ae0243..abf4914de 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -68,7 +68,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < narrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + narrays); i < N; i++) { - int64_t j = key_dim + i - 1 - (narrays - 1); + int64_t j = key_dim + i - narrays; new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index d4f961777..e4a5d5764 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -65,7 +65,7 @@ struct ZipImplBody { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + index_arrays.size()); i < N; i++) { - int64_t 
j = key_dim + i - 1 - (index_arrays.size() - 1); + int64_t j = key_dim + i - index_arrays.size(); new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index fd536ff35..e1e2c9004 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -69,6 +69,29 @@ struct ZipImpl { template static void zip_template(TaskContext& context) { + // Here `N` is the number of dimenstions of the input array and the number + // of dimensions of the Point field + // key_dim - is the number of dimensions of the index arrays before + // they were broadcasted to the shape of the input array (shape of + // all index arrays should be the same)) + // start index - is the index from wich first index array was passed + // DIM - dimension of the output array + // + // for the example: + // x.shape = (2,3,4,5) + // ind1.shape = (6,7,8) + // ind2.shape = (6,7,8) + // y = x[:,ind1,ind2,:] + // y.shape == (2,6,7,8,5) + // out.shape == (2,6,7,8,5) + // index_arrays = [ind1', ind2'] + // ind1' == ind1 promoted to (2,6,7,8,5) + // ind2' == ind2 promoted to (2,6,7,8,5) + // DIM = 5 + // N = 4 + // key_dim = 3 + // start_index = 1 + int64_t N = context.scalars()[0].value(); int64_t key_dim = context.scalars()[1].value(); int64_t start_index = context.scalars()[2].value(); diff --git a/tests/index_routines.py b/tests/index_routines.py index 3f187db84..087126c32 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -35,6 +35,13 @@ def advanced_indexing(): res_num = x_num[indx_num] assert np.array_equal(res, res_num) + # after transformation: + x = x[1:] + x_num = x_num[1:] + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + # advanced indexing test when a.ndim ==1 , indx.ndim >1 print("advanced indexing test 2") y = np.array([0, -1, -2, -3, -4, -5]) @@ -43,6 +50,12 @@ def advanced_indexing(): index_num = num.array(index) assert np.array_equal(y[index], y_num[index_num]) + # simple 2D case + print("advanced indexing test 3") + index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) + index_2d_num = num.array(index_2d) + assert np.array_equal(y[index_2d], y_num[index_2d_num]) + z = np.array( [ [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], @@ -51,11 +64,20 @@ def advanced_indexing(): ) z_num = num.array(z) - # simple 2D case - print("advanced indexing test 3") - index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) - index_2d_num = num.array(index_2d) - assert np.array_equal(y[index_2d], y_num[index_2d_num]) + zt = z.transpose( + ( + 1, + 0, + 2, + ) + ) + zt_num = z_num.transpose( + ( + 1, + 0, + 2, + ) + ) # mismatch dimesion case: print("advanced indexing test 4") @@ -65,14 +87,26 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + res = zt[indx] + res_num = zt_num[indx_num] + assert np.array_equal(res, res_num) + res = z[:, :, indx] res_num = z_num[:, :, indx_num] assert np.array_equal(res, res_num) + res = zt[:, :, indx] + res_num = zt_num[:, :, indx_num] + assert np.array_equal(res, res_num) + res = z[:, indx, :] res_num = z_num[:, indx_num, :] assert np.array_equal(res, res_num) + res = zt[:, indx, :] + res_num = zt_num[:, indx_num, :] + assert np.array_equal(res, res_num) + # 2d: indx = np.array([[1, 1], [1, 0]]) indx_num = num.array(indx) @@ -80,10 +114,18 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + res = zt[indx] + res_num = zt_num[indx_num] + assert np.array_equal(res, 
+
res = z[:, indx]
res_num = z_num[:, indx_num]
assert np.array_equal(res, res_num)
+ res = zt[:, indx]
+ res_num = zt_num[:, indx_num]
+ assert np.array_equal(res, res_num)
+
# 2 arrays passed to 3d array
indx0 = np.array([1, 1])
indx1 = np.array([1, 0])
@@ -93,10 +135,18 @@ def advanced_indexing():
res_num = z_num[indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[indx0, indx1]
+ res_num = zt_num[indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
res = z[:, indx0, indx1]
res_num = z_num[:, indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[:, indx0, indx1]
+ res_num = zt_num[:, indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
# 2 index arrays passed in a sparse way:
x = mk_seq_array(np, (3, 4, 5, 6))
x_num = mk_seq_array(num, (3, 4, 5, 6))
@@ -121,6 +171,10 @@ def advanced_indexing():
res_num = z_num[indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[indx0, indx1]
+ res_num = zt_num[indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
# mismatch dimension case bool:
indx_bool = np.array([True, False])
indx_bool_num = num.array(indx_bool)
@@ -293,6 +347,11 @@ def advanced_indexing():
res_num = z_num[indx_num]
assert np.array_equal(res, res_num)
+ # in-place assignment
+ z[indx] = 10
+ z_num[indx_num] = 10
+ assert np.array_equal(z, z_num)
+
# we do less than LEGATE_MAX_DIM because the dimension will be increased by
# 1 when passing a 2d index array
for ndim in range(2, LEGATE_MAX_DIM):

From 02864d5d22caaf8f71b0b7b85aa0353106037714 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 13 Apr 2022 10:56:51 -0600
Subject: [PATCH 19/33] addressing PR comments for AdvancedIndexing task

---
src/cunumeric/index/advanced_indexing.cc | 16 +++++-----
src/cunumeric/index/advanced_indexing.cu | 16 +++++-----
src/cunumeric/index/advanced_indexing_omp.cc | 30 ++++++++++---------
.../index/advanced_indexing_template.inl | 29 ++++++++++--------
src/cunumeric/omp_help.h | 9 +++++-
5 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc
index bc2b1870a..fea649bb3 100644
--- a/src/cunumeric/index/advanced_indexing.cc
+++ b/src/cunumeric/index/advanced_indexing.cc
@@ -22,8 +22,8 @@ namespace cunumeric {
using namespace Legion;
using namespace legate;
-template
-struct AdvancedIndexingImplBody {
+template
+struct AdvancedIndexingImplBody {
using VAL = legate_type_of;
void compute_output(Buffer& out,
@@ -33,13 +33,13 @@ struct AdvancedIndexingImplBody {
const Rect& rect_input,
const Pitches& pitches_index,
const Rect& rect_index,
- int volume) const
+ const size_t volume) const
{
int64_t out_idx = 0;
for (size_t idx = 0; idx < volume; ++idx) {
- auto p = pitches_index.unflatten(idx, rect_index.lo);
- auto p_input = pitches_input.unflatten(idx, rect_input.lo);
+ auto p = pitches_index.unflatten(idx, rect_index.lo);
if (index[p] == true) {
+ auto p_input = pitches_input.unflatten(idx, 
rect_input.lo); out[out_idx] = p_input; out_idx++; } diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index c454d0860..1eb320aa4 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -35,14 +35,14 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) size_t iters, Buffer offsets) { - int64_t value = 0; + size_t value = 0; for (size_t idx = 0; idx < iters; idx++) { const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - auto val = static_cast(index[point]); + auto val = static_cast(index[point]); offsets[offset] = val; - SumReduction::fold(value, val); + SumReduction::fold(value, val); } } // Every thread in the thread block must participate in the exchange to get correct results @@ -95,8 +95,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) } } -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; int64_t compute_size(const AccessorRO& in, @@ -106,10 +106,10 @@ struct AdvancedIndexingImplBody { cudaStream_t stream, Buffer& offsets) const { - DeferredReduction> size; + DeferredReduction> size; const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t); + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(size_t); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; @@ -140,7 +140,7 @@ struct AdvancedIndexingImplBody { // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif - int64_t size = 0; + size_t size = 0; const bool* index_ptr = index.ptr(rect_index); const size_t volume = rect_index.volume(); cudaStream_t stream; diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index aad31b6b4..7c34cf8df 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -18,14 +18,17 @@ #include "cunumeric/index/advanced_indexing_template.inl" #include "cunumeric/omp_help.h" #include +#include +#include +#include namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; void compute_output(Buffer& out, @@ -35,14 +38,14 @@ struct AdvancedIndexingImplBody { const Rect& rect_input, const Pitches& pitches_index, const Rect& rect_index, - int volume, + const size_t volume, int64_t out_idx) const { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); + auto p = pitches_index.unflatten(idx, rect_index.lo); if (index[p] == true) { + auto p_input = pitches_input.unflatten(idx, rect_input.lo); out[out_idx] = input[p_input]; out_idx++; } @@ -56,14 +59,14 @@ struct AdvancedIndexingImplBody { const Rect& rect_input, const Pitches& pitches_index, const Rect& rect_index, - int volume, + const size_t volume, int64_t out_idx) const { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); + auto p = 
pitches_index.unflatten(idx, rect_index.lo); if (index[p] == true) { + auto p_input = pitches_input.unflatten(idx, rect_input.lo); out[out_idx] = p_input; out_idx++; } @@ -85,12 +88,12 @@ struct AdvancedIndexingImplBody { #endif const size_t volume = rect_index.volume(); const auto max_threads = omp_get_max_threads(); - int64_t size = 0; + size_t size = 0; ThreadLocalStorage offsets(max_threads); { - ThreadLocalStorage sizes(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) sizes[idx] = 0; + ThreadLocalStorage sizes(max_threads); + thrust::fill(thrust::omp::par, sizes.begin(), sizes.end(), 0); #pragma omp parallel { const int tid = omp_get_thread_num(); @@ -101,10 +104,9 @@ struct AdvancedIndexingImplBody { } } - for (auto idx = 0; idx < max_threads; ++idx) size += sizes[idx]; + size = thrust::reduce(thrust::omp::par, sizes.begin(), sizes.end(), 0); - offsets[0] = 0; - for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; + thrust::exclusive_scan(thrust::omp::par, sizes.begin(), sizes.end(), offsets.begin()); } Memory::Kind kind = diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl index d324f5ccc..bc7dc923b 100644 --- a/src/cunumeric/index/advanced_indexing_template.inl +++ b/src/cunumeric/index/advanced_indexing_template.inl @@ -21,7 +21,7 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct AdvancedIndexingImplBody; template @@ -43,23 +43,28 @@ struct AdvancedIndexingImpl { size_t volume2 = index_pitches.flatten(index_rect); if (volume1 == 0 || volume2 == 0) { - auto empty = create_buffer(0); - args.output.return_data(empty, Point<1>(0)); + if (args.is_set) { + auto empty = create_buffer>(0); + args.output.return_data(empty, Point<1>(0)); + } else { + auto empty = create_buffer(0); + args.output.return_data(empty, Point<1>(0)); + } return; } - int64_t size = 0; + size_t size = 0; if (DIM1 == DIM2) { if (args.is_set) { - size = AdvancedIndexingImplBody{}(output_arr_set, - input_arr, - index_arr, - input_pitches, - input_rect, - index_pitches, - index_rect); + size = AdvancedIndexingImplBody{}(output_arr_set, + input_arr, + index_arr, + input_pitches, + input_rect, + index_pitches, + index_rect); } else { - size = AdvancedIndexingImplBody{}( + size = AdvancedIndexingImplBody{}( output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect); } } else { diff --git a/src/cunumeric/omp_help.h b/src/cunumeric/omp_help.h index 2cf3cb106..8d7440724 100644 --- a/src/cunumeric/omp_help.h +++ b/src/cunumeric/omp_help.h @@ -27,7 +27,10 @@ struct ThreadLocalStorage { static constexpr size_t CACHE_LINE_SIZE = 64; public: - ThreadLocalStorage(size_t num_threads) : storage_(CACHE_LINE_SIZE * num_threads) {} + ThreadLocalStorage(size_t num_threads) + : storage_(CACHE_LINE_SIZE * num_threads), num_threads_(num_threads) + { + } ~ThreadLocalStorage() {} public: @@ -36,8 +39,12 @@ struct ThreadLocalStorage { return *reinterpret_cast(storage_.data() + CACHE_LINE_SIZE * idx); } + VAL* begin() { return reinterpret_cast(storage_.data()); } + VAL* end() { return reinterpret_cast(storage_.data() + CACHE_LINE_SIZE * num_threads_); } + private: std::vector storage_; + size_t num_threads_; }; } // namespace cunumeric From c69897f8f6bc9c6a6c5a16c6368904fc8e40f9b9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 13 Apr 2022 21:33:05 -0600 Subject: [PATCH 20/33] addressing some PR comments on cunumeric/deferred --- 
cunumeric/deferred.py | 50 +++++++++----------- src/cunumeric/index/advanced_indexing.cc | 2 +- src/cunumeric/index/advanced_indexing.cu | 2 +- src/cunumeric/index/advanced_indexing_omp.cc | 2 +- tests/index_routines.py | 17 +++++++ 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index ed00f4ac0..a1cb77de9 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -531,7 +531,7 @@ def _create_indexing_array(self, key, is_set=False): # the store with transformation rhs, store = self.copy_store(store) else: - assert isinstance(key, NumPyThunk) + assert isinstance(key, DeferredArray) if not store._transform.bottom: rhs, store = self.copy_store(store) # the use case when index array ndim >1 and input array ndim ==1 @@ -688,41 +688,34 @@ def set_item(self, key, rhs): assert self.dtype == rhs.dtype # Check to see if this is advanced indexing or not if is_advanced_indexing(key): - view_copy = False # Create the indexing array copy_needed, store, index_array = self._create_indexing_array( key, True ) - if copy_needed: - if self.base.transform.bottom: - lhs = self - else: - # if store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - - lhs = store_copy - view_copy = True - else: + if self.base.transform.bottom: lhs = self - view_copy = False + else: + # if store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + lhs = store_copy if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, self.dtype, - inputs=[index_array], + inputs=[], ) task = self.context.create_task(CuNumericOpCode.FILL) task.add_output(rhs_tmp.base) @@ -732,7 +725,8 @@ def set_item(self, key, rhs): rhs = rhs_tmp.base else: if rhs.shape != index_array.shape: - rhs = rhs._broadcast(index_array.base.shape) + rhs_tmp = rhs._broadcast(index_array.base.shape) + rhs_tmp, rhs = rhs.copy_store(rhs_tmp) else: rhs = rhs.base @@ -742,7 +736,7 @@ def set_item(self, key, rhs): copy.add_output(lhs.base) copy.execute() - if view_copy: + if lhs is not self: self.copy(lhs, deep=True) else: diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc index fea649bb3..d5b1c2c16 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -77,7 +77,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif const size_t volume = rect_index.volume(); size_t size = 0; diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index 1eb320aa4..c78fec4cf 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -138,7 +138,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // 
in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif size_t size = 0; const bool* index_ptr = index.ptr(rect_index); diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 7c34cf8df..8bffa2199 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -84,7 +84,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif const size_t volume = rect_index.volume(); const auto max_threads = omp_get_max_threads(); diff --git a/tests/index_routines.py b/tests/index_routines.py index 087126c32..c0a62d87e 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -320,6 +320,23 @@ def advanced_indexing(): # same daya in the original array is not guaranteed, so we can't call # assert np.array_equal(y, y_num) here + index = np.array([1, 4, 3, 2, 0, 5]) + index_num = num.array(index) + y[index] = np.array([1, 2, 3, 4, 5, 6]) + y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) + print(y) + print(y_num) + assert np.array_equal(y, y_num) + + # the case when broadcast is needed: + index = np.array([[1, 4, 3], [2, 0, 5]]) + index_num = num.array(index) + y[index] = np.array([[1, 2, 3]]) + y_num[index_num] = num.array([[1, 2, 3]]) + print(y) + print(y_num) + assert np.array_equal(y, y_num) + # 2D test x = np.array( [ From e2bbcb522c5bfb805d1f05c15e29978a83f99580 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 13 Apr 2022 22:42:08 -0600 Subject: [PATCH 21/33] addressing some PR comments on cunumeric/deferred 2 --- cunumeric/deferred.py | 62 +++++++++++------------------------------ tests/index_routines.py | 7 +++++ 2 files changed, 24 insertions(+), 45 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index a1cb77de9..139cdc1af 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -305,10 +305,6 @@ def get_scalar_array(self): result = np.frombuffer(buf, dtype=self.dtype, count=1) return result.reshape(()) - def broadcast_shapes(self, shapes): - arrays = [np.empty(x, dtype=[]) for x in shapes] - return np.broadcast(*arrays).shape - def _zip_indices(self, start_index, arrays): if not isinstance(arrays, tuple): raise TypeError("zip_indices expects tuple of arrays") @@ -326,7 +322,8 @@ def _zip_indices(self, start_index, arrays): # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) if len(arrays) > 1: - b_shape = self.broadcast_shapes(shapes) + # TODO: replace with cunumeric.broadcast_shapes, when available + b_shape = np.broadcast_shapes(*shapes) else: b_shape = arrays[0].shape @@ -431,7 +428,7 @@ def _zip_indices(self, start_index, arrays): return output_arr - def copy_store(self, store): + def _copy_store(self, store): store_to_copy = DeferredArray( self.runtime, base=store, @@ -461,7 +458,6 @@ def _create_indexing_array(self, key, is_set=False): transpose_indices = tuple() # since we can't call Copy operation on transformed Store, after # the transformation, we need to return a copy - copy_needed = False tuple_of_arrays = () index_map = [] @@ -477,7 +473,6 @@ def _create_indexing_array(self, key, is_set=False): last_index = dim if transpose_needed: - copy_needed = True start_index = 0 post_indices = tuple( i for i in 
range(store.ndim) if i not in transpose_indices @@ -499,14 +494,10 @@ def _create_indexing_array(self, key, is_set=False): k += store.shape[dim + shift] store = store.project(dim + shift, k) shift -= 1 - copy_needed = True elif k is np.newaxis: store = store.promote(dim + shift, 1) - copy_needed = True elif isinstance(k, slice): store = store.slice(dim + shift, k) - if k != slice(None): - copy_needed = True elif isinstance(k, NumPyThunk): if k.dtype == np.bool: if k.shape[0] != store.shape[dim]: @@ -525,23 +516,22 @@ def _create_indexing_array(self, key, is_set=False): "Unsupported entry type passed to advanced", "indexing operation", ) - if copy_needed or (not store._transform.bottom): + if store.transformed: # after store is transformed we need to to return a copy of # the store since Copy operation can't be done on # the store with transformation - rhs, store = self.copy_store(store) + rhs, store = self._copy_store(store) else: assert isinstance(key, DeferredArray) - if not store._transform.bottom: - rhs, store = self.copy_store(store) - # the use case when index array ndim >1 and input array ndim ==1 + # the use case when index array ndim >input array ndim if key.ndim > store.ndim: - if store.ndim != 1: - raise ValueError("Advance indexing dimention mismatch") diff = store.ndim - key.ndim for i in range(diff): store = store.promote(i + 1, store.shape[0]) + if store.transformed: + rhs, store = self._copy_store(store) + # Handle the boolean array case if key.dtype == np.bool: if key.shape == rhs.shape: @@ -566,14 +556,14 @@ def _create_indexing_array(self, key, is_set=False): key.base, axes=tuple(range(1, len(key.shape))) ) task.execute() - return False, store, out + return False, rhs, out else: # FIXME: replace `nonzero` case with the task with # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: output_arr = rhs._zip_indices(start_index, (key,)) - return True, store, output_arr + return True, rhs, output_arr else: tuple_of_arrays = (rhs.runtime.to_deferred_array(key),) @@ -582,9 +572,9 @@ def _create_indexing_array(self, key, is_set=False): if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1: output_arr = rhs._zip_indices(start_index, tuple_of_arrays) - return True, store, output_arr + return True, rhs, output_arr elif len(tuple_of_arrays) == 1 and rhs.ndim == 1: - return True, store, tuple_of_arrays[0] + return True, rhs, tuple_of_arrays[0] else: raise ValueError("Advance indexing dimention mismatch") @@ -648,7 +638,8 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if is_advanced_indexing(key): # Create the indexing array - copy_needed, store, index_array = self._create_indexing_array(key) + copy_needed, rhs, index_array = self._create_indexing_array(key) + store = rhs.base if copy_needed: # Create a new array to be the result result = self.runtime.create_empty_thunk( @@ -689,28 +680,9 @@ def set_item(self, key, rhs): # Check to see if this is advanced indexing or not if is_advanced_indexing(key): # Create the indexing array - copy_needed, store, index_array = self._create_indexing_array( + copy_needed, lhs, index_array = self._create_indexing_array( key, True ) - if self.base.transform.bottom: - lhs = self - else: - # if store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - 
store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - lhs = store_copy - if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, @@ -726,7 +698,7 @@ def set_item(self, key, rhs): else: if rhs.shape != index_array.shape: rhs_tmp = rhs._broadcast(index_array.base.shape) - rhs_tmp, rhs = rhs.copy_store(rhs_tmp) + rhs_tmp, rhs = rhs._copy_store(rhs_tmp) else: rhs = rhs.base diff --git a/tests/index_routines.py b/tests/index_routines.py index c0a62d87e..92f055ba3 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -354,6 +354,13 @@ def advanced_indexing(): x_num[indx0_num, indx1_num] = 2.0 assert np.array_equal(x, x_num) + # shape mismatch: + indx = np.ones((2, 2, 2), dtype=int) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + # use case when advanced indexing is called on a transformed array: print("advanced indexing test 11") z = z[:, 1:] From a8bf6adc78aadacfa383aa4160170e7279061ac2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 15 Apr 2022 11:35:17 -0600 Subject: [PATCH 22/33] cleaning up deferred.py --- cunumeric/array.py | 17 ++- cunumeric/deferred.py | 287 +++++++++++++++++----------------------- tests/index_routines.py | 29 ++++ 3 files changed, 167 insertions(+), 166 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0af95e6f6..e6de88710 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -767,8 +767,21 @@ def _convert_key(self, key, first=True): elif isinstance(key, tuple) and first: return tuple(self._convert_key(k, first=False) for k in key) else: - # Otherwise convert it to a cuNumeric array and get the thunk - return convert_to_cunumeric_ndarray(key)._thunk + # Otherwise convert it to a cuNumeric array, check types + # and get the thunk + key = convert_to_cunumeric_ndarray(key) + if key.dtype != np.bool and not np.issubdtype( + key.dtype, np.integer + ): + raise TypeError("index arrays should be int or bool type") + if key.dtype != np.bool and key.dtype != np.int64: + runtime.warn( + "converting index array to int64 type", + category=RuntimeWarning, + ) + key = key.astype(np.int64) + + return key._thunk @add_boilerplate() def __getitem__(self, key): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 139cdc1af..5a4a0ef93 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -313,11 +313,15 @@ def _zip_indices(self, start_index, arrays): if start_index == -1: start_index = 0 - arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) - # all arrays should have the same shape and type - data_type = arrays[0].dtype - if not np.issubdtype(data_type, np.integer): - raise TypeError("a array should be integer type") + new_arrays = tuple() + # check array's type and converting them to deferred arrays + for a in arrays: + a = self.runtime.to_deferred_array(a) + data_type = a.dtype + if data_type != np.int64: + raise TypeError("index arrays should be int64 type") + new_arrays += (a,) + arrays = new_arrays # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) @@ -331,22 +335,19 @@ def _zip_indices(self, start_index, arrays): key_dim = len(b_shape) out_shape = b_shape + # broadcast shapes + new_arrays = tuple() + for a in arrays: + if a.shape != b_shape: + new_arrays += (a._broadcast(b_shape),) + else: + new_arrays += (a.base,) + arrays = new_arrays + if len(arrays) < self.ndim: # the case when # of arrays 
passed is smaller than dimension of # the input array N = len(arrays) - # broadcast shapes - new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError( - "type of all index arrrays should be the same" - ) - if a.shape != b_shape: - new_arrays += (a._broadcast(b_shape),) - else: - new_arrays += (a.base,) - arrays = new_arrays # output shape out_shape = ( tuple(self.shape[i] for i in range(0, start_index)) @@ -364,24 +365,8 @@ def _zip_indices(self, start_index, arrays): a = a.promote(key_dim + i - N, self.shape[i]) new_arrays += (a,) arrays = new_arrays - - else: - # the use case when # of arrays passed is equal to the dimension - # of the input array - if len(arrays) > self.ndim: - raise ValueError("wrong number of index arrays passed") - new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError( - "type of all index arrrays should be the same" - ) - if a.shape != b_shape: - a = a._broadcast(b_shape) - else: - a = a.base - new_arrays = new_arrays + (a,) - arrays = new_arrays + elif len(arrays) > self.ndim: + raise ValueError("wrong number of index arrays passed") # create output array which will store Point field where # N is number of index arrays @@ -406,24 +391,12 @@ def _zip_indices(self, start_index, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - if len(arrays) < self.ndim: - task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point - task.add_scalar_arg(key_dim, ty.int64) # key_dim - task.add_scalar_arg(start_index, ty.int64) # start_index - for a in arrays: - task.add_input(a) - task.add_alignment(output_arr.base, a) - task.add_broadcast(a, axes=tuple(range(1, len(out_shape)))) - task.add_broadcast( - output_arr.base, axes=tuple(range(1, len(out_shape))) - ) - else: - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(start_index, ty.int64) - for index_arr in arrays: - task.add_input(index_arr) - task.add_alignment(output_arr.base, index_arr) + task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point + task.add_scalar_arg(key_dim, ty.int64) # key_dim + task.add_scalar_arg(start_index, ty.int64) # start_index + for a in arrays: + task.add_input(a) + task.add_alignment(output_arr.base, a) task.execute() return output_arr @@ -447,128 +420,112 @@ def _create_indexing_array(self, key, is_set=False): rhs = self # the index where the first index_array is passed to the [] operator start_index = -1 - if isinstance(key, tuple): - key = self._unpack_ellipsis(key, self.ndim) - shift = 0 - last_index = self.ndim - # in case when index arrays are passed in the scaterred way, - # we need to transpose original array so all index arrays - # are close to each other - transpose_needed = False - transpose_indices = tuple() - # since we can't call Copy operation on transformed Store, after - # the transformation, we need to return a copy - tuple_of_arrays = () - index_map = [] - - # First, we need to check if transpose is needed - for dim, k in enumerate(key): - if np.isscalar(k) or isinstance(k, NumPyThunk): - if start_index == -1: - start_index = dim - transpose_indices += (dim,) - transpose_needed = transpose_needed or ( - (dim - last_index) > 1 - ) - last_index = dim + if ( + isinstance(key, NumPyThunk) + and key.dtype == np.bool + and key.shape == rhs.shape + ): + if not isinstance(key, DeferredArray): + key = self.runtime.to_deferred_array(key) + + out_dtype = 
rhs.dtype + if is_set: + N = rhs.ndim + out_dtype = rhs.runtime.get_point_type(N) + + out = rhs.runtime.create_unbound_thunk(out_dtype) + task = rhs.context.create_task(CuNumericOpCode.ADVANCED_INDEXING) + task.add_output(out.base) + task.add_input(rhs.base) + task.add_input(key.base) + task.add_scalar_arg(is_set, bool) + task.add_alignment(rhs.base, key.base) + task.execute() + return False, rhs, out - if transpose_needed: - start_index = 0 - post_indices = tuple( - i for i in range(store.ndim) if i not in transpose_indices - ) - transpose_indices += post_indices - store = store.transpose(transpose_indices) - index_map = list(transpose_indices) - count = 0 - for i in transpose_indices: - index_map[i] = count - count += 1 - else: - index_map = tuple(range(len(key))) - - for d, k in enumerate(key): - dim = index_map[d] - if np.isscalar(k): - if k < 0: - k += store.shape[dim + shift] - store = store.project(dim + shift, k) - shift -= 1 - elif k is np.newaxis: - store = store.promote(dim + shift, 1) - elif isinstance(k, slice): - store = store.slice(dim + shift, k) - elif isinstance(k, NumPyThunk): - if k.dtype == np.bool: - if k.shape[0] != store.shape[dim]: - raise ValueError( - "boolean index did not match " - "indexed array along dimension " - ) - # in case of the mixed indises we all nonzero - # for the bool array - k = k.nonzero() - tuple_of_arrays += k - else: - tuple_of_arrays += (self.runtime.to_deferred_array(k),) - else: - raise TypeError( - "Unsupported entry type passed to advanced", - "indexing operation", - ) - if store.transformed: - # after store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - rhs, store = self._copy_store(store) - else: - assert isinstance(key, DeferredArray) + if isinstance(key, NumPyThunk): # the use case when index array ndim >input array ndim if key.ndim > store.ndim: diff = store.ndim - key.ndim for i in range(diff): store = store.promote(i + 1, store.shape[0]) - if store.transformed: - rhs, store = self._copy_store(store) + key = (key,) - # Handle the boolean array case - if key.dtype == np.bool: - if key.shape == rhs.shape: - out_dtype = rhs.dtype - if is_set: - N = rhs.ndim - out_dtype = rhs.runtime.get_point_type(N) + assert isinstance(key, tuple) + key = self._unpack_ellipsis(key, self.ndim) + shift = 0 + last_index = self.ndim + # in case when index arrays are passed in the scaterred way, + # we need to transpose original array so all index arrays + # are close to each other + transpose_needed = False + transpose_indices = tuple() + key_transpose_indices = tuple() + # since we can't call Copy operation on transformed Store, after + # the transformation, we need to return a copy + tuple_of_arrays = () + + # First, we need to check if transpose is needed + for dim, k in enumerate(key): + if np.isscalar(k) or isinstance(k, NumPyThunk): + if start_index == -1: + start_index = dim + transpose_indices += (dim,) + transpose_needed = transpose_needed or ((dim - last_index) > 1) + last_index = dim + + if transpose_needed: + start_index = 0 + post_indices = tuple( + i for i in range(store.ndim) if i not in transpose_indices + ) + key_transpose_indices = transpose_indices + transpose_indices += post_indices + post_indices = tuple( + i for i in range(len(key)) if i not in key_transpose_indices + ) + key_transpose_indices += post_indices + store = store.transpose(transpose_indices) - out = rhs.runtime.create_unbound_thunk(out_dtype) - task = rhs.context.create_task( - 
CuNumericOpCode.ADVANCED_INDEXING - ) - task.add_output(out.base) - task.add_input(rhs.base) - task.add_input(key.base) - task.add_scalar_arg(is_set, bool) - task.add_alignment(rhs.base, key.base) - task.add_broadcast( - rhs.base, axes=tuple(range(1, len(rhs.shape))) - ) - task.add_broadcast( - key.base, axes=tuple(range(1, len(key.shape))) - ) - task.execute() - return False, rhs, out + key = tuple(key[i] for i in key_transpose_indices) + + for d, k in enumerate(key): + dim = d + if np.isscalar(k): + if k < 0: + k += store.shape[dim + shift] + store = store.project(dim + shift, k) + shift -= 1 + elif k is np.newaxis: + store = store.promote(dim + shift, 1) + elif isinstance(k, slice): + store = store.slice(dim + shift, k) + elif isinstance(k, NumPyThunk): + if not isinstance(key, DeferredArray): + k = self.runtime.to_deferred_array(k) + if k.dtype == np.bool: + if k.shape[0] != store.shape[dim + shift]: + raise ValueError( + "shape of boolean index did not match " + "indexed array " + ) + # in case of the mixed indises we all nonzero + # for the bool array + k = k.nonzero() + tuple_of_arrays += k else: - # FIXME: replace `nonzero` case with the task with - # output regions when ND output regions are available - tuple_of_arrays = key.nonzero() - elif key.ndim < store.ndim: - output_arr = rhs._zip_indices(start_index, (key,)) - return True, rhs, output_arr + tuple_of_arrays += (k,) else: - tuple_of_arrays = (rhs.runtime.to_deferred_array(key),) - - if len(tuple_of_arrays) > rhs.ndim: - raise TypeError("Advanced indexing dimension mismatch") + raise TypeError( + "Unsupported entry type passed to advanced ", + "indexing operation", + ) + if store.transformed: + # after store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + rhs, store = self._copy_store(store) if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1: output_arr = rhs._zip_indices(start_index, tuple_of_arrays) @@ -683,6 +640,8 @@ def set_item(self, key, rhs): copy_needed, lhs, index_array = self._create_indexing_array( key, True ) + # TODO: remove rhs.ndim ==0 logic when issue with scalars not being + # type of Store is addressed if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, diff --git a/tests/index_routines.py b/tests/index_routines.py index 92f055ba3..0ebafaeef 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -376,6 +376,35 @@ def advanced_indexing(): z_num[indx_num] = 10 assert np.array_equal(z, z_num) + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((4,), True) + ind_num = num.array(ind) + res = x[:, ind] + res_num = x_num[:, ind_num] + assert np.array_equal(res, res_num) + + if LEGATE_MAX_DIM > 7: + x = np.ones((2, 3, 4, 5, 3, 4)) + ind1 = np.full((3, 4), True) + ind2 = np.full((3, 4), True) + x_num = num.array(x) + ind1_num = num.array(ind1) + ind2_num = num.array(ind2) + res = x[:, ind1, :ind2] + res_num = x[:, ind1_num, :ind2_num] + res = x[ind1, :ind2] + res_num = x[ind1_num, :ind2_num] + assert np.array_equal(res, res_num) + + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((3,), 1, dtype=np.int32) + ind_num = num.array(ind) + res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array for ndim in range(2, LEGATE_MAX_DIM): From 04a3cc48008b486e1280b541701922ccbbaecef5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: 
Fri, 15 Apr 2022 15:47:46 -0600 Subject: [PATCH 23/33] making the zip task compile when Legion is built with support for large dimensions --- src/cunumeric/index/zip.cc | 8 ++++++-- src/cunumeric/index/zip.cu | 8 ++++++-- src/cunumeric/index/zip_omp.cc | 8 ++++++-- tests/index_routines.py | 8 +++----- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index a1bce3a5f..e755634ce 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -42,12 +42,16 @@ struct ZipImplBody { std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = indx_ptrs[i][idx]; } + outptr[idx] = new_point; } } else { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } } } else { diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index abf4914de..c130b9bb4 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -34,7 +34,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } template @@ -47,7 +49,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; - out[idx] = Legion::Point(index_arrays[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][idx]; } + out[idx] = new_point; } template diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index e4a5d5764..51eb4c04f 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -43,13 +43,17 @@ struct ZipImplBody { auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = indx_ptrs[i][idx]; } + outptr[idx] = new_point; } } else { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } } // else } else { diff --git a/tests/index_routines.py b/tests/index_routines.py index 0ebafaeef..abd3b9536 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -391,10 +391,8 @@ def advanced_indexing(): x_num = num.array(x) ind1_num = num.array(ind1) ind2_num = num.array(ind2) - res = x[:, ind1, :ind2] - res_num = x[:, ind1_num, :ind2_num] - res = x[ind1, :ind2] - res_num = x[ind1_num, :ind2_num] + res = x[:, ind1, :, ind2] + res_num = x[:, ind1_num, :, ind2_num] assert np.array_equal(res, res_num) x = np.ones((3, 4)) @@ -406,7 +406,7 @@ def advanced_indexing(): # we do less 
than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array for ndim in range(2, LEGATE_MAX_DIM): - a_shape = tuple(random.randint(2, 9) for i in range(ndim)) + a_shape = tuple(random.randint(2, 5) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) # check when N of index arrays == N of dims From 173f47fccabdfd068e7572327acbae0f120fa512 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 18 Apr 2022 10:17:47 -0600 Subject: [PATCH 24/33] Removing the check for output_rect == input_rect When input arrays (index arrays) are broadcast, Legate will not always partition them (it will sometimes just broadcast them), so the rectangles may legitimately differ. This doesn't affect correctness. --- src/cunumeric/index/zip_template.inl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index e1e2c9004..3f7fcfe69 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,10 +37,6 @@ struct ZipImpl { size_t volume = pitches.flatten(out_rect); if (volume == 0) return; -#ifdef DEBUG_CUNUMERIC - assert(out_rect == index_rect); -#endif - #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); #else @@ -48,10 +44,7 @@ struct ZipImpl { #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { -#ifdef DEBUG_CUNUMERIC - assert(index_rect == args.inputs[i].shape()); -#endif - index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); + index_arrays.push_back(args.inputs[i].read_accessor(args.inputs[i].shape())); dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); } From 3601cdbe6773de69e0099b189e57dfb0feda0b2d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 18 Apr 2022 14:53:03 -0600 Subject: [PATCH 25/33] cleaning-up tests + fixing logic for transformed rhs and index arrays --- cunumeric/deferred.py | 14 +- src/cunumeric/cunumeric_c.h | 2 +- tests/advanced_indexing.py | 591 ++++++++++++++++++++++++++++++++++++ tests/index_routines.py | 434 -------------------------- 4 files changed, 598 insertions(+), 443 deletions(-) create mode 100644 tests/advanced_indexing.py diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5a4a0ef93..948f83186 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -444,12 +444,6 @@ def _create_indexing_array(self, key, is_set=False): return False, rhs, out if isinstance(key, NumPyThunk): - # the use case when index array ndim >input array ndim - if key.ndim > store.ndim: - diff = store.ndim - key.ndim - for i in range(diff): - store = store.promote(i + 1, store.shape[0]) - key = (key,) assert isinstance(key, tuple) @@ -487,7 +481,6 @@ def _create_indexing_array(self, key, is_set=False): ) key_transpose_indices += post_indices store = store.transpose(transpose_indices) - key = tuple(key[i] for i in key_transpose_indices) @@ -531,7 +524,10 @@ def _create_indexing_array(self, key, is_set=False): output_arr = rhs._zip_indices(start_index, tuple_of_arrays) return True, rhs, output_arr elif len(tuple_of_arrays) == 1 and rhs.ndim == 1: - return True, rhs, tuple_of_arrays[0] + key = tuple_of_arrays[0] + if key.base.transformed: + key, key_store = key._copy_store(key.base) + return True, rhs, key else: raise ValueError("Advance indexing dimention mismatch") @@ -659,6 +655,8 @@ def set_item(self, key, rhs): rhs_tmp = rhs._broadcast(index_array.base.shape) 
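For reference, the zip task whose shape checks are relaxed above has simple semantics: it bundles N int64 index arrays, element by element, into a single field of N-dimensional points that a Copy can then use as its indirection. A rough Python model of that output, using plain tuples as a stand-in for Legion::Point (the function name zip_indices here is illustrative, not the runtime API):

    import numpy as np

    def zip_indices(*index_arrays):
        # broadcast all index arrays to a common shape, then bundle the
        # coordinates at each position into one N-tuple ("point")
        shape = np.broadcast_shapes(*(a.shape for a in index_arrays))
        arrays = [np.broadcast_to(a, shape) for a in index_arrays]
        out = np.empty(shape, dtype=object)
        for pos in np.ndindex(shape):
            out[pos] = tuple(int(a[pos]) for a in arrays)
        return out

    i0 = np.array([[0, 1], [1, 0], [0, 0]])
    i1 = np.array([[3, 2], [1, 0], [3, 2]])
    points = zip_indices(i0, i1)
    assert points[1, 0] == (1, 1)  # the coordinates (i0[1, 0], i1[1, 0])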
rhs_tmp, rhs = rhs._copy_store(rhs_tmp) else: + if rhs.base.transformed: + rhs, rhs_base = rhs._copy_store(rhs.base) rhs = rhs.base copy = self.context.create_copy() diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index f270cc247..fc95446c4 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -189,7 +189,7 @@ enum CuNumericBounds { // Match these to CuNumericTypeCodes in config.py enum CuNumericTypeCodes { - CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1, + CUNUMERIC_TYPE_POINT1 = MAX_TYPE_NUMBER + 1, CUNUMERIC_TYPE_POINT2, CUNUMERIC_TYPE_POINT3, CUNUMERIC_TYPE_POINT4, diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py new file mode 100644 index 000000000..f0e478eb9 --- /dev/null +++ b/tests/advanced_indexing.py @@ -0,0 +1,591 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import random + +import numpy as np +from test_tools.generators import mk_seq_array + +import cunumeric as num +from legate.core import LEGATE_MAX_DIM + + +def test(): + + # tests on 1D input array: + print("advanced indexing test 1") + + # a: simple 1D test + x = np.array([1, 2, 3, 4, 5, 6, 7]) + indx = np.array([1, 3, 5]) + res = x[indx] + x_num = num.array(x) + indx_num = num.array(indx) + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # b: after base array transformation: + xt = x[1:] + xt_num = x_num[1:] + res = xt[indx] + res_num = xt_num[indx_num] + assert np.array_equal(res, res_num) + + # c: after index array transformation: + indxt = indx[1:] + indxt_num = indx_num[1:] + res = x[indxt] + res_num = x_num[indxt_num] + assert np.array_equal(res, res_num) + + # d: test in-place assignment with scalar: + x[indx] = 13 + x_num[indx_num] = 13 + assert np.array_equal(x, x_num) + + # e: test in-place assignment with array: + xt[indx] = np.array([3, 5, 7]) + xt_num[indx_num] = num.array([3, 5, 7]) + assert np.array_equal(xt, xt_num) + assert np.array_equal(x, x_num) + + # f: test in-place assignment with transformed rhs array: + b = np.array([3, 5, 7, 8]) + b_num = num.array([3, 5, 7, 8]) + bt = b[1:] + bt_num = b_num[1:] + x[indx] = bt + x_num[indx_num] = bt_num + assert np.array_equal(x, x_num) + + # g: test in-place assignment with transformed + # rhs and lhs arrays: + b = np.array([3, 5, 7, 8]) + b_num = num.array([3, 5, 7, 8]) + b1 = b[1:] + b1_num = b_num[1:] + xt[indx] = b1 + xt_num[indx_num] = b1_num + assert np.array_equal(xt, xt_num) + assert np.array_equal(x, x_num) + + # h: in-place assignment with transformed index array: + b = np.array([5, 7]) + b_num = num.array([5, 7]) + x[indxt] = b + x_num[indxt_num] = b_num + assert np.array_equal(x, x_num) + + # i: the case when index.ndim > input.ndim: + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(x[index], x_num[index_num]) + + # j: test for bool array of the same dimension + index = np.array([True, False, False, True, True, False, True]) + index_num = num.array(index) + 
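The boolean case j being set up here relies on a NumPy rule worth restating: a boolean index with the same shape as the array selects the True positions and flattens them into a 1-D result, which is exactly the case the ADVANCED_INDEXING task with an unbound output store handles. In plain NumPy:

    import numpy as np

    x = np.array([1, 2, 3, 4, 5, 6, 7])
    mask = np.array([True, False, False, True, True, False, True])
    assert x[mask].shape == (int(mask.sum()),)  # one element per True entry
    assert np.array_equal(x[mask], np.array([1, 4, 5, 7]))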
assert np.array_equal(x[index], x_num[index_num]) + + # k: test in-place assignment fir the case when idx arr + # is 1d bool array: + x[index] = 3 + x_num[index_num] = 3 + assert np.array_equal(x, x_num) + + # l: test when type of a base array is different from int: + x_float = x.astype(float) + x_num_float = x_num.astype(float) + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(x_float[index], x_num_float[index_num]) + + # m: test when type of the index array is not int64 + index = np.array([1, 3, 5], dtype=np.int16) + index_num = num.array(index) + assert np.array_equal(x[index], x_num[index_num]) + + # n: the case when rhs is a different type + x[index] = 3.5 + x_num[index_num] = 3.5 + assert np.array_equal(x, x_num) + + # o: the case when rhs is an array of different type + b = np.array([2.1, 3.3, 7.2]) + b_num = num.array(b) + x[index] = b + x_num[index_num] = b_num + assert np.array_equal(x, x_num) + + # p: in-place assignment where some indices point to the + # same location: + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + x[index] = 0 + x_num[index_num] = 0 + assert np.array_equal(x, x_num) + + # q: in-place assignment in the case when broadcast is needed: + index = np.array([[1, 4, 3], [2, 0, 5]]) + index_num = num.array(index) + x[index] = np.array([[1, 2, 3]]) + x_num[index_num] = num.array([[1, 2, 3]]) + assert np.array_equal(x, x_num) + + # Nd cases + print("advanced indexing test 2") + + x = mk_seq_array(np, (2, 3, 4, 5)) + x_num = mk_seq_array(num, (2, 3, 4, 5)) + xt = x.transpose( + ( + 1, + 0, + 2, + 3, + ) + ) + xt_num = x_num.transpose( + ( + 1, + 0, + 2, + 3, + ) + ) + + # a: 1d index array passed to a different indices: + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + res = xt[indx] + res_num = xt_num[indx_num] + assert np.array_equal(res, res_num) + + res = x[:, :, indx] + res_num = x_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = xt[:, :, indx] + res_num = xt_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = x[:, :, :, indx] + res_num = x_num[:, :, :, indx_num] + assert np.array_equal(res, res_num) + + res = xt[:, :, :, indx] + res_num = xt_num[:, :, :, indx_num] + assert np.array_equal(res, res_num) + + res = x[:, indx, :] + res_num = x_num[:, indx_num, :] + assert np.array_equal(res, res_num) + + res = xt[:, indx, :] + res_num = xt_num[:, indx_num, :] + assert np.array_equal(res, res_num) + + # b : 2 1d index arrays passed + indx0 = np.array([1, 1]) + indx1 = np.array([1, 0]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = x[indx0, indx1] + res_num = x_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[indx0, indx1] + res_num = xt_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[:, indx0, indx1] + res_num = x_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[:, indx0, indx1] + res_num = xt_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + # c: 2 index arrays passed in a sparse way: + res = x[:, [0, 1], :, [0, 1]] + res_num = x_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = xt[:, [0, 1], :, [0, 1]] + res_num = xt_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = x[[0, 1], :, [0, 1], 1:] + res_num = x_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = 
xt[[0, 1], :, [0, 1], 1:] + res_num = xt_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + res = xt[:, [0, 1], :, 1:] + res_num = xt_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + z = x + z_num = x_num + z[[0, 1], [0, 1]] = 11 + z_num[[0, 1], [0, 1]] = 11 + assert np.array_equal(z, z_num) + + # d: newaxis is passed along with array: + + res = x[..., [1, 0]] + res_num = x_num[..., [1, 0]] + assert np.array_equal(res, res_num) + + res = xt[..., [0, 1], 1:] + res_num = xt_num[..., [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], [1, 1]] + res_num = x_num[..., [0, 1], [1, 1]] + assert np.array_equal(res, res_num) + + # e: index arrays that have different shape: + indx0 = np.array([1, 1]) + indx1 = np.array([[1, 0], [1, 0]]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = x[indx0, indx1] + res_num = x_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[indx0, indx1] + res_num = xt_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[indx0, indx1, indx0, indx1] + res_num = x_num[indx0_num, indx1_num, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[indx0, :, indx1] + res_num = x_num[indx0_num, :, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[:, indx0, indx1, 1:] + res_num = xt_num[:, indx0_num, indx1_num, 1:] + assert np.array_equal(res, res_num) + + # f: single boolean array passed: + indx_bool = np.array([True, False]) + indx_bool_num = num.array(indx_bool) + res = x[indx_bool] + res_num = x_num[indx_bool_num] + assert np.array_equal(res, res_num) + + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + res = x[:, indx_bool] + res_num = x_num[:, indx_bool_num] + assert np.array_equal(res, res_num) + + # on the transposed base + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + res = xt[indx_bool] + res_num = xt_num[indx_bool_num] + assert np.array_equal(res, res_num) + + indx_bool = np.array([True, False, True, False, False]) + indx_bool_num = num.array(indx_bool) + res = x[..., indx_bool] + res_num = x_num[..., indx_bool_num] + assert np.array_equal(res, res_num) + + print("IRINA DEBUG 1") + indx1_bool = np.array([True, False]) + indx1_bool_num = num.array(indx1_bool) + indx2_bool = np.array([True, False, True, True]) + indx2_bool_num = num.array(indx2_bool) + res = x[indx1_bool, :, indx2_bool] + print(res.shape) + print(res) + res_num = x_num[indx1_bool_num, :, indx2_bool_num] + print(res_num.shape) + print(res_num) + assert np.array_equal(res, res_num) + + print("IRINA DEBUG 2") + res = x[indx1_bool, 1, indx2_bool] + # res_num = x_num[indx1_bool_num, 1, indx2_bool_num] + # print(res.shape) + # print(res_num.shape) + # assert np.array_equal(res, res_num) + + # g: boolean array with the same shape is passed to x: + indx = x % 2 + indx = indx.astype(bool) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # h: inplace assignment with bool arays + z = x + z_num = x_num + z[indx] = 1 + z_num[indx_num] = 1 + assert np.array_equal(z, z_num) + print("IRINA DEBUG 3") + + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + z[:, indx_bool] = 5 + z_num[:, indx_bool_num] = 5 + assert np.array_equal(z, z_num) + + print("IRINA DEBUG 4") + # i: two bool array of the same shape are 
passed: + x = mk_seq_array( + np, + ( + 3, + 4, + 3, + 4, + ), + ) + x_num = mk_seq_array( + num, + ( + 3, + 4, + 3, + 4, + ), + ) + indx = np.array( + [ + [True, False, False, False], + [False, False, False, False], + [False, False, False, True], + ] + ) + indx_num = num.array(indx) + res = x[indx, indx] + print("IRINA DEBUG res = ", res.shape) + # res_num = x_num[indx_num, indx_num] + # assert np.array_equal(res, res_num) + + # j: 2 bool arrays should be broadcasted: + # res = x[idx, [True,False,False]] + # res_num = x_num[idx_num, [True,False,False]] + + # 2d bool array not at the first index: + indx = np.full((4, 3), True) + indx_num = num.array(indx) + res = x[:, indx] + # res_num = x_num[:, indx] + # assert np.array_equal(res, res_num) + + # 3: testing mixed type of the arguments passed: + + # a: bool and index arrays + x = mk_seq_array( + np, + ( + 2, + 3, + 4, + 5, + ), + ) + x_num = mk_seq_array( + num, + ( + 2, + 3, + 4, + 5, + ), + ) + res = x[[1, 1], [False, True, False]] + # res_num = x_num[[1,1], [False, True,False]] + # assert np.array_equal(res, res_num) + + res = x[[1, 1], :, [False, True, False, True]] + res_num = x_num[[1, 1], :, [False, True, False, True]] + assert np.array_equal(res, res_num) + + # b: combining basic and advanced indexing schemes + ind0 = np.array([1, 1]) + ind0_num = num.array(ind0) + res = x[ind0, :, -1] + res_num = x_num[ind0_num, :, -1] + assert np.array_equal(res, res_num) + + res = x[ind0, :, 1:3] + res_num = x_num[ind0_num, :, 1:3] + assert np.array_equal(res, res_num) + + res = x[1, :, ind0] + res_num = x_num[1, :, ind0_num] + assert np.array_equal(res, res_num) + + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[[0, 1], [0, 1], :, 2] + res_num = x_num[[0, 1], [0, 1], :, 2] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], 2] + res_num = x_num[..., [0, 1], 2] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, -1] + res_num = x_num[:, [0, 1], :, -1] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + # c: transformed base: + z = x[:, 1:] + z_num = x_num[:, 1:] + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) + + # d: shape mismatch case: + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + x_num = num.array(x) + + indx = np.ones((2, 2, 2), dtype=int) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + x = np.ones( + ( + 3, + 4, + ), + dtype=int, + ) + x_num = num.array(x) + ind = np.full((4,), True) + ind_num = num.array(ind) + res = x[:, ind] + res_num = x_num[:, ind_num] + assert np.array_equal(res, res_num) + + if LEGATE_MAX_DIM > 7: + x = np.ones((2, 3, 4, 5, 3, 4)) + ind1 = np.full((3, 4), True) + ind2 = np.full((3, 4), True) + x_num = num.array(x) + ind1_num = num.array(ind1) + ind2_num = num.array(ind2) + res = x[:, ind1, :, ind2] + res_num = x[:, ind1_num, :, ind2_num] + assert np.array_equal(res, res_num) + + # e: type mismatch case: + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((3,), 1, dtype=np.int32) + ind_num = num.array(ind) + res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + + x = np.ones((3, 4), dtype=float) + x_num = num.array(x) + ind = np.full((3,), 1) + ind_num = num.array(ind) + 
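What the type-mismatch case e checks, in plain NumPy terms: the selection is independent of the integer index width, which is why the _convert_key change earlier in the series can safely cast non-int64 keys to int64 behind a RuntimeWarning. A small sketch of the equivalence being relied on:

    import numpy as np

    x = np.arange(12, dtype=float).reshape(3, 4)
    i32 = np.full((3,), 1, dtype=np.int32)
    i64 = i32.astype(np.int64)
    # same selection either way; only the index dtype differs
    assert np.array_equal(x[i32, i32], x[i64, i64])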
res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + + x[ind, ind] = 5 + x_num[ind_num, ind_num] = 5 + assert np.array_equal(x, x_num) + + # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by + # 1 when passig 2d index array + for ndim in range(2, LEGATE_MAX_DIM): + a_shape = tuple(random.randint(2, 5) for i in range(ndim)) + np_array = mk_seq_array(np, a_shape) + num_array = mk_seq_array(num, a_shape) + # check when N of index arrays == N of dims + num_tuple_of_indices = tuple() + np_tuple_of_indices = tuple() + for i in range(ndim): + i_shape = (2, 4) + idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i] + idx_arr_num = num.array(idx_arr_np) + np_tuple_of_indices += (idx_arr_np,) + num_tuple_of_indices += (idx_arr_num,) + assert np.array_equal( + np_array[np_tuple_of_indices], num_array[num_tuple_of_indices] + ) + # check when N of index arrays == N of dims + i_shape = (2, 2) + idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + # test in-place assignment + np_array[idx_arr_np] = 2 + num_array[idx_arr_num] = 2 + assert np.array_equal(num_array, np_array) + idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal( + np_array[:, idx_arr_np], num_array[:, idx_arr_num] + ) + # test in-place assignment + np_array[:, idx_arr_np] = 3 + num_array[:, idx_arr_num] = 3 + assert np.array_equal(num_array, np_array) + if ndim > 2: + assert np.array_equal( + np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] + ) + assert np.array_equal( + np_array[:, idx_arr_np, idx_arr_np], + num_array[:, idx_arr_num, idx_arr_num], + ) + if ndim > 3: + assert np.array_equal( + np_array[:, idx_arr_np, :, idx_arr_np], + num_array[:, idx_arr_num, :, idx_arr_num], + ) + + +if __name__ == "__main__": + test() diff --git a/tests/index_routines.py b/tests/index_routines.py index abd3b9536..3f5344df5 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -24,438 +24,6 @@ from legate.core import LEGATE_MAX_DIM -def advanced_indexing(): - # simple advanced indexing: - print("advanced indexing test 1") - x = np.array([1, 2, 3, 4, 5, 6, 7]) - indx = np.array([1, 3, 5]) - res = x[indx] - x_num = num.array(x) - indx_num = num.array(indx) - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # after transformation: - x = x[1:] - x_num = x_num[1:] - res = x[indx] - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # advanced indexing test when a.ndim ==1 , indx.ndim >1 - print("advanced indexing test 2") - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - - # simple 2D case - print("advanced indexing test 3") - index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) - index_2d_num = num.array(index_2d) - assert np.array_equal(y[index_2d], y_num[index_2d_num]) - - z = np.array( - [ - [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], - ] - ) - z_num = num.array(z) - - zt = z.transpose( - ( - 1, - 0, - 2, - ) - ) - zt_num = z_num.transpose( - ( - 1, - 0, - 2, - ) - ) - - # mismatch dimesion case: - print("advanced indexing test 4") - indx = np.array([1, 1]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert 
np.array_equal(res, res_num) - - res = zt[indx] - res_num = zt_num[indx_num] - assert np.array_equal(res, res_num) - - res = z[:, :, indx] - res_num = z_num[:, :, indx_num] - assert np.array_equal(res, res_num) - - res = zt[:, :, indx] - res_num = zt_num[:, :, indx_num] - assert np.array_equal(res, res_num) - - res = z[:, indx, :] - res_num = z_num[:, indx_num, :] - assert np.array_equal(res, res_num) - - res = zt[:, indx, :] - res_num = zt_num[:, indx_num, :] - assert np.array_equal(res, res_num) - - # 2d: - indx = np.array([[1, 1], [1, 0]]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert np.array_equal(res, res_num) - - res = zt[indx] - res_num = zt_num[indx_num] - assert np.array_equal(res, res_num) - - res = z[:, indx] - res_num = z_num[:, indx_num] - assert np.array_equal(res, res_num) - - res = zt[:, indx] - res_num = zt_num[:, indx_num] - assert np.array_equal(res, res_num) - - # 2 arrays passed to 3d array - indx0 = np.array([1, 1]) - indx1 = np.array([1, 0]) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - res = z[indx0, indx1] - res_num = z_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[indx0, indx1] - res_num = zt_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = z[:, indx0, indx1] - res_num = z_num[:, indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[:, indx0, indx1] - res_num = zt_num[:, indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - # 2 index arrays passed in a sparse way: - x = mk_seq_array(np, (3, 4, 5, 6)) - x_num = mk_seq_array(num, (3, 4, 5, 6)) - res = x[:, [0, 1], :, [0, 1]] - res_num = x_num[:, [0, 1], :, [0, 1]] - assert np.array_equal(res, res_num) - - res = x[[0, 1], :, [0, 1], 1:] - res_num = x_num[[0, 1], :, [0, 1], 1:] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, 1:] - res_num = x_num[:, [0, 1], :, 1:] - assert np.array_equal(res, res_num) - - # 2 arrays with broadcasting - indx0 = np.array([1, 1]) - indx1 = np.array([[1, 0], [1, 0]]) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - res = z[indx0, indx1] - res_num = z_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[indx0, indx1] - res_num = zt_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - # mismatch dimesion case bool: - indx_bool = np.array([True, False]) - indx_bool_num = num.array(indx_bool) - res = z[indx_bool] - res_num = z_num[indx_bool_num] - assert np.array_equal(res, res_num) - - # test for bool array of the same dimension - print("advanced indexing test 5") - index = np.array([True, False, False, True, True, False]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - - # test in-place assignment fir the case when idx arr - # is 1d bool array: - y[index] = 3 - y_num[index_num] = 3 - assert np.array_equal(y, y_num) - - # test for bool array of the same dimension 2D - print("advanced indexing test 6") - indx_bool = np.array( - [ - [ - [False, True, False, False], - [True, True, False, False], - [True, False, True, False], - ], - [ - [False, True, False, False], - [True, True, False, False], - [True, False, True, False], - ], - ] - ) - indx_bool_num = num.array(indx_bool) - res = z[indx_bool] - res_num = z_num[indx_bool_num] - assert np.array_equal(res, res_num) - - # test in-place assignment fir the case when idx arr - # is 2d bool array: - z[indx_bool] = 1 - z_num[indx_bool] = 1 - assert np.array_equal(z, z_num) - - # test mixed data - 
print("advanced indexing test 7") - res = z[:, -1] - res_num = z_num[:, -1] - assert np.array_equal(res, res_num) - - # case when multiple number of arays is passed - print("advanced indexing test 8") - indx0 = np.array([[0, 1], [1, 0], [0, 0]]) - indx1 = np.array([[0, 1], [2, 0], [1, 2]]) - indx2 = np.array([[3, 2], [1, 0], [3, 2]]) - - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - indx2_num = num.array(indx2) - - res = z_num[indx0_num, indx1_num, indx2_num] - res_np = z[indx0, indx1, indx2] - assert np.array_equal(res, res_np) - - # test in-place assignment fir the case when - # several index arrays passed - z_num[indx0_num, indx1_num, indx2_num] = -2 - z[indx0, indx1, indx2] = -2 - assert np.array_equal(z, z_num) - - # indices with broadcast: - print("advanced indexing test 9") - indx0 = np.array([[0, 1], [1, 0], [0, 0]]) - indx1 = np.array([[0, 1]]) - indx2 = np.array([[3, 2], [1, 0], [3, 2]]) - - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - indx2_num = num.array(indx2) - res = z_num[indx0_num, indx1_num, indx2_num] - res_np = z[indx0, indx1, indx2] - assert np.array_equal(res, res_np) - - # Combining Basic and Advanced Indexing Schemes: - print("advanced indexing test 10") - ind0 = np.array([1, 1]) - ind0_num = num.array(ind0) - res = z[ind0, :, -1] - res_num = z_num[ind0_num, :, -1] - assert np.array_equal(res, res_num) - - res = z[ind0, :, [False, True, False, True]] - res_num = z_num[ind0_num, :, [False, True, False, True]] - assert np.array_equal(res, res_num) - - res = z[ind0, :, ind0] - res_num = z_num[ind0_num, :, ind0_num] - assert np.array_equal(res, res_num) - - res = z[ind0, :, 1:3] - res_num = z_num[ind0_num, :, 1:3] - assert np.array_equal(res, res_num) - - res = z[1, :, ind0] - res_num = z_num[1, :, ind0_num] - assert np.array_equal(res, res_num) - - x = mk_seq_array(np, (3, 4, 5, 6)) - x_num = mk_seq_array(num, (3, 4, 5, 6)) - res = x[[0, 1], [0, 1], :, 2] - res_num = x_num[[0, 1], [0, 1], :, 2] - assert np.array_equal(res, res_num) - - res = x[..., [0, 1], 2] - res_num = x_num[..., [0, 1], 2] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, -1] - res_num = x_num[:, [0, 1], :, -1] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, 1:] - res_num = x_num[:, [0, 1], :, 1:] - assert np.array_equal(res, res_num) - - # In-Place & Augmented Assignments via Advanced Indexing - # simple 1d case - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([2, 4, 0, 4, 4, 4]) - index_num = num.array(index) - y[index] = 0 - y_num[index_num] = 0 - assert np.array_equal(y, y_num) - - y[index] = np.array([1, 2, 3, 4, 5, 6]) - y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) - print(y) - print(y_num) - # Order on which data is updated in case when indexing array points to the - # same daya in the original array is not guaranteed, so we can't call - # assert np.array_equal(y, y_num) here - - index = np.array([1, 4, 3, 2, 0, 5]) - index_num = num.array(index) - y[index] = np.array([1, 2, 3, 4, 5, 6]) - y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) - print(y) - print(y_num) - assert np.array_equal(y, y_num) - - # the case when broadcast is needed: - index = np.array([[1, 4, 3], [2, 0, 5]]) - index_num = num.array(index) - y[index] = np.array([[1, 2, 3]]) - y_num[index_num] = num.array([[1, 2, 3]]) - print(y) - print(y_num) - assert np.array_equal(y, y_num) - - # 2D test - x = np.array( - [ - [0.38, -0.16, 0.38, -0.41, -0.04], - [-0.47, -0.01, -0.18, -0.5, -0.49], - [0.02, 0.4, 0.33, 0.33, -0.13], - ] - ) 
- indx0 = np.array([0, 1]) - indx1 = np.array([1, 2]) - x_num = num.array(x) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - x[indx0, indx1] = 2.0 - x_num[indx0_num, indx1_num] = 2.0 - assert np.array_equal(x, x_num) - - # shape mismatch: - indx = np.ones((2, 2, 2), dtype=int) - indx_num = num.array(indx) - res = x[indx] - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # use case when advanced indexing is called on a transformed array: - print("advanced indexing test 11") - z = z[:, 1:] - z_num = z_num[:, 1:] - indx = np.array([1, 1]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert np.array_equal(res, res_num) - - # in-place assignment - z[indx] = 10 - z_num[indx_num] = 10 - assert np.array_equal(z, z_num) - - x = np.ones((3, 4)) - x_num = num.array(x) - ind = np.full((4,), True) - ind_num = num.array(ind) - res = x[:, ind] - res_num = x_num[:, ind_num] - assert np.array_equal(res, res_num) - - if LEGATE_MAX_DIM > 7: - x = np.ones((2, 3, 4, 5, 3, 4)) - ind1 = np.full((3, 4), True) - ind2 = np.full((3, 4), True) - x_num = num.array(x) - ind1_num = num.array(ind1) - ind2_num = num.array(ind2) - res = x[:, ind1, :, ind2] - res_num = x[:, ind1_num, :, ind2_num] - assert np.array_equal(res, res_num) - - x = np.ones((3, 4)) - x_num = num.array(x) - ind = np.full((3,), 1, dtype=np.int32) - ind_num = num.array(ind) - res = x[ind, ind] - res_num = x_num[ind_num, ind_num] - assert np.array_equal(res, res_num) - - # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by - # 1 when passig 2d index array - for ndim in range(2, LEGATE_MAX_DIM): - a_shape = tuple(random.randint(2, 5) for i in range(ndim)) - np_array = mk_seq_array(np, a_shape) - num_array = mk_seq_array(num, a_shape) - # check when N of index arrays == N of dims - num_tuple_of_indices = tuple() - np_tuple_of_indices = tuple() - for i in range(ndim): - i_shape = (2, 4) - idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i] - idx_arr_num = num.array(idx_arr_np) - np_tuple_of_indices += (idx_arr_np,) - num_tuple_of_indices += (idx_arr_num,) - assert np.array_equal( - np_array[np_tuple_of_indices], num_array[num_tuple_of_indices] - ) - # check when N of index arrays == N of dims - i_shape = (2, 2) - idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] - idx_arr_num = num.array(idx_arr_np) - assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) - # test in-place assignment - np_array[idx_arr_np] = 2 - num_array[idx_arr_num] = 2 - assert np.array_equal(num_array, np_array) - idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) - idx_arr_num = num.array(idx_arr_np) - assert np.array_equal( - np_array[:, idx_arr_np], num_array[:, idx_arr_num] - ) - # test in-place assignment - np_array[:, idx_arr_np] = 3 - num_array[:, idx_arr_num] = 3 - assert np.array_equal(num_array, np_array) - if ndim > 2: - assert np.array_equal( - np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] - ) - assert np.array_equal( - np_array[:, idx_arr_np, idx_arr_np], - num_array[:, idx_arr_num, idx_arr_num], - ) - if ndim > 3: - assert np.array_equal( - np_array[:, idx_arr_np, :, idx_arr_np], - num_array[:, idx_arr_num, :, idx_arr_num], - ) - - return - - def test(): # -------------------------------------------------------------- # choose operator @@ -624,8 +192,6 @@ def test(): fn = np.diag(en, k=k) assert np.array_equal(f, fn) - advanced_indexing() - return From 502a4b25a9d8769a2d1e68f068ea590765de8317 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 
From 502a4b25a9d8769a2d1e68f068ea590765de8317 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 09:40:30 -0600
Subject: [PATCH 26/33] removing unnecessary call to the FILL task

---
 cunumeric/deferred.py | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 948f83186..a8edd089d 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -636,28 +636,14 @@ def set_item(self, key, rhs):
             copy_needed, lhs, index_array = self._create_indexing_array(
                 key, True
             )
-            # TODO: remove rhs.ndim ==0 logic when issue with scalars not being
-            # type of Store is addressed
-            if rhs.ndim == 0:
-                rhs_tmp = self.runtime.create_empty_thunk(
-                    index_array.base.shape,
-                    self.dtype,
-                    inputs=[],
-                )
-                task = self.context.create_task(CuNumericOpCode.FILL)
-                task.add_output(rhs_tmp.base)
-                task.add_input(rhs.base)
-                task.add_scalar_arg(False, bool)
-                task.execute()
-                rhs = rhs_tmp.base
+            rhs = self.runtime.to_deferred_array(rhs)
+            if rhs.shape != index_array.shape:
+                rhs_tmp = rhs._broadcast(index_array.base.shape)
+                rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
             else:
-                if rhs.shape != index_array.shape:
-                    rhs_tmp = rhs._broadcast(index_array.base.shape)
-                    rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
-                else:
-                    if rhs.base.transformed:
-                        rhs, rhs_base = rhs._copy_store(rhs.base)
-                    rhs = rhs.base
+                if rhs.base.transformed:
+                    rhs, rhs_base = rhs._copy_store(rhs.base)
+                rhs = rhs.base

             copy = self.context.create_copy()
             copy.add_input(rhs)
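A NumPy-level sketch of the set_item path after this patch (illustrative only, with ndarrays standing in for deferred stores): instead of a FILL task materializing a scalar right-hand side, the right-hand side is broadcast to the shape of the index array and the broadcast view is packed into a real buffer before it feeds the scatter copy. np.broadcast_to and np.ascontiguousarray stand in for _broadcast and _copy_store here:

    import numpy as np

    index_shape = (2, 3)
    rhs_scalar = np.float64(5.0)

    src = np.broadcast_to(rhs_scalar, index_shape)  # no data copied yet
    src = np.ascontiguousarray(src)                 # packed copy, like _copy_store

    assert src.shape == index_shape and (src == 5.0).all()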
From 95ae53d0c8d328f0e42423760f667d30a61bf0c8 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 14:24:08 -0600
Subject: [PATCH 27/33] fixed the logic for transposing the base array when
 bool arrays are passed as indices

---
 cunumeric/deferred.py      |  18 ++++--
 tests/advanced_indexing.py | 126 +++++++++++++++++++++++++++--------
 2 files changed, 113 insertions(+), 31 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index a8edd089d..33a3a3b4d 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -465,8 +465,18 @@ def _create_indexing_array(self, key, is_set=False):
             if np.isscalar(k) or isinstance(k, NumPyThunk):
                 if start_index == -1:
                     start_index = dim
-                transpose_indices += (dim,)
+                key_transpose_indices += (dim,)
                 transpose_needed = transpose_needed or ((dim - last_index) > 1)
+                if (
+                    isinstance(k, NumPyThunk)
+                    and k.dtype == np.bool
+                    and k.ndim >= 2
+                ):
+                    for i in range(dim, dim + k.ndim):
+                        transpose_indices += (shift + i,)
+                    shift += k.ndim - 1
+                else:
+                    transpose_indices += (dim,)
                 last_index = dim

         if transpose_needed:
@@ -474,7 +484,6 @@ def _create_indexing_array(self, key, is_set=False):
             post_indices = tuple(
                 i for i in range(store.ndim) if i not in transpose_indices
             )
-            key_transpose_indices = transpose_indices
             transpose_indices += post_indices
             post_indices = tuple(
                 i for i in range(len(key)) if i not in key_transpose_indices
@@ -483,8 +492,8 @@ def _create_indexing_array(self, key, is_set=False):
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)

-        for d, k in enumerate(key):
-            dim = d
+        shift = 0
+        for dim, k in enumerate(key):
             if np.isscalar(k):
                 if k < 0:
                     k += store.shape[dim + shift]
@@ -506,6 +515,7 @@ def _create_indexing_array(self, key, is_set=False):
                     # in case of the mixed indices we call nonzero
                     # for the bool array
                     k = k.nonzero()
+                    shift += len(k) - 1
                     tuple_of_arrays += k
                 else:
                     tuple_of_arrays += (k,)
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index f0e478eb9..2bd2c2c16 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -246,11 +246,9 @@ def test():
     res_num = xt_num[:, [0, 1], :, 1:]
     assert np.array_equal(res, res_num)

-    z = x
-    z_num = x_num
-    z[[0, 1], [0, 1]] = 11
-    z_num[[0, 1], [0, 1]] = 11
-    assert np.array_equal(z, z_num)
+    x[[0, 1], [0, 1]] = 11
+    x_num[[0, 1], [0, 1]] = 11
+    assert np.array_equal(x, x_num)

     # d: newaxis is passed along with array:

@@ -258,6 +256,22 @@ def test():
     res = x[..., [1, 0]]
     res_num = x_num[..., [1, 0]]
     assert np.array_equal(res, res_num)

+    xt = x.transpose(
+        (
+            1,
+            3,
+            0,
+            2,
+        )
+    )
+    xt_num = x_num.transpose(
+        (
+            1,
+            3,
+            0,
+            2,
+        )
+    )
     res = xt[..., [0, 1], 1:]
     res_num = xt_num[..., [0, 1], 1:]
     assert np.array_equal(res, res_num)
@@ -317,25 +331,17 @@ def test():
     res_num = x_num[..., indx_bool_num]
     assert np.array_equal(res, res_num)

-    print("IRINA DEBUG 1")
     indx1_bool = np.array([True, False])
     indx1_bool_num = num.array(indx1_bool)
     indx2_bool = np.array([True, False, True, True])
     indx2_bool_num = num.array(indx2_bool)
     res = x[indx1_bool, :, indx2_bool]
-    print(res.shape)
-    print(res)
     res_num = x_num[indx1_bool_num, :, indx2_bool_num]
-    print(res_num.shape)
-    print(res_num)
     assert np.array_equal(res, res_num)

-    print("IRINA DEBUG 2")
     res = x[indx1_bool, 1, indx2_bool]
-    # res_num = x_num[indx1_bool_num, 1, indx2_bool_num]
-    # print(res.shape)
-    # print(res_num.shape)
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[indx1_bool_num, 1, indx2_bool_num]
+    assert np.array_equal(res, res_num)

     # g: boolean array with the same shape is passed to x:
     indx = x % 2
@@ -351,7 +357,6 @@ def test():
     z[indx] = 1
     z_num[indx_num] = 1
     assert np.array_equal(z, z_num)
-    print("IRINA DEBUG 3")

     indx_bool = np.array([True, False, True])
     indx_bool_num = num.array(indx_bool)
@@ -359,7 +364,6 @@ def test():
     z_num[:, indx_bool_num] = 5
     assert np.array_equal(z, z_num)

-    print("IRINA DEBUG 4")
     # i: two bool array of the same shape are passed:
     x = mk_seq_array(
         np,
@@ -388,20 +392,66 @@ def test():
     )
     indx_num = num.array(indx)
     res = x[indx, indx]
-    print("IRINA DEBUG res = ", res.shape)
-    # res_num = x_num[indx_num, indx_num]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[indx_num, indx_num]
+    assert np.array_equal(res, res_num)

+    if LEGATE_MAX_DIM > 4:
+        x = mk_seq_array(
+            np,
+            (
+                3,
+                4,
+                5,
+                3,
+                4,
+            ),
+        )
+        x_num = mk_seq_array(
+            num,
+            (
+                3,
+                4,
+                5,
+                3,
+                4,
+            ),
+        )
+        res = x[indx, 1, indx]
+        res_num = x_num[indx_num, 1, indx_num]
+        assert np.array_equal(res, res_num)
+
+        res = x[indx, :, indx]
+        res_num = x_num[indx_num, :, indx_num]
+        assert np.array_equal(res, res_num)

     # j: 2 bool arrays should be broadcasted:
-    # res = x[idx, [True,False,False]]
-    # res_num = x_num[idx_num, [True,False,False]]
+    x = mk_seq_array(
+        np,
+        (
+            3,
+            4,
+            3,
+            4,
+        ),
+    )
+    x_num = mk_seq_array(
+        num,
+        (
+            3,
+            4,
+            3,
+            4,
+        ),
+    )
+    res = x[indx, [True, False, False]]
+    res_num = x_num[indx_num, [True, False, False]]
+    assert np.array_equal(res, res_num)

     # 2d bool array not at the first index:
     indx = np.full((4, 3), True)
     indx_num = num.array(indx)
     res = x[:, indx]
-    # res_num = x_num[:, indx]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[:, indx_num]
+    assert np.array_equal(res, res_num)

     # 3: testing mixed type of the arguments passed:

@@ -425,8 +475,8 @@ def test():
         ),
     )
     res = x[[1, 1], [False, True, False]]
-    # res_num = x_num[[1,1], [False, True,False]]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[[1, 1], [False, True, False]]
+    assert np.array_equal(res, res_num)

     res = x[[1, 1], :, [False, True, False, True]]
     res_num = x_num[[1, 1], :, [False, True, False, True]]
@@ -465,7 +515,7 @@ def test():
     res_num = x_num[:, [0, 1], :, 1:]
     assert np.array_equal(res, res_num)

-    # c: transformed base:
+    # c: transformed base or index or rhs:
     z = x[:, 1:]
     z_num = x_num[:, 1:]
     indx = np.array([1, 1])
@@ -474,6 +524,22 @@ def test():
     res_num = z_num[indx_num]
     assert np.array_equal(res, res_num)

+    indx = np.array([1, 1, 0])
+    indx_num = num.array(indx)
+    indx = indx[1:]
+    indx_num = indx_num[1:]
+    res = z[1, indx]
+    res_num = z_num[1, indx_num]
+    assert np.array_equal(res, res_num)
+
+    b = np.ones((2, 3, 6, 5))
+    b_num = num.array(b)
+    b = b.transpose((0, 1, 3, 2))
+    b_num = b_num.transpose((0, 1, 3, 2))
+    z[indx] = b
+    z_num[indx_num] = b_num
+    assert np.array_equal(z, z_num)
+
     # d: shape mismatch case:
     x = np.array(
         [
@@ -536,6 +602,12 @@ def test():
     x_num[ind_num, ind_num] = 5
     assert np.array_equal(x, x_num)

+    b = np.array([1, 2, 3], dtype=np.int16)
+    b_num = num.array(b)
+    x[ind, ind] = b
+    x_num[ind_num, ind_num] = b_num
+    assert np.array_equal(x, x_num)
+
     # we do less than LEGATE_MAX_DIM because the dimension will be increased by
     # 1 when passing 2d index array
     for ndim in range(2, LEGATE_MAX_DIM):
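A small NumPy-level sketch of the indexing rule the transpose logic in patch 27 relies on (illustrative only; the real code works on partitioned Legate stores, not ndarrays): when index arrays are separated by a slice or a scalar, the dimensions they produce move to the front of the result, which is exactly what one gets by first transposing the base array so all indexed dimensions are leading:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    i0 = np.array([0, 1])
    i1 = np.array([1, 2])

    # index arrays separated by ':' -- the broadcast index dims come first
    direct = x[i0, :, i1]                            # shape (2, 3)

    # same result, computed the way the deferred code arranges it:
    # transpose the indexed dims to the front, then index them together
    via_transpose = x.transpose(0, 2, 1)[i0, i1, :]  # shape (2, 3)

    assert np.array_equal(direct, via_transpose)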
From a3979dc41b8a0c784146a0e97a7a7a90da16be8c Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 22:41:29 -0600
Subject: [PATCH 28/33] adding logic for the set_item when the base array was
 transposed internally

---
 cunumeric/deferred.py      | 32 +++++++++++++++++++++++++-------
 tests/advanced_indexing.py |  4 ++++
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 33a3a3b4d..674143438 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -441,7 +441,7 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
         task.execute()
-        return False, rhs, out
+        return False, rhs, out, None

     if isinstance(key, NumPyThunk):
         key = (key,)
@@ -491,6 +491,8 @@ def _create_indexing_array(self, key, is_set=False):
             key_transpose_indices += post_indices
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)
+        else:
+            transpose_indices = None

         shift = 0
         for dim, k in enumerate(key):
@@ -532,12 +534,12 @@ def _create_indexing_array(self, key, is_set=False):

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
-            return True, rhs, output_arr
+            return True, rhs, output_arr, transpose_indices
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
-            return True, rhs, key
+            return True, rhs, key, transpose_indices
         else:
             raise ValueError("Advance indexing dimention mismatch")

@@ -601,7 +603,12 @@ def get_item(self, key):
         # Check to see if this is advanced indexing or not
         if is_advanced_indexing(key):
             # Create the indexing array
-            copy_needed, rhs, index_array = self._create_indexing_array(key)
+            (
+                copy_needed,
+                rhs,
+                index_array,
+                transpose_indices,
+            ) = self._create_indexing_array(key)
             store = rhs.base
             if copy_needed:
                 # Create a new array to be the result
@@ -643,9 +650,12 @@ def set_item(self, key, rhs):
         # Check to see if this is advanced indexing or not
         if is_advanced_indexing(key):
             # Create the indexing array
-            copy_needed, lhs, index_array = self._create_indexing_array(
-                key, True
-            )
+            (
+                copy_needed,
+                lhs,
+                index_array,
+                transpose_indices,
+            ) = self._create_indexing_array(key, True)
             rhs = self.runtime.to_deferred_array(rhs)
             if rhs.shape != index_array.shape:
                 rhs_tmp = rhs._broadcast(index_array.base.shape)
@@ -661,7 +671,15 @@ def set_item(self, key, rhs):
             copy.add_output(lhs.base)
             copy.execute()

+            # todo this copy will be removed when affine copies are
+            # supported in Legion/Realm
             if lhs is not self:
+                # if lhs was transposed in _create_indexing_array
+                # we need to transpose self as well
+                if transpose_indices is not None:
+                    store = self.base
+                    store = store.transpose(transpose_indices)
+                    self = DeferredArray(self.runtime, store, self.dtype)
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index 2bd2c2c16..d0662fbde 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -250,6 +250,10 @@ def test():
     x_num[[0, 1], [0, 1]] = 11
     assert np.array_equal(x, x_num)

+    x[[0, 1], :, [0, 1]] = 11
+    x_num[[0, 1], :, [0, 1]] = 11
+    assert np.array_equal(x, x_num)
+
     # d: newaxis is passed along with array:

     res = x[..., [1, 0]]
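The write-back that patch 28 adds can be pictured with a short NumPy sketch (for intuition only): NumPy's transpose returns a view, so the scatter on the transposed array writes through for free; for deferred stores the same effect needs the explicit transpose of `self` plus a deep copy that the patch performs:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    expected = x.copy()
    expected[[0, 1], :, [0, 1]] = 11     # what the user asked for

    xt = x.transpose(0, 2, 1)            # indexed dims made adjacent
    xt[[0, 1], [0, 1], :] = 11           # scatter on the transposed view
    assert np.array_equal(x, expected)   # the view writes through to x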
From f7990adc9c124dc4b48130117326f0175ff4c4ca Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 10:30:23 -0600
Subject: [PATCH 29/33] making set_item work for the case when any
 transformations are done to the base array internally

---
 cunumeric/deferred.py      | 25 +++++++++++--------------
 tests/advanced_indexing.py | 16 ++++++++++++++--
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 674143438..b34e6cb1f 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -441,7 +441,7 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
         task.execute()
-        return False, rhs, out, None
+        return False, rhs, out, self

     if isinstance(key, NumPyThunk):
         key = (key,)
@@ -491,8 +491,6 @@ def _create_indexing_array(self, key, is_set=False):
             key_transpose_indices += post_indices
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)
-        else:
-            transpose_indices = None

         shift = 0
         for dim, k in enumerate(key):
@@ -527,6 +525,11 @@ def _create_indexing_array(self, key, is_set=False):
                     "indexing operation",
                 )
         if store.transformed:
+            # in case this operation is called for the set_item, we need
+            # to apply all the transformations to self as well before
+            # creating a copy
+            if is_set:
+                self = DeferredArray(self.runtime, store, self.dtype)
             # after store is transformed we need to return a copy of
             # the store since Copy operation can't be done on
             # the store with transformation
@@ -534,12 +537,12 @@ def _create_indexing_array(self, key, is_set=False):

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
-            return True, rhs, output_arr, transpose_indices
+            return True, rhs, output_arr, self
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
-            return True, rhs, key, transpose_indices
+            return True, rhs, key, self
         else:
             raise ValueError("Advance indexing dimention mismatch")

@@ -607,7 +610,7 @@ def get_item(self, key):
                 copy_needed,
                 rhs,
                 index_array,
-                transpose_indices,
+                self,
             ) = self._create_indexing_array(key)
             store = rhs.base
             if copy_needed:
@@ -654,7 +657,7 @@ def set_item(self, key, rhs):
                 copy_needed,
                 lhs,
                 index_array,
-                transpose_indices,
+                self,
             ) = self._create_indexing_array(key, True)
             rhs = self.runtime.to_deferred_array(rhs)
             if rhs.shape != index_array.shape:
@@ -673,13 +676,7 @@ def set_item(self, key, rhs):

             # todo this copy will be removed when affine copies are
             # supported in Legion/Realm
-            if lhs is not self:
-                # if lhs was transposed in _create_indexing_array
-                # we need to transpose self as well
-                if transpose_indices is not None:
-                    store = self.base
-                    store = store.transpose(transpose_indices)
-                    self = DeferredArray(self.runtime, store, self.dtype)
+            if lhs is not self or self.base.transformed:
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index d0662fbde..b70ca512e 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -250,8 +250,16 @@ def test():
     x_num[[0, 1], [0, 1]] = 11
     assert np.array_equal(x, x_num)

-    x[[0, 1], :, [0, 1]] = 11
-    x_num[[0, 1], :, [0, 1]] = 11
+    x[[0, 1], :, [0, 1]] = 12
+    x_num[[0, 1], :, [0, 1]] = 12
     assert np.array_equal(x, x_num)

+    x[[0, 1], 1:3, [0, 1]] = 3.5
+    x_num[[0, 1], 1:3, [0, 1]] = 3.5
+    assert np.array_equal(x, x_num)
+
+    x[1:2, :, [0, 1]] = 7
+    x_num[1:2, :, [0, 1]] = 7
+    assert np.array_equal(x, x_num)
+
     # d: newaxis is passed along with array:

@@ -260,6 +268,10 @@ def test():
     res = x[..., [1, 0]]
     res_num = x_num[..., [1, 0]]
     assert np.array_equal(res, res_num)

+    x[..., [1, 0]] = 8
+    x_num[..., [1, 0]] = 8
+    assert np.array_equal(x, x_num)
+
     xt = x.transpose(
         (
             1,
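Patch 29 above replaces the transpose bookkeeping with a simpler rule: if `store` picked up any transformation, rebuild `self` from the transformed store and let the final deep copy write the result back. A rough sketch of that copy-back pattern, with a hypothetical helper name and plain ndarrays standing in for stores:

    import numpy as np

    def scatter_via_temp(view, idx, value):
        # a transformed store cannot be the target of a Copy operation,
        # so scatter into a packed temporary and copy it back explicitly
        tmp = np.ascontiguousarray(view)
        tmp[idx] = value
        view[...] = tmp

    z = np.arange(12).reshape(3, 4)
    zv = z[:, 1:]                        # a transformed view of the base
    scatter_via_temp(zv, np.array([1, 1]), 10)
    assert (z[1, 1:] == 10).all()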
From a444581894a951e6b4717df1d20b4f3ad7d936bf Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 10:58:32 -0600
Subject: [PATCH 30/33] some code clean-up + documentation

---
 cunumeric/deferred.py      | 30 +++++++++++++------------------
 tests/advanced_indexing.py |  2 ++
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index b34e6cb1f..774a910bf 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -314,7 +314,7 @@ def _zip_indices(self, start_index, arrays):
             start_index = 0

         new_arrays = tuple()
-        # check array's type and converting them to deferred arrays
+        # check array's type and convert them to deferred arrays
         for a in arrays:
             a = self.runtime.to_deferred_array(a)
             data_type = a.dtype
@@ -429,6 +429,9 @@ def _create_indexing_array(self, key, is_set=False):
             key = self.runtime.to_deferred_array(key)
             out_dtype = rhs.dtype
+            # in case this operation is called for the set_item, we
+            # return Point type field that is later used for
+            # indirect copy operation
             if is_set:
                 N = rhs.ndim
                 out_dtype = rhs.runtime.get_point_type(N)
@@ -456,8 +459,6 @@ def _create_indexing_array(self, key, is_set=False):
         transpose_needed = False
         transpose_indices = tuple()
         key_transpose_indices = tuple()
-        # since we can't call Copy operation on transformed Store, after
-        # the transformation, we need to return a copy
         tuple_of_arrays = ()

         # First, we need to check if transpose is needed
@@ -507,11 +508,12 @@ def _create_indexing_array(self, key, is_set=False):
                 if not isinstance(key, DeferredArray):
                     k = self.runtime.to_deferred_array(k)
                 if k.dtype == np.bool:
-                    if k.shape[0] != store.shape[dim + shift]:
-                        raise ValueError(
-                            "shape of boolean index did not match "
-                            "indexed array "
-                        )
+                    for i in range(k.ndim):
+                        if k.shape[i] != store.shape[dim + i + shift]:
+                            raise ValueError(
+                                "shape of boolean index did not match "
+                                "indexed array "
+                            )
                     # in case of the mixed indices we call nonzero
                     # for the bool array
                     k = k.nonzero()
@@ -525,9 +527,9 @@ def _create_indexing_array(self, key, is_set=False):
                     "indexing operation",
                 )
         if store.transformed:
-            # in case this operation is called for the set_item, we need
-            # to apply all the transformations to self as well before
-            # creating a copy
+            # in the case this operation is called for the set_item, we need
+            # to apply all the transformations done to `store` to `self`
+            # as well before creating a copy
             if is_set:
                 self = DeferredArray(self.runtime, store, self.dtype)
             # after store is transformed we need to return a copy of
             # the store since Copy operation can't be done on
             # the store with transformation

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
             return True, rhs, output_arr, self
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
+            # when key is transformed, we need to return a copy so that
+            # it can be used as an indirection in the copy operation
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
             return True, rhs, key, self
         else:
-            raise ValueError("Advance indexing dimention mismatch")
+            raise ValueError("Advanced indexing dimention mismatch")

     @staticmethod
     def _unpack_ellipsis(key, ndim):
@@ -674,7 +678,7 @@ def set_item(self, key, rhs):
             copy.add_output(lhs.base)
             copy.execute()

-            # todo this copy will be removed when affine copies are
+            # TODO this copy will be removed when affine copies are
             # supported in Legion/Realm
             if lhs is not self or self.base.transformed:
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index b70ca512e..4ef35de70 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -431,10 +431,12 @@ def test():
             4,
         ),
     )
+    # 2 bool arrays separated by scalar
     res = x[indx, 1, indx]
     res_num = x_num[indx_num, 1, indx_num]
     assert np.array_equal(res, res_num)

+    # 2 bool arrays separated by :
     res = x[indx, :, indx]
     res_num = x_num[indx_num, :, indx_num]
     assert np.array_equal(res, res_num)

From 8796301b18a51199daeb9765b7df02d8974d99f6 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 21:37:20 -0600
Subject: [PATCH 31/33] fixing some small issues

---
 cunumeric/deferred.py      | 14 ++++++++++----
 src/cunumeric/index/zip.cu | 10 +++++-----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 33fa69ebb..1c6d39266 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -443,6 +443,12 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_input(key.base)
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
+        task.add_broadcast(
+            key.base, axes=tuple(range(1, len(key.base.shape)))
+        )
+        task.add_broadcast(
+            rhs.base, axes=tuple(range(1, len(rhs.base.shape)))
+        )
         task.execute()
         return False, rhs, out, self

@@ -477,7 +483,7 @@ def _create_indexing_array(self, key, is_set=False):
                         transpose_indices += (shift + i,)
                     shift += k.ndim - 1
                 else:
-                    transpose_indices += (dim,)
+                    transpose_indices += ((dim + shift),)
                 last_index = dim

@@ -548,7 +554,7 @@ def _create_indexing_array(self, key, is_set=False):
                 key, key_store = key._copy_store(key.base)
             return True, rhs, key, self
         else:
-            raise ValueError("Advanced indexing dimention mismatch")
+            raise ValueError("Advanced indexing dimension mismatch")

     @staticmethod
     def _unpack_ellipsis(key, ndim):
@@ -663,7 +669,7 @@ def set_item(self, key, rhs):
                 index_array,
                 self,
             ) = self._create_indexing_array(key, True)
-            rhs = self.runtime.to_deferred_array(rhs)
+
             if rhs.shape != index_array.shape:
                 rhs_tmp = rhs._broadcast(index_array.base.shape)
                 rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
@@ -680,7 +686,7 @@ def set_item(self, key, rhs):

             # TODO this copy will be removed when affine copies are
             # supported in Legion/Realm
-            if lhs is not self or self.base.transformed:
+            if lhs is not self:
                 self.copy(lhs, deep=True)

         else:
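On the two add_broadcast calls in patch 31 above: assuming they follow the usual Legate convention where the listed axes are kept whole, the boolean-mask task's key and input stores may then only be partitioned along their first dimension, so corresponding chunks stay together. A tiny sketch of the axes computation (hypothetical standalone function, same expression as in the diff):

    def broadcast_axes(ndim):
        # keep every axis except the first unpartitioned
        return tuple(range(1, ndim))

    assert broadcast_axes(3) == (1, 2)
    assert broadcast_axes(1) == ()

The zip.cu changes that follow in patches 31 through 33 then widen the kernel size arguments, switch the kernels to the task's cached CUDA stream with a debug-build stream check, and adjust how the staging buffers in zero-copy memory are created.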
diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index c130b9bb4..f748b95d8 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -28,7 +28,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
              const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
-             int volume,
+             size_t volume,
              std::index_sequence<Is...>)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -44,7 +44,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel_dense(Point<N>* out,
                    const DeferredBuffer<const int64_t*, 1> index_arrays,
                    const Rect<DIM> rect,
-                   int volume,
+                   size_t volume,
                    std::index_sequence<Is...>)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,9 +61,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
              int narrays,
-             int volume,
-             int key_dim,
-             int start_index)
+             size_t volume,
+             int64_t key_dim,
+             int64_t start_index)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= volume) return;

From 30539a92a57191a2fb0a2d9ac7dbd0c2f2ac8924 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 21 Apr 2022 10:22:07 -0600
Subject: [PATCH 32/33] adding debugging checks for cuda task variants

---
 src/cunumeric/index/advanced_indexing.cu |  8 ++---
 src/cunumeric/index/zip.cu               | 38 +++++++++++++-----------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu
index c78fec4cf..ebfca1971 100644
--- a/src/cunumeric/index/advanced_indexing.cu
+++ b/src/cunumeric/index/advanced_indexing.cu
@@ -143,10 +143,9 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM> {
     size_t size           = 0;
     const bool* index_ptr = index.ptr(rect_index);
     const size_t volume   = rect_index.volume();
-    cudaStream_t stream;
-    cudaStreamCreate(&stream);
-    auto offsets = create_buffer<int64_t>(volume, Memory::Kind::GPU_FB_MEM);
-    size         = compute_size(index, pitches_index, rect_index, volume, stream, offsets);
+    auto stream  = get_cached_stream();
+    auto offsets = create_buffer<int64_t>(volume, Memory::Kind::GPU_FB_MEM);
+    size         = compute_size(index, pitches_index, rect_index, volume, stream, offsets);

     out = create_buffer<VAL>(size, Memory::Kind::GPU_FB_MEM);
     // populate output
@@ -162,6 +161,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM> {
                                                rect_index.lo,
                                                offsets);
     }
+    CHECK_CUDA_STREAM(stream);
     return size;
   }
 };
diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index f748b95d8..5a18f776d 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -25,7 +25,7 @@ using namespace Legion;
 template <int N, int DIM, size_t... Is>
 __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel(const AccessorWO<Point<N>, DIM> out,
-             const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
+             const Buffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
              size_t volume,
@@ -42,7 +42,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel_dense(Point<N>* out,
-                   const DeferredBuffer<const int64_t*, 1> index_arrays,
+                   const Buffer<const int64_t*, 1> index_arrays,
                    const Rect<DIM> rect,
                    size_t volume,
                    std::index_sequence<Is...>)
@@ -57,7 +57,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel(const AccessorWO<Point<N>, DIM> out,
-             const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
+             const Buffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
@@ -92,35 +92,37 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
                   const int64_t start_index,
                   std::index_sequence<Is...>) const
   {
+    auto stream         = get_cached_stream();
     const size_t volume = rect.volume();
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     if (index_arrays.size() == N) {
       if (dense) {
-        DeferredBuffer<const int64_t*, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                  Rect<1>(0, index_arrays.size() - 1));
+        auto index_buf = create_buffer<const int64_t*, 1>(
+          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) {
-          idx_arr[idx] = index_arrays[idx].ptr(rect);
+          index_buf[idx] = index_arrays[idx].ptr(rect);
         }
-        zip_kernel_dense<<<blocks, THREADS_PER_BLOCK>>>(
-          out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence<N>());
+        zip_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+          out.ptr(rect), index_buf, rect, volume, std::make_index_sequence<N>());
       } else {
-        DeferredBuffer<AccessorRO<int64_t, DIM>, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                            Rect<1>(0, index_arrays.size() - 1));
-        for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx];
-        zip_kernel<<<blocks, THREADS_PER_BLOCK>>>(
-          out, idx_arr, rect, pitches, volume, std::make_index_sequence<N>());
+        auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
+          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
+        zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+          out, index_buf, rect, pitches, volume, std::make_index_sequence<N>());
       }
     } else {
 #ifdef DEBUG_CUNUMERIC
       assert(index_arrays.size() < N);
 #endif
-      DeferredBuffer<AccessorRO<int64_t, DIM>, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                          Rect<1>(0, index_arrays.size() - 1));
-      for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx];
+      auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
+        index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+      for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
       int num_arrays = index_arrays.size();
-      zip_kernel<<<blocks, THREADS_PER_BLOCK>>>(
-        out, idx_arr, rect, pitches, num_arrays, volume, key_dim, start_index);
+      zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        out, index_buf, rect, pitches, num_arrays, volume, key_dim, start_index);
     }
+    CHECK_CUDA_STREAM(stream);
   }
 };

From 8704cb02ff139ee2a3c57568f073ec710da2df3a Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 21 Apr 2022 11:16:28 -0600
Subject: [PATCH 33/33] removing explicit alignment from the buffers

---
 src/cunumeric/index/zip.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index 5a18f776d..ae82e7d10 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -97,16 +97,16 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     if (index_arrays.size() == N) {
       if (dense) {
-        auto index_buf = create_buffer<const int64_t*, 1>(
-          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        auto index_buf =
+          create_buffer<const int64_t*, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) {
           index_buf[idx] = index_arrays[idx].ptr(rect);
         }
         zip_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
           out.ptr(rect), index_buf, rect, volume, std::make_index_sequence<N>());
       } else {
-        auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
-          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        auto index_buf =
+          create_buffer<AccessorRO<int64_t, DIM>, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
         zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
           out, index_buf, rect, pitches, volume, std::make_index_sequence<N>());
@@ -115,8 +115,8 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
 #ifdef DEBUG_CUNUMERIC
       assert(index_arrays.size() < N);
 #endif
-      auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
-        index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+      auto index_buf =
+        create_buffer<AccessorRO<int64_t, DIM>, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
       for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
       int num_arrays = index_arrays.size();
       zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(