From 867f7a318379d0b20bb3cf6ed4c409a3cad83aba Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 17 Feb 2022 16:10:37 -0700
Subject: [PATCH 01/33] adding support for Advanced Indexing case when several
 index arrays are passed

---
 cunumeric/config.py                  |   1 +
 cunumeric/deferred.py                | 136 ++++++++++++++++++++++-----
 cunumeric/runtime.py                 |   9 ++
 src/cunumeric.mk                     |   7 +-
 src/cunumeric/cunumeric_c.h          |   1 +
 src/cunumeric/index/zip.cc           |  64 +++++++++++++
 src/cunumeric/index/zip.cu           |  95 +++++++++++++++++++
 src/cunumeric/index/zip.h            |  42 +++++++++
 src/cunumeric/index/zip_omp.cc       |  59 ++++++++++++
 src/cunumeric/index/zip_template.inl |  71 ++++++++++++++
 tests/index_routines.py              |  43 +++++++++
 11 files changed, 502 insertions(+), 26 deletions(-)
 create mode 100644 src/cunumeric/index/zip.cc
 create mode 100644 src/cunumeric/index/zip.cu
 create mode 100644 src/cunumeric/index/zip.h
 create mode 100644 src/cunumeric/index/zip_omp.cc
 create mode 100644 src/cunumeric/index/zip_template.inl

diff --git a/cunumeric/config.py b/cunumeric/config.py
index 6c7303f56..e1462fbd3 100644
--- a/cunumeric/config.py
+++ b/cunumeric/config.py
@@ -114,6 +114,7 @@ class CuNumericOpCode(IntEnum):
     UNLOAD_CUDALIBS = _cunumeric.CUNUMERIC_UNLOAD_CUDALIBS
     WHERE = _cunumeric.CUNUMERIC_WHERE
     WRITE = _cunumeric.CUNUMERIC_WRITE
+    ZIP = _cunumeric.CUNUMERIC_ZIP
 
 
 # Match these to CuNumericUnaryOpCode in cunumeric_c.h
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index cc8779692..bc9927747 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -305,37 +305,129 @@ def get_scalar_array(self):
         result = np.frombuffer(buf, dtype=self.dtype, count=1)
         return result.reshape(())
 
+    def _zip_indices(self, arrays):
+        if not isinstance(arrays, tuple):
+            raise TypeError("zip_indices expects a tuple of arrays")
+        arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays)
+        # all arrays should have the same shape and type
+        shape = arrays[0].shape
+        data_type = arrays[0].dtype
+        if not np.issubdtype(data_type, np.integer):
+            raise TypeError("a array should be integer type")
+        for a in arrays:
+            if a.shape != shape:
+                raise TypeError(
+                    "shape of all index arrrays should be the same"
+                )
+            if data_type != a.dtype:
+                raise TypeError("type of all index arrrays should be the same")
+        # create output array which will store Point field where
+        # N is number of index arrays
+        # shape of the output array should be the same as the shape of each
+        # index array
+        # NOTE: We need to instantiate a RegionField of non-primitive
+        # dtype, to store N-dimensional index points, to be used as the
+        # indirection field in a copy.
+        # Such dtypes are technically not supported,
+        # but it should be safe to directly create a DeferredArray
+        # of that dtype, so long as we don't try to convert it to a
+        # NumPy array.
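+        # For intuition, the ZIP task constructed below matches this NumPy
+        # sketch (illustrative only; np.stack is not what actually runs):
+        #     out = np.stack(index_arrays, axis=-1)
+        # so out[i, j] holds the N-d coordinate
+        # (arrays[0][i, j], ..., arrays[N-1][i, j]) that later serves as
+        # the indirection point in a gather copy.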
+ N = len(arrays) + pointN_dtype = self.runtime.add_point_type(N) + store = self.context.create_store( + pointN_dtype, shape=shape, optimize_scalar=True + ) + output_arr = DeferredArray( + self.runtime, base=store, dtype=pointN_dtype + ) + # call ZIP function to combine index arrays into a singe array + task = self.context.create_task(CuNumericOpCode.ZIP) + task.add_output(output_arr.base) + for index_arr in arrays: + task.add_input(index_arr.base) + task.add_alignment(output_arr.base, index_arr.base) + task.execute() + + return output_arr + def _create_indexing_array(self, key): # Convert everything into deferred arrays of int64 + store = self.base + shift = 0 if isinstance(key, tuple): tuple_of_arrays = () - for k in key: - if not isinstance(k, NumPyThunk): - raise NotImplementedError( - "need support for mixed advanced indexing" + # for k in key: + for dim, k in enumerate(key): + if np.isscalar(k): + if k < 0: + k += store.shape[dim + shift] + store = store.project(dim + shift, k) + shift -= 1 + elif isinstance(k, slice): + store = store.slice(dim + shift, k) + elif isinstance(k, NumPyThunk): + if k.dtype == np.bool: + k = k.nonzero() + else: + raise TypeError( + "Unsupported entry type passed to advanced", + "indexing operation", ) - tuple_of_arrays += (k,) + tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: assert isinstance(key, NumPyThunk) # Handle the boolean array case - if key.dtype == bool: + if key.dtype == np.bool: + # irina fixme if key.ndim != self.ndim: raise TypeError( "Boolean advanced indexing dimension mismatch" ) - # For boolean arrays do the non-zero operation to make - # them into a normal indexing array + # IRINA fixme: replace `nonzero` case with the task with + # output regions tuple_of_arrays = key.nonzero() else: - tuple_of_arrays = (key,) - if len(tuple_of_arrays) != self.ndim: + tuple_of_arrays = (self.runtime.to_deferred_array(key),) + + if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if self.ndim > 1: - # Check that all the arrays can be broadcast together - # Concatenate all the arrays into a single array - raise NotImplementedError("need support for concatenating arrays") + + if len(tuple_of_arrays) > 1: + # shape = tuple_of_arrays[0].shape + # for i in range(1, len(tuple_of_arrays)): + # if shape != tuple_of_arrays[i].shape: + # raise ValueError("index arrays should be the same shape") + + # create output array which will store Point field where + # N is number of index arrays + # shape of the output array should be the same as the shape of each + # index array + # NOTE: We need to instantiate a RegionField of non-primitive + # dtype, to store N-dimensional index points, to be used as the + # indirection field in a copy. + # Such dtypes are technically not supported, + # but it should be safe to directly create a DeferredArray + # of that dtype, so long as we don't try to convert it to a + # NumPy array. 
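+            # The indirect copy that later consumes this point array behaves
+            # like the following NumPy gather (a sketch; `src` and `pts` are
+            # illustrative names, not variables in this code):
+            #     pts = np.stack(tuple_of_arrays, axis=-1)
+            #     result = src[tuple(np.moveaxis(pts, -1, 0))]
+            # i.e. result[i, j] = src[tuple(pts[i, j])].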
+ # out_dtype = np.dtype((np.int64, (len(tuple_of_arrays),))) + # output_arr = DeferredArray( + # self.runtime, + # base=tuple_of_arrays[0].base, + # dtype=out_dtype, + # ) + + # # call ZIP function to combine index arrays into a singe array + # task = self.context.create_task(CuNumericOpCode.ZIP) + # task.add_output(output_arr.base) + # for index_arr in tuple_of_arrays: + # task.add_input(index_arr.base) + # task.add_alignment(index_arr.base, output_arr.base) + # task.execute() + + output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) + return store, output_arr else: - return self.runtime.to_deferred_array(tuple_of_arrays[0]) + return store, tuple_of_arrays[0] @staticmethod def _unpack_ellipsis(key, ndim): @@ -397,28 +489,24 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - index_array = self._create_indexing_array(key) + store, index_array = self._create_indexing_array(key) + # Create a new array to be the result result = self.runtime.create_empty_thunk( index_array.base.shape, self.dtype, inputs=[self], ) - - if self.ndim != index_array.ndim: - raise NotImplementedError( - "need support for indirect partitioning" - ) - copy = self.context.create_copy() - copy.add_input(self.base) + copy.add_input(store) copy.add_source_indirect(index_array.base) copy.add_output(result.base) - copy.add_alignment(index_array.base, result.base) + # copy.add_alignment(index_array.base, result.base) copy.execute() + else: result = self._get_view(key) diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py index a4c224fe7..a6d840356 100644 --- a/cunumeric/runtime.py +++ b/cunumeric/runtime.py @@ -175,6 +175,15 @@ def get_arg_dtype(self, value_dtype): dtype.register_reduction_op(redop, redop_id) return arg_dtype + def add_point_type(self, n): + type_system = self.legate_context.type_system + point_type = "point" + str(n) + if point_type not in type_system: + code = type_system[ty.int64].code + size_in_bytes = 8 * n + type_system.add_type(point_type, size_in_bytes, code) + return point_type + def _report_coverage(self): total = len(self.api_calls) implemented = sum(int(impl) for (_, _, impl) in self.api_calls) diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 1adecd008..9778d5dd1 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -27,6 +27,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/nullary/fill.cc \ cunumeric/index/choose.cc \ cunumeric/index/repeat.cc \ + cunumeric/index/zip.cc \ cunumeric/item/read.cc \ cunumeric/item/write.cc \ cunumeric/matrix/contract.cc \ @@ -66,6 +67,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/nullary/fill_omp.cc \ cunumeric/index/choose_omp.cc \ cunumeric/index/repeat_omp.cc \ + cunumeric/index/zip_omp.cc \ cunumeric/matrix/contract_omp.cc \ cunumeric/matrix/diag_omp.cc \ cunumeric/matrix/gemm_omp.cc \ @@ -102,10 +104,11 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/nullary/arange.cu \ cunumeric/nullary/eye.cu \ cunumeric/nullary/fill.cu \ + cunumeric/index/choose.cu \ + cunumeric/index/repeat.cu \ + cunumeric/index/zip.cu \ cunumeric/item/read.cu \ cunumeric/item/write.cu \ - cunumeric/index/choose.cu \ - cunumeric/index/repeat.cu \ cunumeric/matrix/contract.cu \ cunumeric/matrix/diag.cu \ cunumeric/matrix/gemm.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 1c4c5d84c..68f4f56fd 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -59,6 +59,7 @@ enum CuNumericOpCode { 
CUNUMERIC_UNLOAD_CUDALIBS, CUNUMERIC_WHERE, CUNUMERIC_WRITE, + CUNUMERIC_ZIP, }; // Match these to UnaryOpCode in config.py diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc new file mode 100644 index 000000000..364d7e973 --- /dev/null +++ b/src/cunumeric/index/zip.cc @@ -0,0 +1,64 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + // std::cout<<"IRINA DEBUG dense out = "<(index_arrays[Is][p]...); + // std::cout<<"IRINA DEBUG out = "<(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { ZipTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu new file mode 100644 index 000000000..1a68e62b1 --- /dev/null +++ b/src/cunumeric/index/zip.cu @@ -0,0 +1,95 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
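+ *
+ * For reference, the CPU variant in zip.cc above writes, in the dense
+ * (row-major) case, for each flat index idx,
+ *     outptr[idx] = Point(index_arrays[0].ptr(rect)[idx], ...,
+ *                         index_arrays[N-1].ptr(rect)[idx]);
+ * in the general case it first unflattens idx into a DIM-dimensional point
+ * via the pitches, with the pack expanded through std::index_sequence.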
+ * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel(const AccessorWO, DIM> out, + const DeferredBuffer, 1> index_arrays, + const Rect rect, + const Pitches pitches, + int volume, + std::index_sequence) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + printf("IRINA DEBUG point = %d %d %d \n", out[p][0], out[p][1], out[p][2]); +} + +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel_dense(Point* out, + const DeferredBuffer index_arrays, + const Rect rect, + int volume, + std::index_sequence) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + out[idx] = Legion::Point(index_arrays[Is][idx]...); + printf("IRINA DEBUG dense point = %d %d %d \n", out[idx][0], out[idx][1], out[idx][2]); + printf("IRINA DEBUG dense index_arr = %d %d %d \n", + index_arrays[0][idx], + index_arrays[1][idx], + index_arrays[2][idx]); +} + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + if (dense) { + DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { + idx_arr[idx] = index_arrays[idx].ptr(rect); + } + zip_kernel_dense<<>>( + out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); + } else { + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + zip_kernel<<>>( + out, idx_arr, rect, pitches, volume, std::make_index_sequence()); + } + } +}; + +/*static*/ void ZipTask::gpu_variant(TaskContext& context) +{ + zip_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h new file mode 100644 index 000000000..ae7476b05 --- /dev/null +++ b/src/cunumeric/index/zip.h @@ -0,0 +1,42 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct ZipArgs { + const Array& out; + const std::vector& inputs; +}; + +class ZipTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_ZIP; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc new file mode 100644 index 000000000..8cfebb32d --- /dev/null +++ b/src/cunumeric/index/zip_omp.cc @@ -0,0 +1,59 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/zip.h" +#include "cunumeric/index/zip_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody { + using VAL = int64_t; + + template + void operator()(const AccessorWO, DIM>& out, + const std::vector>& index_arrays, + const Rect& rect, + const Pitches& pitches, + bool dense, + std::index_sequence) const + { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } + } // else + } +}; + +/*static*/ void ZipTask::omp_variant(TaskContext& context) +{ + zip_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl new file mode 100644 index 000000000..bf8d95394 --- /dev/null +++ b/src/cunumeric/index/zip_template.inl @@ -0,0 +1,71 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct ZipImplBody; + +template +struct ZipImpl { + template + void operator()(ZipArgs& args) const + { + using VAL = int64_t; + auto out_rect = args.out.shape(); + auto out = args.out.write_accessor, DIM>(out_rect); + auto index_rect = args.inputs[0].shape(); + Pitches pitches; + size_t volume = pitches.flatten(index_rect); + if (volume == 0) return; + + std::cout << "IRINA DEBUG N=" << N << " , D = " << DIM << std::endl; + + std::cout << "IRINA DEBUG idex_rect = " << index_rect << "out_rect = " << out_rect << std::endl; +#ifndef LEGION_BOUNDS_CHECKS + bool dense = out.accessor.is_dense_row_major(out_rect); +#endif + std::vector> index_arrays; + for (int i = 0; i < args.inputs.size(); i++) { +#ifdef CUNUMERIC_DEBUG + assert(index_rect == args.inputs[i].shape()); +#endif + index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); + dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); + } + +#ifdef LEGION_BOUNDS_CHECKS + bool dense = false; +#endif + + ZipImplBody()( + out, index_arrays, index_rect, pitches, dense, std::make_index_sequence()); + } +}; + +template +static void zip_template(TaskContext& context) +{ + ZipArgs args{context.outputs()[0], context.inputs()}; + double_dispatch(args.inputs[0].dim(), args.inputs.size(), ZipImpl{}, args); +} + +} // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 3906643b0..d6e598712 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -24,6 +24,45 @@ from legate.core import LEGATE_MAX_DIM +def advanced_indexing(): + + arr = num.array([1, 2, 3, 4, 5, 6, 7]) + indx = num.array([1, 3, 5]) + res = arr[indx] + z = np.array( + [ + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], + [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], + ] + ) + # ind0 = np.array([True, False]) + z_num = num.array(z) + # ind0_num = np.array(ind0) + # res = z_num[-1, :] + # print(res) + # print(z[-1, :]) + + # indx0_num = num.array([0, 0]) + # indx1_num = num.array([1, 1]) + # indx2_num = num.array([2, 2]) + + # indx0_num._thunk._zip_indices( + # (indx0_num._thunk, indx1_num._thunk, indx2_num._thunk,)) + + indx0_num = num.array([[0, 0], [0, 0], [0, 0]]) + indx1_num = num.array([[1, 1], [1, 1], [1, 1]]) + indx2_num = num.array([[2, 2], [2, 2], [2, 2]]) + + # indx0_num._thunk._zip_indices((indx0_num._thunk, + # indx1_num._thunk, indx2_num._thunk,)) + + res = z_num[indx0_num, indx1_num, indx2_num] + print(res) + + # res = z_num[ind0_num, :, indx] + return + + def test(): # -------------------------------------------------------------- # choose operator @@ -192,6 +231,10 @@ def test(): fn = np.diag(en, k=k) assert np.array_equal(f, fn) + advanced_indexing() + + return + if __name__ == "__main__": test() From 235716691e004454f88ca235d7d9449c94e4708c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 10 Mar 2022 20:17:12 -0800 Subject: [PATCH 02/33] adding more tests for advanced indexing --- cunumeric/deferred.py | 46 ++--------- src/cunumeric/index/zip.cu | 6 -- src/cunumeric/index/zip_template.inl | 3 - tests/index_routines.py | 113 +++++++++++++++++++++++---- 4 files changed, 104 insertions(+), 64 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index bc9927747..16a15bcfa 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -340,6 +340,7 @@ def _zip_indices(self, arrays): output_arr = DeferredArray( self.runtime, 
base=store, dtype=pointN_dtype ) + # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) @@ -368,21 +369,21 @@ def _create_indexing_array(self, key): elif isinstance(k, NumPyThunk): if k.dtype == np.bool: k = k.nonzero() + tuple_of_arrays += k + else: + tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: raise TypeError( "Unsupported entry type passed to advanced", "indexing operation", ) - tuple_of_arrays += (self.runtime.to_deferred_array(k),) else: assert isinstance(key, NumPyThunk) + # irina fixme + if key.ndim != self.ndim: + raise TypeError("Advanced indexing dimension mismatch") # Handle the boolean array case if key.dtype == np.bool: - # irina fixme - if key.ndim != self.ndim: - raise TypeError( - "Boolean advanced indexing dimension mismatch" - ) # IRINA fixme: replace `nonzero` case with the task with # output regions tuple_of_arrays = key.nonzero() @@ -393,37 +394,6 @@ def _create_indexing_array(self, key): raise TypeError("Advanced indexing dimension mismatch") if len(tuple_of_arrays) > 1: - # shape = tuple_of_arrays[0].shape - # for i in range(1, len(tuple_of_arrays)): - # if shape != tuple_of_arrays[i].shape: - # raise ValueError("index arrays should be the same shape") - - # create output array which will store Point field where - # N is number of index arrays - # shape of the output array should be the same as the shape of each - # index array - # NOTE: We need to instantiate a RegionField of non-primitive - # dtype, to store N-dimensional index points, to be used as the - # indirection field in a copy. - # Such dtypes are technically not supported, - # but it should be safe to directly create a DeferredArray - # of that dtype, so long as we don't try to convert it to a - # NumPy array. 
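+            # Recap of the mixed-key handling added above (sketch): scalar
+            # keys are folded into `store` via project(), slices via
+            # slice(), and a boolean thunk k is expanded with k.nonzero(),
+            # using the NumPy identity
+            #     a[mask] == a[mask.nonzero()] == a[rows, cols]
+            # so only integer index arrays remain to be zipped.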
- # out_dtype = np.dtype((np.int64, (len(tuple_of_arrays),))) - # output_arr = DeferredArray( - # self.runtime, - # base=tuple_of_arrays[0].base, - # dtype=out_dtype, - # ) - - # # call ZIP function to combine index arrays into a singe array - # task = self.context.create_task(CuNumericOpCode.ZIP) - # task.add_output(output_arr.base) - # for index_arr in tuple_of_arrays: - # task.add_input(index_arr.base) - # task.add_alignment(index_arr.base, output_arr.base) - # task.execute() - output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) return store, output_arr else: @@ -503,8 +473,6 @@ def get_item(self, key): copy.add_source_indirect(index_array.base) copy.add_output(result.base) - # copy.add_alignment(index_array.base, result.base) - copy.execute() else: diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 1a68e62b1..1bd8b6aef 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -35,7 +35,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); out[p] = Legion::Point(index_arrays[Is][p]...); - printf("IRINA DEBUG point = %d %d %d \n", out[p][0], out[p][1], out[p][2]); } template @@ -49,11 +48,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; out[idx] = Legion::Point(index_arrays[Is][idx]...); - printf("IRINA DEBUG dense point = %d %d %d \n", out[idx][0], out[idx][1], out[idx][2]); - printf("IRINA DEBUG dense index_arr = %d %d %d \n", - index_arrays[0][idx], - index_arrays[1][idx], - index_arrays[2][idx]); } template diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index bf8d95394..f16b89474 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,9 +37,6 @@ struct ZipImpl { size_t volume = pitches.flatten(index_rect); if (volume == 0) return; - std::cout << "IRINA DEBUG N=" << N << " , D = " << DIM << std::endl; - - std::cout << "IRINA DEBUG idex_rect = " << index_rect << "out_rect = " << out_rect << std::endl; #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); #endif diff --git a/tests/index_routines.py b/tests/index_routines.py index d6e598712..599d7a045 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -35,31 +35,112 @@ def advanced_indexing(): [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], ] ) - # ind0 = np.array([True, False]) z_num = num.array(z) - # ind0_num = np.array(ind0) - # res = z_num[-1, :] - # print(res) - # print(z[-1, :]) - - # indx0_num = num.array([0, 0]) - # indx1_num = num.array([1, 1]) - # indx2_num = num.array([2, 2]) - # indx0_num._thunk._zip_indices( - # (indx0_num._thunk, indx1_num._thunk, indx2_num._thunk,)) + # simple advance indexing: + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) + + # simple 2D case + # fixme dimension mismatch case + # index_2d = np.array([[ 1, 2, 0], + # [ 5, 5, 5], + # [ 2, 3, 4]]) + # index_2d_num = num.array(index_2d) + # assert np.array_equal(y[index_2d], y_num[index_2d_num]) + + # mismatch dimesion case: + # indx_bool = np.array([True, True]) + # indx_bool_num = num.array(indx_bool) + # res = z[indx_bool] + # res_num = z_num[indx_bool_num] + # print ("bool array as indx np:") + # print(res) + # 
print ("cunumeric:") + # print (res_num) - indx0_num = num.array([[0, 0], [0, 0], [0, 0]]) - indx1_num = num.array([[1, 1], [1, 1], [1, 1]]) - indx2_num = num.array([[2, 2], [2, 2], [2, 2]]) + # test for bool array of the same dimension + indx_bool = np.array( + [ + [ + [False, True, False, False], + [True, True, False, False], + [True, False, True, False], + ], + [ + [False, True, False, False], + [True, True, False, False], + [True, False, True, False], + ], + ] + ) + indx_bool_num = num.array(indx_bool) + res = z[indx_bool] + res_num = z_num[indx_bool_num] + print("bool array as indx np:") + print(res) + print(z[indx_bool.nonzero()]) + print("cunumeric:") + print(res_num) + # fixme unomment when nonzero is fixed + # assert np.array_equal(res, res_num) + + # test mixed data + res = z[-1, :] + res_num = z_num[-1, :] + assert np.array_equal(res, res_num) + + # case when multiple number of arays is send + indx0 = np.array([[0, 1], [1, 0], [0, 0]]) + indx1 = np.array([[0, 1], [2, 0], [1, 2]]) + indx2 = np.array([[3, 2], [1, 0], [3, 2]]) + + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + indx2_num = num.array(indx2) # indx0_num._thunk._zip_indices((indx0_num._thunk, # indx1_num._thunk, indx2_num._thunk,)) res = z_num[indx0_num, indx1_num, indx2_num] - print(res) + res_np = z[indx0, indx1, indx2] + assert np.array_equal(res, res_np) + + # FIXME: Combining Basic and Advanced Indexing Schemes: + # ind0 = np.array([True, True]) + # ind0_num=num.array(ind0) + # res = z[ind0, :, -1] + # res_num = z_num[ind0_num, :, -1] + # print (res) + # fixme error + # print(res_num) + # assert np.array_equal(res, res_num) + + # In-Place & Augmented Assignments via Advanced Indexing + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + indx0 = np.array([0, 2]) + indx1 = np.array([2, 4]) + # x_num = num.array(x) + # indx0_num = num.array(indx0) + # indx1_num = num.array(indx1) + print(x[indx0, indx1]) + # FIXME 0: + # print (x_num[indx0_num,indx1_num]) + # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) + # print (x_num[indx0_num, indx1_num]) + x[indx0, indx1] = 0.0 + print(x) + # x_num[indx0_num, indx1_num] =0.0 - # res = z_num[ind0_num, :, indx] return From abc583ee563ecf8ea66a6755581e77182f4dce37 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 11 Mar 2022 14:27:09 -0800 Subject: [PATCH 03/33] addressing dimension mismatch case --- cunumeric/deferred.py | 49 ++++++++++++------ tests/index_routines.py | 112 ++++++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 60 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 16a15bcfa..3bd950e3b 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -379,9 +379,21 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) - # irina fixme - if key.ndim != self.ndim: - raise TypeError("Advanced indexing dimension mismatch") + if key.ndim < store.ndim: + raise TypeError("Unimplimented") + # FIXME advance indexing task + # diff = store.ndim - key.ndim + # print ("IRINA DEBUG store ndim = " , store) + # for i in range(diff): + # store = store.slice((store.ndim - i - 1), slice(None)) + # print ("IRINA DEBUG store ndim = " , store) + elif key.ndim > store.ndim: + if store.ndim != 1: + raise ValueError("Advance indexing dimention mismatch") + diff = store.ndim - key.ndim + for i in range(diff): + store = store.promote(i + 1, store.shape[0]) + # Handle the boolean array 
case if key.dtype == np.bool: # IRINA fixme: replace `nonzero` case with the task with @@ -460,7 +472,6 @@ def get_item(self, key): if self._is_advanced_indexing(key): # Create the indexing array store, index_array = self._create_indexing_array(key) - # Create a new array to be the result result = self.runtime.create_empty_thunk( index_array.base.shape, @@ -498,23 +509,31 @@ def set_item(self, key, rhs): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - index_array = self._create_indexing_array(key) - if index_array.shape != rhs.shape: - raise ValueError( - "Advanced indexing array does not match source shape" - ) - if self.ndim != index_array.ndim: - raise NotImplementedError( - "need support for indirect partitioning" + store, index_array = self._create_indexing_array(key) + # if index_array.shape != rhs.shape: + # raise ValueError( + # "Advanced indexing array does not match source shape" + # ) + # if self.ndim != index_array.ndim: + # raise NotImplementedError( + # "need support for indirect partitioning" + # ) + if rhs.ndim == 0: + shape = store.shape + val = rhs + rhs = self.runtime.create_empty_thunk( + shape, + self.dtype, + inputs=[self], ) - + rhs.fill(val) copy = self.context.create_copy() - copy.add_input(rhs.base) + copy.add_input(store) copy.add_target_indirect(index_array.base) copy.add_output(self.base) - copy.add_alignment(index_array.base, rhs.base) + # copy.add_alignment(index_array.base, rhs.base) copy.execute() diff --git a/tests/index_routines.py b/tests/index_routines.py index 599d7a045..fcef0aefe 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -25,10 +25,24 @@ def advanced_indexing(): + # simple advance indexing: + print("advance indexing test 1") + x = np.array([1, 2, 3, 4, 5, 6, 7]) + indx = np.array([1, 3, 5]) + res = x[indx] + x_num = num.array(x) + indx_num = num.array(indx) + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # advance indexing test when a.ndim ==1 , indx.ndim >1 + print("advance indexing test 2") + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) - arr = num.array([1, 2, 3, 4, 5, 6, 7]) - indx = num.array([1, 3, 5]) - res = arr[indx] z = np.array( [ [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], @@ -37,32 +51,28 @@ def advanced_indexing(): ) z_num = num.array(z) - # simple advance indexing: - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([2, 4, 0, 4, 4, 4]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - # simple 2D case - # fixme dimension mismatch case - # index_2d = np.array([[ 1, 2, 0], - # [ 5, 5, 5], - # [ 2, 3, 4]]) - # index_2d_num = num.array(index_2d) - # assert np.array_equal(y[index_2d], y_num[index_2d_num]) + print("advance indexing test 3") + index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) + index_2d_num = num.array(index_2d) + assert np.array_equal(y[index_2d], y_num[index_2d_num]) # mismatch dimesion case: - # indx_bool = np.array([True, True]) + # print ("advance indexing test 4") + # indx_bool = np.array([True, False]) # indx_bool_num = num.array(indx_bool) # res = z[indx_bool] - # res_num = z_num[indx_bool_num] - # print ("bool array as indx np:") + # print("IRINA DEBUG") + # assert np.array_equal(indx_bool.nonzero(), indx_bool_num.nonzero()) + # print("bool array as indx np:") # print(res) 
- # print ("cunumeric:") - # print (res_num) + # print("cunumeric:") + # res_num = z_num[indx_bool_num] + # print(res_num) + # assert np.array_equal(res, res_num) # test for bool array of the same dimension + print("advance indexing test 5") indx_bool = np.array( [ [ @@ -80,20 +90,20 @@ def advanced_indexing(): indx_bool_num = num.array(indx_bool) res = z[indx_bool] res_num = z_num[indx_bool_num] - print("bool array as indx np:") - print(res) - print(z[indx_bool.nonzero()]) - print("cunumeric:") - print(res_num) - # fixme unomment when nonzero is fixed - # assert np.array_equal(res, res_num) + # print("bool array as indx np:") + # print(res) + # print("cunumeric:") + # print(res_num) + assert np.array_equal(res, res_num) # test mixed data + print("advance indexing test 6") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) - # case when multiple number of arays is send + # case when multiple number of arays is passed + print("advance indexing test 7") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -110,35 +120,47 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # FIXME: Combining Basic and Advanced Indexing Schemes: - # ind0 = np.array([True, True]) - # ind0_num=num.array(ind0) + # print ("advance indexing test 8") + # ind0 = np.array([1, 1]) + # ind0_num = num.array(ind0) # res = z[ind0, :, -1] # res_num = z_num[ind0_num, :, -1] - # print (res) - # fixme error + # print(res) # print(res_num) # assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing - x = np.array( - [ - [0.38, -0.16, 0.38, -0.41, -0.04], - [-0.47, -0.01, -0.18, -0.5, -0.49], - [0.02, 0.4, 0.33, 0.33, -0.13], - ] - ) - indx0 = np.array([0, 2]) - indx1 = np.array([2, 4]) + # simple 1d case + # y = np.array([0, -1, -2, -3, -4, -5]) + # y_num = num.array(y) + # index = np.array([2, 4, 0, 4, 4, 4]) + # index_num = num.array(index) + # print (y[index]) + # print(y_num[index]) + # y[index] = 0 + # y_num[index_num] =0 + # print (y_num) + + # 2D test + # x = np.array( + # [ + # [0.38, -0.16, 0.38, -0.41, -0.04], + # [-0.47, -0.01, -0.18, -0.5, -0.49], + # [0.02, 0.4, 0.33, 0.33, -0.13], + # ] + # ) + # indx0 = np.array([0, 1]) + # indx1 = np.array([1, 2]) # x_num = num.array(x) # indx0_num = num.array(indx0) # indx1_num = num.array(indx1) - print(x[indx0, indx1]) + # print(x[indx0, indx1]) # FIXME 0: # print (x_num[indx0_num,indx1_num]) # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) # print (x_num[indx0_num, indx1_num]) - x[indx0, indx1] = 0.0 - print(x) + # x[indx0, indx1] = 0.0 + # print(x) # x_num[indx0_num, indx1_num] =0.0 return From 3925e52ddf2ccac51202297d6a5ab358db9bc59b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 14 Mar 2022 14:24:58 -0700 Subject: [PATCH 04/33] adding broadcasting for index arrays --- cunumeric/deferred.py | 46 ++++++++++++++++++++++++++--------------- tests/index_routines.py | 30 +++++++++++++++++++-------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3bd950e3b..37f5d5390 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -314,13 +314,16 @@ def _zip_indices(self, arrays): data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") + new_arrays = tuple() for a in arrays: - if a.shape != shape: - raise TypeError( - "shape of all index arrrays should be the same" - ) 
if data_type != a.dtype: raise TypeError("type of all index arrrays should be the same") + if a.shape != shape: + a = a._broadcast(shape) + else: + a = a.base + new_arrays = new_arrays + (a,) + arrays = new_arrays # create output array which will store Point field where # N is number of index arrays # shape of the output array should be the same as the shape of each @@ -345,8 +348,8 @@ def _zip_indices(self, arrays): task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) for index_arr in arrays: - task.add_input(index_arr.base) - task.add_alignment(output_arr.base, index_arr.base) + task.add_input(index_arr) + task.add_alignment(output_arr.base, index_arr) task.execute() return output_arr @@ -365,6 +368,7 @@ def _create_indexing_array(self, key): store = store.project(dim + shift, k) shift -= 1 elif isinstance(k, slice): + # FIXME do we need to transform the store here? store = store.slice(dim + shift, k) elif isinstance(k, NumPyThunk): if k.dtype == np.bool: @@ -379,15 +383,7 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) - if key.ndim < store.ndim: - raise TypeError("Unimplimented") - # FIXME advance indexing task - # diff = store.ndim - key.ndim - # print ("IRINA DEBUG store ndim = " , store) - # for i in range(diff): - # store = store.slice((store.ndim - i - 1), slice(None)) - # print ("IRINA DEBUG store ndim = " , store) - elif key.ndim > store.ndim: + if key.ndim > store.ndim: if store.ndim != 1: raise ValueError("Advance indexing dimention mismatch") diff = store.ndim - key.ndim @@ -399,17 +395,33 @@ def _create_indexing_array(self, key): # IRINA fixme: replace `nonzero` case with the task with # output regions tuple_of_arrays = key.nonzero() + elif key.ndim < store.ndim: + # FIXME test and see if it works for 2D + diff = store.ndim - key.ndim + indx = np.expand_dims(key, list(range(diff, self.ndim))) + tuple_of_arrays = (indx,) + for dim in range(diff, self.ndim): + indx = np.expand_dims( + np.arrange( + self.shape[dim], + list(i for i in range(self.ndim) if i != dim), + ) + ) + tuple_of_arrays = tuple_of_arrays + (indx,) else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if len(tuple_of_arrays) > 1: + if len(tuple_of_arrays) == self.ndim and self.ndim > 1: + output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) return store, output_arr - else: + elif len(tuple_of_arrays) == 1 and self.ndim == 1: return store, tuple_of_arrays[0] + else: + raise ValueError("Advance indexing dimention mismatch") @staticmethod def _unpack_ellipsis(key, ndim): diff --git a/tests/index_routines.py b/tests/index_routines.py index fcef0aefe..7303e09c7 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -25,8 +25,8 @@ def advanced_indexing(): - # simple advance indexing: - print("advance indexing test 1") + # simple advanced indexing: + print("advanced indexing test 1") x = np.array([1, 2, 3, 4, 5, 6, 7]) indx = np.array([1, 3, 5]) res = x[indx] @@ -35,8 +35,8 @@ def advanced_indexing(): res_num = x_num[indx_num] assert np.array_equal(res, res_num) - # advance indexing test when a.ndim ==1 , indx.ndim >1 - print("advance indexing test 2") + # advanced indexing test when a.ndim ==1 , indx.ndim >1 + print("advanced indexing test 2") y = np.array([0, -1, -2, -3, -4, -5]) y_num = num.array(y) index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) @@ -52,7 +52,7 @@ def advanced_indexing(): z_num = 
num.array(z) # simple 2D case - print("advance indexing test 3") + print("advanced indexing test 3") index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) @@ -72,7 +72,7 @@ def advanced_indexing(): # assert np.array_equal(res, res_num) # test for bool array of the same dimension - print("advance indexing test 5") + print("advanced indexing test 5") indx_bool = np.array( [ [ @@ -97,13 +97,13 @@ def advanced_indexing(): assert np.array_equal(res, res_num) # test mixed data - print("advance indexing test 6") + print("advanced indexing test 6") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) # case when multiple number of arays is passed - print("advance indexing test 7") + print("advanced indexing test 7") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -119,8 +119,20 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) + # indices with broadcast: + indx0 = np.array([[0, 1], [1, 0], [0, 0]]) + indx1 = np.array([[0, 1]]) + indx2 = np.array([[3, 2], [1, 0], [3, 2]]) + + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + indx2_num = num.array(indx2) + res = z_num[indx0_num, indx1_num, indx2_num] + res_np = z[indx0, indx1, indx2] + assert np.array_equal(res, res_np) + # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advance indexing test 8") + # print ("advanced indexing test 8") # ind0 = np.array([1, 1]) # ind0_num = num.array(ind0) # res = z[ind0, :, -1] From e7708c7447870c71f1f051c5a07c28a7787c9dab Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 15 Mar 2022 11:22:13 -0700 Subject: [PATCH 05/33] adding advanced_indexing task --- cunumeric/config.py | 1 + cunumeric/deferred.py | 73 +++++---- src/cunumeric.mk | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/index/advanced_indexing.cc | 76 +++++++++ src/cunumeric/index/advanced_indexing.cu | 148 ++++++++++++++++++ src/cunumeric/index/advanced_indexing.h | 43 +++++ src/cunumeric/index/advanced_indexing_omp.cc | 92 +++++++++++ .../index/advanced_indexing_template.inl | 79 ++++++++++ tests/index_routines.py | 29 +++- tests/nonzero.py | 4 + 11 files changed, 515 insertions(+), 34 deletions(-) create mode 100644 src/cunumeric/index/advanced_indexing.cc create mode 100644 src/cunumeric/index/advanced_indexing.cu create mode 100644 src/cunumeric/index/advanced_indexing.h create mode 100644 src/cunumeric/index/advanced_indexing_omp.cc create mode 100644 src/cunumeric/index/advanced_indexing_template.inl diff --git a/cunumeric/config.py b/cunumeric/config.py index e1462fbd3..2ec560d50 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -78,6 +78,7 @@ def destroy(self): # Match these to CuNumericOpCode in cunumeric_c.h @unique class CuNumericOpCode(IntEnum): + ADVANCED_INDX = _cunumeric.CUNUMERIC_ADVANCED_INDEXING ARANGE = _cunumeric.CUNUMERIC_ARANGE BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 37f5d5390..91cad64b7 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -372,6 +372,8 @@ def _create_indexing_array(self, key): store = store.slice(dim + shift, k) elif isinstance(k, NumPyThunk): if k.dtype == np.bool: + # in case of the mixed indises we all nonzero + # for the bool array k = k.nonzero() tuple_of_arrays += k else: @@ -392,22 +394,36 @@ 
def _create_indexing_array(self, key): # Handle the boolean array case if key.dtype == np.bool: + if key.shape == self.shape: + out = self.runtime.create_unbound_thunk(self.dtype) + task = self.context.create_task( + CuNumericOpCode.ADVANCED_INDX + ) + task.add_output(out.base) + task.add_input(self.base) + task.add_input(key.base) + task.add_alignment(self.base, key.base) + task.execute() + return False, store, out # IRINA fixme: replace `nonzero` case with the task with - # output regions + # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - # FIXME test and see if it works for 2D - diff = store.ndim - key.ndim - indx = np.expand_dims(key, list(range(diff, self.ndim))) - tuple_of_arrays = (indx,) - for dim in range(diff, self.ndim): - indx = np.expand_dims( - np.arrange( - self.shape[dim], - list(i for i in range(self.ndim) if i != dim), - ) - ) - tuple_of_arrays = tuple_of_arrays + (indx,) + raise ValueError("Advance indexing dimention mismatch") + # FIXME add extensions to ZIP taskD + # ndim_out = store.ndim + key.ndim-1 + # indx = key._expand_dims(list(range(key.ndim, ndim_out))) + # np.expand_dims(key, list(range(key.ndim, ndim_out))) + # print("IRINA DEBUG shape key " , indx.shape) + # tuple_of_arrays = (indx,) + # for dim in range(1, store.ndim): + # dims= list(i for i in range(ndim_out) if i + # not in range(dim+key.ndim-1,dim+2*key.ndim-1)) + # print("IRINA DEBUG dims = ", dims) + + # indx = np.arrange( + # self.shape[dim])._expand_dims(dims) + # tuple_of_arrays = tuple_of_arrays + (indx,) else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -417,9 +433,9 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) == self.ndim and self.ndim > 1: output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) - return store, output_arr + return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: - return store, tuple_of_arrays[0] + return True, store, tuple_of_arrays[0] else: raise ValueError("Advance indexing dimention mismatch") @@ -483,20 +499,23 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if self._is_advanced_indexing(key): # Create the indexing array - store, index_array = self._create_indexing_array(key) - # Create a new array to be the result - result = self.runtime.create_empty_thunk( - index_array.base.shape, - self.dtype, - inputs=[self], - ) - copy = self.context.create_copy() + copy_needed, store, index_array = self._create_indexing_array(key) + if copy_needed: + # Create a new array to be the result + result = self.runtime.create_empty_thunk( + index_array.base.shape, + self.dtype, + inputs=[self], + ) + copy = self.context.create_copy() - copy.add_input(store) - copy.add_source_indirect(index_array.base) - copy.add_output(result.base) + copy.add_input(store) + copy.add_source_indirect(index_array.base) + copy.add_output(result.base) - copy.execute() + copy.execute() + else: + return index_array else: result = self._get_view(key) diff --git a/src/cunumeric.mk b/src/cunumeric.mk index 9778d5dd1..896c6ca4f 100644 --- a/src/cunumeric.mk +++ b/src/cunumeric.mk @@ -25,6 +25,7 @@ GEN_CPU_SRC += cunumeric/ternary/where.cc \ cunumeric/nullary/arange.cc \ cunumeric/nullary/eye.cc \ cunumeric/nullary/fill.cc \ + cunumeric/index/advanced_indexing.cc \ cunumeric/index/choose.cc \ cunumeric/index/repeat.cc \ cunumeric/index/zip.cc \ @@ -65,6 +66,7 @@ GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \ cunumeric/nullary/arange_omp.cc \ 
cunumeric/nullary/eye_omp.cc \ cunumeric/nullary/fill_omp.cc \ + cunumeric/index/advanced_indexing_omp.cc\ cunumeric/index/choose_omp.cc \ cunumeric/index/repeat_omp.cc \ cunumeric/index/zip_omp.cc \ @@ -104,6 +106,7 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \ cunumeric/nullary/arange.cu \ cunumeric/nullary/eye.cu \ cunumeric/nullary/fill.cu \ + cunumeric/index/advanced_indexing.cu \ cunumeric/index/choose.cu \ cunumeric/index/repeat.cu \ cunumeric/index/zip.cu \ diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 68f4f56fd..0e8106ff1 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -23,6 +23,7 @@ // Also, sort these alphabetically except the first one for easy lookup later enum CuNumericOpCode { _CUNUMERIC_OP_CODE_BASE = 0, + CUNUMERIC_ADVANCED_INDEXING, CUNUMERIC_ARANGE, CUNUMERIC_BINARY_OP, CUNUMERIC_BINARY_RED, diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc new file mode 100644 index 000000000..6b9224338 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.cc @@ -0,0 +1,76 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + const size_t volume = rect_index.volume(); + size_t size = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + if (index[p] == true) { size++; } + } + + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::cpu_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) +{ + AdvancedIndexingTask::register_variants(); +} +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu new file mode 100644 index 000000000..f818579ed --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.cu @@ -0,0 +1,148 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" +#include "cunumeric/cuda_help.h" + +#include +#include + +namespace cunumeric { + +using namespace Legion; + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + count_nonzero_kernel(size_t volume, + Output out, + AccessorRO index, + Pitches pitches, + Point origin, + size_t iters, + Buffer offsets) +{ + int64_t value = 0; + for (size_t idx = 0; idx < iters; idx++) { + const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (offset < volume) { + auto point = pitches.unflatten(offset, origin); + auto val = static_cast(index[point]); + offsets[offset] = val; + SumReduction::fold(value, val); + } + } + // Every thread in the thread block must participate in the exchange to get correct results + reduce_output(out, value); +} + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + advanced_indexing_kernel(size_t volume, + AccessorRO in, + AccessorRO index, + Buffer out, + Pitches pitches_input, + Point origin_input, + Pitches pitches_index, + Point origin_index, + Buffer offsets) +{ + // FIXME works only when DIM1==DIM2 + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= volume) return; + auto point = pitches_index.unflatten(tid, origin_index); + auto point_input = pitches_input.unflatten(tid, origin_input); + if (index[point] == true) { + int64_t offset = offsets[tid]; + out[offset] = in[point_input]; + } +} +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + int64_t compute_size(const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + cudaStream_t stream, + Buffer& offsets) const + { + DeferredReduction> size; + + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t); + + if (blocks >= MAX_REDUCTION_CTAS) { + const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; + count_nonzero_kernel<<>>( + volume, size, in, pitches, rect.lo, iters, offsets); + } else + count_nonzero_kernel<<>>( + volume, size, in, pitches, rect.lo, 1, offsets); + + cudaStreamSynchronize(stream); + + auto off_ptr = offsets.ptr(0); + thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr); + + return size.read(); + } + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + int64_t size = 0; + const bool* index_ptr = index.ptr(rect_index); + const size_t volume = rect_index.volume(); + cudaStream_t stream; + cudaStreamCreate(&stream); + auto offsets = create_buffer(volume, Memory::Kind::GPU_FB_MEM); + size = compute_size(index, 
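+                        // compute_size is a two-phase stream-compaction
+                        // setup: count_nonzero_kernel both reduces the number
+                        // of true entries and records a 0/1 flag per element,
+                        // then a thrust exclusive scan turns the flags into
+                        // dense write offsets (offsets[i] = output slot of i).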
pitches_index, rect_index, volume, stream, offsets); + + out = create_buffer(size, Memory::Kind::GPU_FB_MEM); + // populate output + if (size > 0) { + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + advanced_indexing_kernel<<>>(volume, + input, + index, + out, + pitches_input, + rect_input.lo, + pitches_index, + rect_index.lo, + offsets); + } + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::gpu_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing.h b/src/cunumeric/index/advanced_indexing.h new file mode 100644 index 000000000..ec0c92681 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing.h @@ -0,0 +1,43 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct AdvancedIndexingArgs { + Array& output; + const Array& input_array; + const Array& indexing_array; +}; + +class AdvancedIndexingTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_ADVANCED_INDEXING; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc new file mode 100644 index 000000000..5128ac75d --- /dev/null +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -0,0 +1,92 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
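+ *
+ * Design note on the GPU variant in advanced_indexing.cu above: once the
+ * offsets buffer holds the exclusive scan, advanced_indexing_kernel writes
+ * out[offsets[tid]] = in[point] for every element whose index flag is true;
+ * the scan guarantees each selected element a unique, densely packed slot.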
+ * + */ + +#include "cunumeric/index/advanced_indexing.h" +#include "cunumeric/index/advanced_indexing_template.inl" +#include "cunumeric/omp_help.h" +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody { + using VAL = legate_type_of; + + size_t operator()(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index) const + { +#ifdef CUNUMERIC_DEBUG + // in this case shapes for input and index arrays should be the same + assert(rect_input == rect_index); +#endif + const size_t volume = rect_index.volume(); + const auto max_threads = omp_get_max_threads(); + int64_t size = 0; + ThreadLocalStorage offsets(max_threads); + + { + ThreadLocalStorage sizes(max_threads); + for (auto idx = 0; idx < max_threads; ++idx) sizes[idx] = 0; +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto point = pitches_index.unflatten(idx, rect_index.lo); + if (index[point] == true) sizes[tid] += 1; + } + } + + for (auto idx = 0; idx < max_threads; ++idx) size += sizes[idx]; + + offsets[0] = 0; + for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; + } + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + int64_t out_idx = offsets[tid]; +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto point = pitches_index.unflatten(idx, rect_index.lo); + auto point_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[point] == true) { + out[out_idx] = input[point_input]; + ++out_idx; + } + } + } + + return size; + } +}; + +/*static*/ void AdvancedIndexingTask::omp_variant(TaskContext& context) +{ + advanced_indexing_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl new file mode 100644 index 000000000..ed88ac996 --- /dev/null +++ b/src/cunumeric/index/advanced_indexing_template.inl @@ -0,0 +1,79 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
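For reference, the CPU, OpenMP, and CUDA variants of the advanced-indexing task shown above all implement the same stream-compaction scheme: count the selected elements, exclusive-scan the per-element flags into output offsets, then gather. A minimal NumPy sketch of that scheme (illustrative only — not the cuNumeric API, and helper names here are invented):

```python
import numpy as np

def advanced_indexing_bool(inp, mask):
    # flags: 1 where the boolean index is True, 0 elsewhere
    flags = mask.ravel().astype(np.int64)
    size = int(flags.sum())             # the size reduction
    offsets = np.cumsum(flags) - flags  # exclusive prefix sum, as in thrust::exclusive_scan
    out = np.empty(size, dtype=inp.dtype)
    flat = inp.ravel()
    for i in range(flags.size):         # the "populate output" pass
        if flags[i]:
            out[offsets[i]] = flat[i]
    return out

x = np.arange(12).reshape(3, 4)
mask = x % 3 == 0
assert np.array_equal(advanced_indexing_bool(x, mask), x[mask])
```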
+ * + */ + +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct AdvancedIndexingImplBody; + +template +struct AdvancedIndexingImpl { + template + void operator()(AdvancedIndexingArgs& args) const + { + using VAL = legate_type_of; + auto input_rect = args.input_array.shape(); + auto input_arr = args.input_array.read_accessor(input_rect); + Pitches input_pitches; + Buffer output_arr; + size_t volume1 = input_pitches.flatten(input_rect); + + auto index_rect = args.indexing_array.shape(); + auto index_arr = args.indexing_array.read_accessor(index_rect); + Pitches index_pitches; + size_t volume2 = index_pitches.flatten(index_rect); + + if (volume1 == 0 || volume2 == 0) { + auto empty = create_buffer(0); + args.output.return_data(empty, 0); + return; + } + + int64_t size = 0; + if (DIM1 == DIM2) { + size = AdvancedIndexingImplBody{}( + output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect); + } else { + // should never go here, not implemented + assert(false); + } + args.output.return_data(output_arr, size); + } +}; + +template +struct AdvancedIndexingHelper { + template + void operator()(AdvancedIndexingArgs& args) const + { + dim_dispatch(args.indexing_array.dim(), AdvancedIndexingImpl{}, args); + } +}; + +template +static void advanced_indexing_template(TaskContext& context) +{ + AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1]}; + double_dispatch( + args.input_array.dim(), args.input_array.code(), AdvancedIndexingHelper{}, args); +} + +} // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 7303e09c7..1681adf7f 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -57,6 +57,18 @@ def advanced_indexing(): index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) + # mismatch dimesion case integers: + # print ("advance indexing test 4") + # indx = np.array([1, 1]) + # indx_num = num.array(indx) + # res = z[indx] + # print("bool array as indx np:") + # print(res) + # print("cunumeric:") + # res_num = z_num[indx_num] + # print(res_num) + # assert np.array_equal(res, res_num) + # mismatch dimesion case: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) @@ -73,6 +85,12 @@ def advanced_indexing(): # test for bool array of the same dimension print("advanced indexing test 5") + index = np.array([True, False, False, True, True, False]) + index_num = num.array(index) + assert np.array_equal(y[index], y_num[index_num]) + + # test for bool array of the same dimension 2D + print("advanced indexing test 6") indx_bool = np.array( [ [ @@ -90,20 +108,16 @@ def advanced_indexing(): indx_bool_num = num.array(indx_bool) res = z[indx_bool] res_num = z_num[indx_bool_num] - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # print(res_num) assert np.array_equal(res, res_num) # test mixed data - print("advanced indexing test 6") + print("advanced indexing test 7") res = z[-1, :] res_num = z_num[-1, :] assert np.array_equal(res, res_num) # case when multiple number of arays is passed - print("advanced indexing test 7") + print("advanced indexing test 8") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1], [2, 0], [1, 2]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -120,6 +134,7 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # indices with broadcast: + print("advanced indexing test 9") indx0 = 
np.array([[0, 1], [1, 0], [0, 0]]) indx1 = np.array([[0, 1]]) indx2 = np.array([[3, 2], [1, 0], [3, 2]]) @@ -132,7 +147,7 @@ def advanced_indexing(): assert np.array_equal(res, res_np) # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advanced indexing test 8") + # print ("advanced indexing test 10") # ind0 = np.array([1, 1]) # ind0_num = num.array(ind0) # res = z[ind0, :, -1] diff --git a/tests/nonzero.py b/tests/nonzero.py index 109825f0f..6cd3d1472 100644 --- a/tests/nonzero.py +++ b/tests/nonzero.py @@ -101,6 +101,10 @@ def test(): np_nonzero = np.nonzero(x_np) assert_equal(lg_nonzero, np_nonzero) + x_np = np.array([True, True]) + x = num.array(x_np) + assert np.array_equal(x_np.nonzero(), x.nonzero()) + if __name__ == "__main__": test() From 6b4acf1ae65246d991e54670bbadc859570ef6b4 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 10:51:04 -0700 Subject: [PATCH 06/33] extended ZIP task to support case when index.ndim < self.ndim --- cunumeric/deferred.py | 66 ++++++++++++++++------------ src/cunumeric/index/zip.cc | 28 ++++++++---- src/cunumeric/index/zip.cu | 50 +++++++++++++++------ src/cunumeric/index/zip.h | 2 + src/cunumeric/index/zip_omp.cc | 28 ++++++++---- src/cunumeric/index/zip_template.inl | 15 ++++--- tests/index_routines.py | 25 ++++++----- 7 files changed, 140 insertions(+), 74 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 91cad64b7..7d7b0e53d 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -315,14 +315,28 @@ def _zip_indices(self, arrays): if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError("type of all index arrrays should be the same") - if a.shape != shape: - a = a._broadcast(shape) - else: - a = a.base - new_arrays = new_arrays + (a,) + key_dim = len(arrays[0].shape) + + if len(arrays) == 1: + # special case when a single index array is passed and it's dim < + # self.ndims + shape = shape + tuple(self.shape[i] for i in range(1, self.ndim)) + array = arrays[0].base + start = key_dim - 1 + for i in range(1, self.ndim): + array = array.promote(start + i, self.shape[i]) + new_arrays += (array,) + else: + for a in arrays: + if data_type != a.dtype: + raise TypeError( + "type of all index arrrays should be the same" + ) + if a.shape != shape: + a = a._broadcast(shape) + else: + a = a.base + new_arrays = new_arrays + (a,) arrays = new_arrays # create output array which will store Point field where # N is number of index arrays @@ -335,7 +349,7 @@ def _zip_indices(self, arrays): # but it should be safe to directly create a DeferredArray # of that dtype, so long as we don't try to convert it to a # NumPy array. 
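The nonzero() lowering used for boolean keys (exercised by the nonzero test above) relies on a standard NumPy identity: indexing with a boolean mask is equivalent to indexing with the tuple of integer arrays returned by nonzero(). A quick NumPy check of that identity (illustrative only):

```python
import numpy as np

x = np.arange(12).reshape(3, 4)
mask = x % 5 == 0
assert np.array_equal(x[mask], x[mask.nonzero()])

# a 1-d boolean mask along the first axis lowers the same way
row_mask = np.array([True, False, True])
assert np.array_equal(x[row_mask], x[row_mask.nonzero()])
```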
- N = len(arrays) + N = self.ndim pointN_dtype = self.runtime.add_point_type(N) store = self.context.create_store( pointN_dtype, shape=shape, optimize_scalar=True @@ -347,9 +361,18 @@ def _zip_indices(self, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - for index_arr in arrays: - task.add_input(index_arr) - task.add_alignment(output_arr.base, index_arr) + if len(arrays) == 1: + task.add_input(arrays[0]) + task.add_alignment(arrays[0], output_arr.base) + task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(key_dim, ty.int64) + task.add_broadcast(arrays[0], axes=range(1, len(shape))) + else: + task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(self.ndim, ty.int64) + for index_arr in arrays: + task.add_input(index_arr) + task.add_alignment(output_arr.base, index_arr) task.execute() return output_arr @@ -409,21 +432,8 @@ def _create_indexing_array(self, key): # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - raise ValueError("Advance indexing dimention mismatch") - # FIXME add extensions to ZIP taskD - # ndim_out = store.ndim + key.ndim-1 - # indx = key._expand_dims(list(range(key.ndim, ndim_out))) - # np.expand_dims(key, list(range(key.ndim, ndim_out))) - # print("IRINA DEBUG shape key " , indx.shape) - # tuple_of_arrays = (indx,) - # for dim in range(1, store.ndim): - # dims= list(i for i in range(ndim_out) if i - # not in range(dim+key.ndim-1,dim+2*key.ndim-1)) - # print("IRINA DEBUG dims = ", dims) - - # indx = np.arrange( - # self.shape[dim])._expand_dims(dims) - # tuple_of_arrays = tuple_of_arrays + (indx,) + output_arr = self._zip_indices((key,)) + return True, store, output_arr else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -432,7 +442,7 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) == self.ndim and self.ndim > 1: - output_arr = tuple_of_arrays[0]._zip_indices(tuple_of_arrays) + output_arr = self._zip_indices(tuple_of_arrays) return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: return True, store, tuple_of_arrays[0] diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 364d7e973..e0bb67e48 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -32,20 +32,30 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { - const size_t volume = rect.volume(); - if (dense) { - auto outptr = out.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); - // std::cout<<"IRINA DEBUG dense out = "< 1) { + const size_t volume = rect.volume(); + if (dense) { + auto outptr = out.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } } - } else { + } else if (index_arrays.size() == 1) { + const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); - // std::cout<<"IRINA DEBUG out = "< new_point; + new_point[0] = index_arrays[0][p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; 
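In NumPy terms, the ZIP task fuses k broadcast-compatible integer index arrays into a single field of N-dimensional points, which then drives one gather copy. A sketch of that equivalence (function names here are illustrative, not the real API):

```python
import numpy as np

def zip_indices(*index_arrays):
    # broadcast the index arrays together, then store one point per element
    # as a trailing axis of length N (the analogue of the Point<N> field)
    return np.stack(np.broadcast_arrays(*index_arrays), axis=-1)

def gather(src, points):
    out = np.empty(points.shape[:-1], dtype=src.dtype)
    for pos in np.ndindex(out.shape):
        out[pos] = src[tuple(points[pos])]
    return out

z = np.arange(24).reshape(2, 3, 4)
i0 = np.array([[0, 1], [1, 0], [0, 0]])
i1 = np.array([[0, 1], [2, 0], [1, 2]])
i2 = np.array([[3, 2], [1, 0], [3, 2]])
assert np.array_equal(gather(z, zip_indices(i0, i1, i2)), z[i0, i1, i2])
```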
} } } diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 1bd8b6aef..bbfdbbb07 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -50,6 +50,24 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) out[idx] = Legion::Point(index_arrays[Is][idx]...); } +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + zip_kernel(const AccessorWO, DIM> out, + const AccessorRO index_array, + const Rect rect, + const Pitches pitches, + int volume, + const int64_t key_dim) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= volume) return; + auto p = pitches.unflatten(idx, rect.lo); + Legion::Point new_point; + new_point[0] = index_array[p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; +} + template struct ZipImplBody { using VAL = int64_t; @@ -60,24 +78,30 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (dense) { - DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, - Rect<1>(0, index_arrays.size() - 1)); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { - idx_arr[idx] = index_arrays[idx].ptr(rect); + if (index_arrays.size() > 1) { + if (dense) { + DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { + idx_arr[idx] = index_arrays[idx].ptr(rect); + } + zip_kernel_dense<<>>( + out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); + } else { + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + zip_kernel<<>>( + out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - zip_kernel_dense<<>>( - out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence()); - } else { - DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, - Rect<1>(0, index_arrays.size() - 1)); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; - zip_kernel<<>>( - out, idx_arr, rect, pitches, volume, std::make_index_sequence()); + } else if (index_arrays.size() == 1) { + zip_kernel + <<>>(out, index_arrays[0], rect, pitches, volume, key_dim); } } }; diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index ae7476b05..bedad8a7a 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -23,6 +23,8 @@ namespace cunumeric { struct ZipArgs { const Array& out; const std::vector& inputs; + const int64_t N; + const int64_t key_dim; }; class ZipTask : public CuNumericTask { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 8cfebb32d..30a51a48c 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -32,22 +32,34 @@ struct ZipImplBody { const Rect& rect, const Pitches& pitches, bool dense, + const int64_t key_dim, std::index_sequence) const { const size_t volume = rect.volume(); - if (dense) { - auto outptr = out.ptr(rect); + if (index_arrays.size() > 1) { + if (dense) { + auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); - } - } else { + for (size_t 
idx = 0; idx < volume; ++idx) { + outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + } + } else { +#pragma omp parallel for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + out[p] = Legion::Point(index_arrays[Is][p]...); + } + } // else + } else if (index_arrays.size() == 1) { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + new_point[0] = index_arrays[0][p]; + for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + out[p] = new_point; } - } // else + } } }; diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index f16b89474..e1c99771e 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,8 +37,12 @@ struct ZipImpl { size_t volume = pitches.flatten(index_rect); if (volume == 0) return; +#ifdef CUNUMERIC_DEBUG + assert(out_rect == index_rect) +#endif + #ifndef LEGION_BOUNDS_CHECKS - bool dense = out.accessor.is_dense_row_major(out_rect); + bool dense = out.accessor.is_dense_row_major(out_rect); #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { @@ -52,17 +56,18 @@ struct ZipImpl { #ifdef LEGION_BOUNDS_CHECKS bool dense = false; #endif - ZipImplBody()( - out, index_arrays, index_rect, pitches, dense, std::make_index_sequence()); + out, index_arrays, index_rect, pitches, dense, args.key_dim, std::make_index_sequence()); } }; template static void zip_template(TaskContext& context) { - ZipArgs args{context.outputs()[0], context.inputs()}; - double_dispatch(args.inputs[0].dim(), args.inputs.size(), ZipImpl{}, args); + int64_t N = context.scalars()[0].value(); + int64_t key_dim = context.scalars()[1].value(); + ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim}; + double_dispatch(args.inputs[0].dim(), N, ZipImpl{}, args); } } // namespace cunumeric diff --git a/tests/index_routines.py b/tests/index_routines.py index 1681adf7f..79db6de6d 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -58,18 +58,21 @@ def advanced_indexing(): assert np.array_equal(y[index_2d], y_num[index_2d_num]) # mismatch dimesion case integers: - # print ("advance indexing test 4") - # indx = np.array([1, 1]) - # indx_num = num.array(indx) - # res = z[indx] - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # res_num = z_num[indx_num] - # print(res_num) - # assert np.array_equal(res, res_num) + print("advanced indexing test 4") + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) + + # 2d: + indx = np.array([[1, 1], [1, 0]]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) - # mismatch dimesion case: + # mismatch dimesion case bool: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) # indx_bool_num = num.array(indx_bool) From 51c5f50c1cd9e6f1fc5dc2d9e0db7c460060cc07 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 14:05:24 -0700 Subject: [PATCH 07/33] adding support for the mixed type of the arguments --- cunumeric/deferred.py | 20 +++++++++++++++++--- tests/index_routines.py | 20 +++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 
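The case this patch extends ZIP for — fewer index arrays than input dimensions — follows NumPy's rule that the remaining axes are kept whole: for a 3-d z, z[idx] is z[idx, :, :]. The promoted index array plus the p[key_dim + i - 1] coordinates realize exactly that mapping. A NumPy model (illustrative only):

```python
import numpy as np

z = np.arange(24).reshape(2, 3, 4)
idx = np.array([1, 1])

# output shape: the index array's shape followed by the untouched axes
out_shape = idx.shape + z.shape[1:]
out = np.empty(out_shape, dtype=z.dtype)
for p in np.ndindex(out_shape):
    # first point coordinate comes from the index array, the rest from p
    out[p] = z[(idx[p[0]],) + p[1:]]

assert np.array_equal(out, z[idx])
assert np.array_equal(out, z[idx, :, :])
```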
7d7b0e53d..3c2a0f0a7 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -389,10 +389,22 @@ def _create_indexing_array(self, key): if k < 0: k += store.shape[dim + shift] store = store.project(dim + shift, k) + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + self = store_copy + store = store_copy.base shift -= 1 elif isinstance(k, slice): - # FIXME do we need to transform the store here? - store = store.slice(dim + shift, k) + store = store elif isinstance(k, NumPyThunk): if k.dtype == np.bool: # in case of the mixed indises we all nonzero @@ -440,7 +452,9 @@ def _create_indexing_array(self, key): if len(tuple_of_arrays) > self.ndim: raise TypeError("Advanced indexing dimension mismatch") - if len(tuple_of_arrays) == self.ndim and self.ndim > 1: + if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or ( + len(tuple_of_arrays) < self.ndim > 1 + ): output_arr = self._zip_indices(tuple_of_arrays) return True, store, output_arr diff --git a/tests/index_routines.py b/tests/index_routines.py index 79db6de6d..72b281ef1 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -115,8 +115,8 @@ def advanced_indexing(): # test mixed data print("advanced indexing test 7") - res = z[-1, :] - res_num = z_num[-1, :] + res = z[:, -1] + res_num = z_num[:, -1] assert np.array_equal(res, res_num) # case when multiple number of arays is passed @@ -149,15 +149,17 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) - # FIXME: Combining Basic and Advanced Indexing Schemes: - # print ("advanced indexing test 10") - # ind0 = np.array([1, 1]) - # ind0_num = num.array(ind0) - # res = z[ind0, :, -1] - # res_num = z_num[ind0_num, :, -1] + # Combining Basic and Advanced Indexing Schemes: + print("advanced indexing test 10") + ind0 = np.array([1, 1]) + ind0_num = num.array(ind0) + res = z[ind0, :, -1] + res_num = z_num[ind0_num, :, -1] + # res = z[ind0,-1] # print(res) + # res_num = z_num[ind0,-1] # print(res_num) - # assert np.array_equal(res, res_num) + assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case From 2d6a67132ae0bd9c32eeffc5b41a601bc82d9956 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 17 Mar 2022 19:56:46 -0700 Subject: [PATCH 08/33] adding support for number of arrays passed as indices< self.ndim --- cunumeric/deferred.py | 68 ++++++++++++++++++++++++++++++-------- src/cunumeric/index/zip.cc | 12 ++++--- tests/index_routines.py | 18 ++++++++++ 3 files changed, 81 insertions(+), 17 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3c2a0f0a7..776fb4fea 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -305,35 +305,76 @@ def get_scalar_array(self): result = np.frombuffer(buf, dtype=self.dtype, count=1) return result.reshape(()) + def broadcast_shapes(self, shapes): + arrays = [np.empty(x, dtype=[]) for x in shapes] + return np.broadcast(*arrays).shape + def _zip_indices(self, arrays): if not isinstance(arrays, tuple): raise TypeError("zip_indices expect tuple of arrays") arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type - shape = arrays[0].shape data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") - 
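The broadcast_shapes helper introduced above computes a common broadcast shape without materializing any data, by broadcasting zero-itemsize arrays. Its behavior checked against plain NumPy (note that NumPy >= 1.20 ships an equivalent np.broadcast_shapes):

```python
import numpy as np

def broadcast_shapes(shapes):
    # np.empty(shape, dtype=[]) allocates no payload: the empty structured
    # dtype has itemsize 0, so only the shape metadata gets broadcast
    arrays = [np.empty(s, dtype=[]) for s in shapes]
    return np.broadcast(*arrays).shape

assert broadcast_shapes([(2,), (2, 2)]) == (2, 2)
assert broadcast_shapes([(3, 1), (1, 4)]) == (3, 4)
```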
new_arrays = tuple() - key_dim = len(arrays[0].shape) + + shapes = tuple(a.shape for a in arrays) + if len(arrays) > 1: + b_shape = self.broadcast_shapes(shapes) + else: + b_shape = arrays[0].shape + key_dim = len(b_shape) + print("IRINA DEBUG key_dim", key_dim, b_shape) + out_shape = b_shape if len(arrays) == 1: # special case when a single index array is passed and it's dim < # self.ndims - shape = shape + tuple(self.shape[i] for i in range(1, self.ndim)) + out_shape = b_shape + tuple( + self.shape[i] for i in range(1, self.ndim) + ) array = arrays[0].base start = key_dim - 1 + new_arrays = tuple() for i in range(1, self.ndim): array = array.promote(start + i, self.shape[i]) new_arrays += (array,) + elif len(arrays) < self.ndim: + N = len(arrays) + # broadcast shapes + new_arrays = tuple() + for a in arrays: + if data_type != a.dtype: + raise TypeError( + "type of all index arrrays should be the same" + ) + if a.shape != b_shape: + new_arrays += (a._broadcast(b_shape),) + else: + new_arrays += (a.base,) + arrays = new_arrays + # output shape + out_shape = b_shape + tuple( + self.shape[i] for i in range(N, self.ndim) + ) + print("IRINA DEBUG out_shape = ", out_shape) + new_arrays = tuple() + start = key_dim - 1 + for a in arrays: + for i in range(N, self.ndim): + a = a.promote(key_dim + i - N, self.shape[i]) + new_arrays += (a,) + arrays = new_arrays + else: + new_arrays = tuple() for a in arrays: if data_type != a.dtype: raise TypeError( "type of all index arrrays should be the same" ) - if a.shape != shape: - a = a._broadcast(shape) + if a.shape != b_shape: + a = a._broadcast(b_shape) else: a = a.base new_arrays = new_arrays + (a,) @@ -352,7 +393,7 @@ def _zip_indices(self, arrays): N = self.ndim pointN_dtype = self.runtime.add_point_type(N) store = self.context.create_store( - pointN_dtype, shape=shape, optimize_scalar=True + pointN_dtype, shape=out_shape, optimize_scalar=True ) output_arr = DeferredArray( self.runtime, base=store, dtype=pointN_dtype @@ -361,12 +402,13 @@ def _zip_indices(self, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - if len(arrays) == 1: - task.add_input(arrays[0]) - task.add_alignment(arrays[0], output_arr.base) - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(key_dim, ty.int64) - task.add_broadcast(arrays[0], axes=range(1, len(shape))) + if len(arrays) < self.ndim: + task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point + task.add_scalar_arg(key_dim, ty.int64) # key_dim + for a in arrays: + task.add_input(a) + task.add_alignment(a, output_arr.base) + task.add_broadcast(a, axes=range(1, len(out_shape))) else: task.add_scalar_arg(self.ndim, ty.int64) task.add_scalar_arg(self.ndim, ty.int64) diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index e0bb67e48..8b446da5f 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -35,7 +35,7 @@ struct ZipImplBody { const int64_t key_dim, std::index_sequence) const { - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { auto outptr = out.ptr(rect); @@ -48,13 +48,17 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = 
index_arrays[0][p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + for (size_t i = 0; i < index_arrays.size(); i++) new_point[i] = index_arrays[i][p]; + for (size_t i = index_arrays.size(); i < N; i++) { + int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + new_point[i] = p[j]; + } + std::cout << "IRINA DEBUG" << new_point << std::endl; out[p] = new_point; } } diff --git a/tests/index_routines.py b/tests/index_routines.py index 72b281ef1..bc2a374d2 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -72,6 +72,24 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + # 2 arrays passed do 3d array + indx0 = np.array([1, 1]) + indx1 = np.array([1, 0]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = z[indx0, indx1] + res_num = z_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + # 2 arrays with broadcasting + indx0 = np.array([1, 1]) + indx1 = np.array([[1, 0], [1, 0]]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = z[indx0, indx1] + res_num = z_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + # mismatch dimesion case bool: # print ("advance indexing test 4") # indx_bool = np.array([True, False]) From 31189691057b7f6797def7d98cc284ff8506b196 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 18 Mar 2022 12:18:54 -0700 Subject: [PATCH 09/33] adding support for the use case arr[:, indx, :] --- cunumeric/deferred.py | 64 +++++++++++++++++++++------- src/cunumeric/index/zip.cc | 9 ++-- src/cunumeric/index/zip.cu | 23 +++++++--- src/cunumeric/index/zip.h | 1 + src/cunumeric/index/zip_omp.cc | 13 +++++- src/cunumeric/index/zip_template.inl | 21 ++++++--- tests/index_routines.py | 46 +++++++++++--------- 7 files changed, 123 insertions(+), 54 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 776fb4fea..8a37b6803 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -309,9 +309,12 @@ def broadcast_shapes(self, shapes): arrays = [np.empty(x, dtype=[]) for x in shapes] return np.broadcast(*arrays).shape - def _zip_indices(self, arrays): + def _zip_indices(self, start_index, arrays): + if not isinstance(arrays, tuple): raise TypeError("zip_indices expect tuple of arrays") + if start_index == -1: + start_index = 0 arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type data_type = arrays[0].dtype @@ -324,20 +327,27 @@ def _zip_indices(self, arrays): else: b_shape = arrays[0].shape key_dim = len(b_shape) - print("IRINA DEBUG key_dim", key_dim, b_shape) out_shape = b_shape if len(arrays) == 1: # special case when a single index array is passed and it's dim < # self.ndims - out_shape = b_shape + tuple( - self.shape[i] for i in range(1, self.ndim) + out_shape = ( + tuple(self.shape[i] for i in range(0, start_index)) + + b_shape + + tuple( + self.shape[i] for i in range(start_index + 1, self.ndim) + ) ) array = arrays[0].base start = key_dim - 1 new_arrays = tuple() - for i in range(1, self.ndim): + for i in range(0, start_index): + array = array.promote(i, self.shape[i]) + for i in range(start_index + 1, self.ndim): array = array.promote(start + i, self.shape[i]) + if array.shape != out_shape: + raise ValueError("Wrong shape calculation") new_arrays += (array,) elif len(arrays) < self.ndim: N = len(arrays) @@ -354,16 +364,21 @@ def _zip_indices(self, arrays): new_arrays += (a.base,) arrays = new_arrays # output shape - out_shape = b_shape + 
tuple( - self.shape[i] for i in range(N, self.ndim) + out_shape = ( + tuple(self.shape[i] for i in range(0, start_index)) + + b_shape + + tuple( + self.shape[i] for i in range(start_index + N, self.ndim) + ) ) - print("IRINA DEBUG out_shape = ", out_shape) new_arrays = tuple() start = key_dim - 1 for a in arrays: - for i in range(N, self.ndim): + for i in range(0, start_index): + a = a.promote(i, self.shape[i]) + for i in range(start_index + N, self.ndim): a = a.promote(key_dim + i - N, self.shape[i]) - new_arrays += (a,) + new_arrays += (a,) arrays = new_arrays else: @@ -405,13 +420,18 @@ def _zip_indices(self, arrays): if len(arrays) < self.ndim: task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point task.add_scalar_arg(key_dim, ty.int64) # key_dim + task.add_scalar_arg(start_index, ty.int64) # start_index for a in arrays: task.add_input(a) task.add_alignment(a, output_arr.base) - task.add_broadcast(a, axes=range(1, len(out_shape))) + task.add_broadcast(a, axes=tuple(range(1, len(out_shape)))) + task.add_broadcast( + output_arr.base, axes=tuple(range(1, len(out_shape))) + ) else: task.add_scalar_arg(self.ndim, ty.int64) task.add_scalar_arg(self.ndim, ty.int64) + task.add_scalar_arg(start_index, ty.int64) for index_arr in arrays: task.add_input(index_arr) task.add_alignment(output_arr.base, index_arr) @@ -423,6 +443,7 @@ def _create_indexing_array(self, key): # Convert everything into deferred arrays of int64 store = self.base shift = 0 + start_index = -1 if isinstance(key, tuple): tuple_of_arrays = () # for k in key: @@ -448,6 +469,10 @@ def _create_indexing_array(self, key): elif isinstance(k, slice): store = store elif isinstance(k, NumPyThunk): + # the very first time we get cunumeric array, record + # start_index + if start_index == -1: + start_index = dim if k.dtype == np.bool: # in case of the mixed indises we all nonzero # for the bool array @@ -480,13 +505,20 @@ def _create_indexing_array(self, key): task.add_input(self.base) task.add_input(key.base) task.add_alignment(self.base, key.base) + task.add_broadcast( + self.base, axes=tuple(range(1, len(self.shape))) + ) + task.add_broadcast( + key.base, axes=tuple(range(1, len(key.shape))) + ) task.execute() return False, store, out - # IRINA fixme: replace `nonzero` case with the task with - # output regions when ND output regions are available - tuple_of_arrays = key.nonzero() + else: + # IRINA fixme: replace `nonzero` case with the task with + # output regions when ND output regions are available + tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: - output_arr = self._zip_indices((key,)) + output_arr = self._zip_indices(start_index, (key,)) return True, store, output_arr else: tuple_of_arrays = (self.runtime.to_deferred_array(key),) @@ -498,7 +530,7 @@ def _create_indexing_array(self, key): len(tuple_of_arrays) < self.ndim > 1 ): - output_arr = self._zip_indices(tuple_of_arrays) + output_arr = self._zip_indices(start_index, tuple_of_arrays) return True, store, output_arr elif len(tuple_of_arrays) == 1 and self.ndim == 1: return True, store, tuple_of_arrays[0] diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 8b446da5f..8696694d8 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -33,6 +33,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { if (index_arrays.size() == N) { @@ -53,12 +54,14 @@ struct ZipImplBody { for (size_t idx = 0; idx < volume; ++idx) { auto p = 
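The start_index plumbing can be read as a coordinate mapping: output point coordinates before start_index pass through unchanged, the next key_dim coordinates select values from the index arrays, and the remaining ones are shifted past the key (index-array) dimensions. A NumPy model for a[:, idx, :] (illustrative only):

```python
import numpy as np

z = np.arange(24).reshape(2, 3, 4)
idx = np.array([1, 0])           # one 1-d index array => key_dim = 1
start, key_dim = 1, idx.ndim     # the index array sits at dimension 1

out_shape = z.shape[:start] + idx.shape + z.shape[start + 1:]
out = np.empty(out_shape, dtype=z.dtype)
for p in np.ndindex(out_shape):
    key = p[start:start + key_dim]                   # coordinates into idx
    pt = p[:start] + (idx[key],) + p[start + key_dim:]
    out[p] = z[pt]

assert np.array_equal(out, z[:, idx, :])
```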
pitches.unflatten(idx, rect.lo); Legion::Point new_point; - for (size_t i = 0; i < index_arrays.size(); i++) new_point[i] = index_arrays[i][p]; - for (size_t i = index_arrays.size(); i < N; i++) { + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < index_arrays.size(); i++) { + new_point[start_index + i] = index_arrays[i][p]; + } + for (size_t i = (start_index + index_arrays.size()); i < N; i++) { int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); new_point[i] = p[j]; } - std::cout << "IRINA DEBUG" << new_point << std::endl; out[p] = new_point; } } diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index bbfdbbb07..28d97ed2f 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -53,18 +53,24 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) zip_kernel(const AccessorWO, DIM> out, - const AccessorRO index_array, + const DeferredBuffer, 1> index_arrays, const Rect rect, const Pitches pitches, int volume, - const int64_t key_dim) + const int64_t key_dim, + const int64_t start_index, + int num_arrays) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = index_array[p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < num_arrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } + for (size_t i = (start_index + num_arrays); i < N; i++) { + int64_t j = key_dim + i - 1 - (num_arrays); + new_point[i] = p[j]; + } out[p] = new_point; } @@ -79,6 +85,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { const size_t volume = rect.volume(); @@ -100,8 +107,12 @@ struct ZipImplBody { out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } } else if (index_arrays.size() == 1) { - zip_kernel - <<>>(out, index_arrays[0], rect, pitches, volume, key_dim); + DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, + Rect<1>(0, index_arrays.size() - 1)); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; + int num_arrays = index_arrays.size(); + zip_kernel<<>>( + out, idx_arr, rect, pitches, num_arrays, key_dim, start_index, num_arrays); } } }; diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index bedad8a7a..cd6100cc8 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -25,6 +25,7 @@ struct ZipArgs { const std::vector& inputs; const int64_t N; const int64_t key_dim; + const int64_t start_index; }; class ZipTask : public CuNumericTask { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 30a51a48c..0848a0f83 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -33,6 +33,7 @@ struct ZipImplBody { const Pitches& pitches, bool dense, const int64_t key_dim, + const int64_t start_index, std::index_sequence) const { const size_t volume = rect.volume(); @@ -55,9 +56,17 @@ struct ZipImplBody { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - new_point[0] = index_arrays[0][p]; - for (size_t i = 1; i < N; i++) { new_point[i] = p[key_dim + i - 1]; } + std::cout << "IRINA DEBUG 2" << 
std::endl; + for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } + for (size_t i = 0; i < index_arrays.size(); i++) { + new_point[start_index + i] = index_arrays[i][p]; + } + for (size_t i = (start_index + index_arrays.size()); i < N; i++) { + int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + new_point[i] = p[j]; + } out[p] = new_point; + std::cout << "IRINA DEBUG 3 " << out[p] << std::endl; } } } diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index e1c99771e..b9729949c 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -34,9 +34,11 @@ struct ZipImpl { auto out = args.out.write_accessor, DIM>(out_rect); auto index_rect = args.inputs[0].shape(); Pitches pitches; - size_t volume = pitches.flatten(index_rect); + size_t volume = pitches.flatten(out_rect); if (volume == 0) return; + std::cout << "IRINA DEBUG out rect = " << out_rect << ", index rect = " << index_rect + << std::endl; #ifdef CUNUMERIC_DEBUG assert(out_rect == index_rect) #endif @@ -56,17 +58,24 @@ struct ZipImpl { #ifdef LEGION_BOUNDS_CHECKS bool dense = false; #endif - ZipImplBody()( - out, index_arrays, index_rect, pitches, dense, args.key_dim, std::make_index_sequence()); + ZipImplBody()(out, + index_arrays, + index_rect, + pitches, + dense, + args.key_dim, + args.start_index, + std::make_index_sequence()); } }; template static void zip_template(TaskContext& context) { - int64_t N = context.scalars()[0].value(); - int64_t key_dim = context.scalars()[1].value(); - ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim}; + int64_t N = context.scalars()[0].value(); + int64_t key_dim = context.scalars()[1].value(); + int64_t start_index = context.scalars()[2].value(); + ZipArgs args{context.outputs()[0], context.inputs(), N, key_dim, start_index}; double_dispatch(args.inputs[0].dim(), N, ZipImpl{}, args); } diff --git a/tests/index_routines.py b/tests/index_routines.py index bc2a374d2..90d4a8f1b 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -57,12 +57,22 @@ def advanced_indexing(): index_2d_num = num.array(index_2d) assert np.array_equal(y[index_2d], y_num[index_2d_num]) - # mismatch dimesion case integers: + # mismatch dimesion case: print("advanced indexing test 4") indx = np.array([1, 1]) indx_num = num.array(indx) res = z[indx] res_num = z_num[indx_num] + print(res) + print(res_num) + assert np.array_equal(res, res_num) + + res = z[:, :, indx] + res_num = z_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = z[:, indx, :] + res_num = z_num[:, indx_num, :] assert np.array_equal(res, res_num) # 2d: @@ -72,7 +82,11 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) - # 2 arrays passed do 3d array + res = z[:, indx] + res_num = z_num[:, indx_num] + assert np.array_equal(res, res_num) + + # 2 arrays passed to 3d array indx0 = np.array([1, 1]) indx1 = np.array([1, 0]) indx0_num = num.array(indx0) @@ -81,6 +95,10 @@ def advanced_indexing(): res_num = z_num[indx0_num, indx1_num] assert np.array_equal(res, res_num) + res = z[:, indx0, indx1] + res_num = z_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + # 2 arrays with broadcasting indx0 = np.array([1, 1]) indx1 = np.array([[1, 0], [1, 0]]) @@ -91,18 +109,11 @@ def advanced_indexing(): assert np.array_equal(res, res_num) # mismatch dimesion case bool: - # print ("advance indexing test 4") - # indx_bool = np.array([True, False]) - # indx_bool_num = num.array(indx_bool) - # 
res = z[indx_bool] - # print("IRINA DEBUG") - # assert np.array_equal(indx_bool.nonzero(), indx_bool_num.nonzero()) - # print("bool array as indx np:") - # print(res) - # print("cunumeric:") - # res_num = z_num[indx_bool_num] - # print(res_num) - # assert np.array_equal(res, res_num) + indx_bool = np.array([True, False]) + indx_bool_num = num.array(indx_bool) + res = z[indx_bool] + res_num = z_num[indx_bool_num] + assert np.array_equal(res, res_num) # test for bool array of the same dimension print("advanced indexing test 5") @@ -147,9 +158,6 @@ def advanced_indexing(): indx1_num = num.array(indx1) indx2_num = num.array(indx2) - # indx0_num._thunk._zip_indices((indx0_num._thunk, - # indx1_num._thunk, indx2_num._thunk,)) - res = z_num[indx0_num, indx1_num, indx2_num] res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) @@ -173,10 +181,6 @@ def advanced_indexing(): ind0_num = num.array(ind0) res = z[ind0, :, -1] res_num = z_num[ind0_num, :, -1] - # res = z[ind0,-1] - # print(res) - # res_num = z_num[ind0,-1] - # print(res_num) assert np.array_equal(res, res_num) # In-Place & Augmented Assignments via Advanced Indexing From b87c51b8f436bf42ba48f741f7cc2c4889cb2f21 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 18 Mar 2022 13:36:00 -0700 Subject: [PATCH 10/33] some clean-up --- cunumeric/deferred.py | 45 +++++++++----------- src/cunumeric/index/advanced_indexing_omp.cc | 5 ++- src/cunumeric/index/zip.cu | 18 ++++---- src/cunumeric/index/zip_omp.cc | 6 +-- src/cunumeric/index/zip_template.inl | 2 - tests/index_routines.py | 26 ++++++++++- 6 files changed, 58 insertions(+), 44 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 8a37b6803..5904001af 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -310,46 +310,33 @@ def broadcast_shapes(self, shapes): return np.broadcast(*arrays).shape def _zip_indices(self, start_index, arrays): - if not isinstance(arrays, tuple): - raise TypeError("zip_indices expect tuple of arrays") + raise TypeError("zip_indices expects tuple of arrays") + # start_index is the index from witch indices arrays are passed + # for example of arr[:, indx, :], start_index =1 if start_index == -1: start_index = 0 + arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) # all arrays should have the same shape and type data_type = arrays[0].dtype if not np.issubdtype(data_type, np.integer): raise TypeError("a array should be integer type") + # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) if len(arrays) > 1: b_shape = self.broadcast_shapes(shapes) else: b_shape = arrays[0].shape + + # key dim - dimension of indices arrays key_dim = len(b_shape) out_shape = b_shape - if len(arrays) == 1: - # special case when a single index array is passed and it's dim < - # self.ndims - out_shape = ( - tuple(self.shape[i] for i in range(0, start_index)) - + b_shape - + tuple( - self.shape[i] for i in range(start_index + 1, self.ndim) - ) - ) - array = arrays[0].base - start = key_dim - 1 - new_arrays = tuple() - for i in range(0, start_index): - array = array.promote(i, self.shape[i]) - for i in range(start_index + 1, self.ndim): - array = array.promote(start + i, self.shape[i]) - if array.shape != out_shape: - raise ValueError("Wrong shape calculation") - new_arrays += (array,) - elif len(arrays) < self.ndim: + if len(arrays) < self.ndim: + # the case when # of arrays passed is smaller than dimension of + # the input array N = len(arrays) # broadcast shapes new_arrays = 
tuple() @@ -372,7 +359,7 @@ def _zip_indices(self, start_index, arrays): ) ) new_arrays = tuple() - start = key_dim - 1 + # promote all index arrays to have the same shape as output for a in arrays: for i in range(0, start_index): a = a.promote(i, self.shape[i]) @@ -382,6 +369,10 @@ def _zip_indices(self, start_index, arrays): arrays = new_arrays else: + # the use case when # of arrays passed is equal to the dimension + # of the input array + if len(arrays) > self.ndim: + raise ValueError("wrong number of index arrays passed") new_arrays = tuple() for a in arrays: if data_type != a.dtype: @@ -394,6 +385,7 @@ def _zip_indices(self, start_index, arrays): a = a.base new_arrays = new_arrays + (a,) arrays = new_arrays + # create output array which will store Point field where # N is number of index arrays # shape of the output array should be the same as the shape of each @@ -487,6 +479,7 @@ def _create_indexing_array(self, key): ) else: assert isinstance(key, NumPyThunk) + # the use case when index array ndim >1 and input array ndim ==1 if key.ndim > store.ndim: if store.ndim != 1: raise ValueError("Advance indexing dimention mismatch") @@ -514,7 +507,7 @@ def _create_indexing_array(self, key): task.execute() return False, store, out else: - # IRINA fixme: replace `nonzero` case with the task with + # FIXME: replace `nonzero` case with the task with # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: @@ -527,7 +520,7 @@ def _create_indexing_array(self, key): raise TypeError("Advanced indexing dimension mismatch") if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or ( - len(tuple_of_arrays) < self.ndim > 1 + len(tuple_of_arrays) < self.ndim and self.ndim > 1 ): output_arr = self._zip_indices(start_index, tuple_of_arrays) diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 5128ac75d..192b898b1 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -63,7 +63,10 @@ struct AdvancedIndexingImplBody { offsets[0] = 0; for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; } - out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + + Memory::Kind kind = + CuNumeric::has_numamem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; + out = create_buffer(size, kind); #pragma omp parallel { diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 28d97ed2f..88b999776 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -56,19 +56,19 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const DeferredBuffer, 1> index_arrays, const Rect rect, const Pitches pitches, + int narrays, int volume, - const int64_t key_dim, - const int64_t start_index, - int num_arrays) + int key_dim, + int start_index) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } - for (size_t i = 0; i < num_arrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } - for (size_t i = (start_index + num_arrays); i < N; i++) { - int64_t j = key_dim + i - 1 - (num_arrays); + for (size_t i = 0; i < narrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } + for (size_t i = (start_index + narrays); i < N; i++) { + int64_t j = key_dim + i - 1 - (narrays - 1); new_point[i] = p[j]; } out[p] = new_point; @@ -90,7 +90,7 @@ struct ZipImplBody { { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { if (dense) { DeferredBuffer idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); @@ -106,13 +106,13 @@ struct ZipImplBody { zip_kernel<<>>( out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; int num_arrays = index_arrays.size(); zip_kernel<<>>( - out, idx_arr, rect, pitches, num_arrays, key_dim, start_index, num_arrays); + out, idx_arr, rect, pitches, num_arrays, volume, key_dim, start_index); } } }; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 0848a0f83..4547f64d1 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -37,7 +37,7 @@ struct ZipImplBody { std::index_sequence) const { const size_t volume = rect.volume(); - if (index_arrays.size() > 1) { + if (index_arrays.size() == N) { if (dense) { auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) @@ -51,12 +51,11 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } // else - } else if (index_arrays.size() == 1) { + } else if (index_arrays.size() < N) { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - std::cout << "IRINA DEBUG 2" << std::endl; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < index_arrays.size(); i++) { new_point[start_index + i] = index_arrays[i][p]; @@ -66,7 +65,6 @@ struct ZipImplBody { new_point[i] = p[j]; } out[p] = new_point; - std::cout << "IRINA DEBUG 3 " << out[p] << std::endl; } } } diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index b9729949c..79476e1e2 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,8 +37,6 @@ struct ZipImpl { size_t 
volume = pitches.flatten(out_rect);
 if (volume == 0) return;
 
- std::cout << "IRINA DEBUG out rect = " << out_rect << ", index rect = " << index_rect
- << std::endl;
 #ifdef CUNUMERIC_DEBUG
 assert(out_rect == index_rect)
 #endif
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 90d4a8f1b..1e2669659 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -63,8 +63,6 @@ def advanced_indexing():
 indx_num = num.array(indx)
 res = z[indx]
 res_num = z_num[indx_num]
- print(res)
- print(res_num)
 assert np.array_equal(res, res_num)
 
 res = z[:, :, indx]
@@ -147,9 +145,6 @@ def advanced_indexing():
 indx1_num = num.array(indx1)
 indx2_num = num.array(indx2)
 
- # indx0_num._thunk._zip_indices((indx0_num._thunk,
- # indx1_num._thunk, indx2_num._thunk,))
-
 res = z_num[indx0_num, indx1_num, indx2_num]
 res_np = z[indx0, indx1, indx2]
 assert np.array_equal(res, res_np)
@@ -173,10 +168,6 @@ def advanced_indexing():
 ind0_num = num.array(ind0)
 res = z[ind0, :, -1]
 res_num = z_num[ind0_num, :, -1]
- # res = z[ind0,-1]
- # print(res)
- # res_num = z_num[ind0,-1]
- # print(res_num)
 assert np.array_equal(res, res_num)
 
 # In-Place & Augmented Assignments via Advanced Indexing
@@ -217,6 +215,30 @@ def advanced_indexing():
 # print(x)
 # x_num[indx0_num, indx1_num] =0.0
 
+ # we stay below LEGATE_MAX_DIM because the dimension will be increased by
+ # 1 when passing a 2d index array
+ for ndim in range(2, LEGATE_MAX_DIM):
+ a_shape = tuple(random.randint(2, 9) for i in range(ndim))
+ np_array = mk_seq_array(np, a_shape)
+ num_array = mk_seq_array(num, a_shape)
+ # check when N of index arrays == N of dims
+ num_tuple_of_indices = tuple()
+ np_tuple_of_indices = tuple()
+ for i in range(ndim):
+ i_shape = (2, 4)
+ idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i]
+ idx_arr_num = num.array(idx_arr_np)
+ np_tuple_of_indices += (idx_arr_np,)
+ num_tuple_of_indices += (idx_arr_num,)
+ assert np.array_equal(
+ np_array[np_tuple_of_indices], num_array[num_tuple_of_indices]
+ )
+ # check when N of index arrays < N of dims (a single index array)
+ i_shape = (2, 2)
+ idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0]
+ idx_arr_num = num.array(idx_arr_np)
+ assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num])
+
 return
 

From d06e03d48e974871763e355db507c905252396e1 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 24 Mar 2022 09:32:07 -0700
Subject: [PATCH 11/33] adding support for some corner cases in advanced
 indexing

---
 cunumeric/deferred.py | 84 +++++++++++++++++++++-------
 src/cunumeric/index/zip.cc | 4 +-
 src/cunumeric/index/zip_omp.cc | 4 +-
 src/cunumeric/index/zip_template.inl | 7 +--
 tests/index_routines.py | 59 +++++++++++++++++++
 5 files changed, 133 insertions(+), 25 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 5904001af..364d7a92a 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -415,7 +415,7 @@ def _zip_indices(self, start_index, arrays):
 task.add_scalar_arg(start_index, ty.int64) # start_index
 for a in arrays:
 task.add_input(a)
- task.add_alignment(a, output_arr.base)
+ task.add_alignment(output_arr.base, a)
 task.add_broadcast(a, axes=tuple(range(1, len(out_shape))))
 task.add_broadcast(
 output_arr.base, axes=tuple(range(1, len(out_shape)))
@@ -432,40 +432,59 @@ def _zip_indices(self, start_index, arrays):
 return output_arr
 
 def _create_indexing_array(self, key):
 store = self.base
+ # the index where the first index_array is passed to the [] operator
 start_index = -1
 if isinstance(key, tuple):
+ key = self._unpack_ellipsis(key, self.ndim)
 shift = 0
+ last_index = self.ndim
+ # in case when index arrays are passed in a scattered way,
+ # we need to transpose the original array so all index arrays
+ # are close to each other
+ transpose_needed = False
+ transpose_indices = tuple()
+ # since we can't call Copy operation on transformed Store, after
+ # the transformation, we need to return a copy
+ copy_needed = False
 tuple_of_arrays = ()
 
 for dim, k in enumerate(key):
 if np.isscalar(k):
 if k < 0:
 k += store.shape[dim + shift]
 store = store.project(dim 
+ shift, k) - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - self = store_copy - store = store_copy.base shift -= 1 + copy_needed = True + last_index = dim + shift + elif k is np.newaxis: + store = store.promote(dim + shift, 1) + copy_needed = True elif isinstance(k, slice): - store = store + store = store.slice(dim + shift, k) + if k != slice(None): + copy_needed = True elif isinstance(k, NumPyThunk): # the very first time we get cunumeric array, record # start_index if start_index == -1: - start_index = dim + start_index = dim + shift + if (start_index - last_index) > 1: + transpose_needed = True + last_index = dim + shift + transpose_indices += (dim + shift,) + else: + transpose_needed = transpose_needed or ( + (dim + shift - last_index) > 1 + ) + transpose_indices += (dim + shift,) + last_index = dim + shift if k.dtype == np.bool: + if k.shape[0] != self.shape[dim]: + raise ValueError( + "boolean index did not match " + "indexed array along dimension " + ) # in case of the mixed indises we all nonzero # for the bool array k = k.nonzero() @@ -477,6 +496,33 @@ def _create_indexing_array(self, key): "Unsupported entry type passed to advanced", "indexing operation", ) + # if len(tuple_of_arrays) == 1: + # transpose_needed = False + if transpose_needed: + copy_needed = True + start_index = 0 + post_indices = tuple( + i for i in range(store.ndim) if i not in transpose_indices + ) + transpose_indices += post_indices + store = store.transpose(transpose_indices) + if copy_needed: + # after store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + self = store_copy + store = store_copy.base else: assert isinstance(key, NumPyThunk) # the use case when index array ndim >1 and input array ndim ==1 diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 8696694d8..b167d2f62 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -39,9 +39,11 @@ struct ZipImplBody { if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { + std::vector indx_ptrs; + for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } } else { for (size_t idx = 0; idx < volume; ++idx) { diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 4547f64d1..9276c3450 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -39,10 +39,12 @@ struct ZipImplBody { const size_t volume = rect.volume(); if (index_arrays.size() == N) { if (dense) { + std::vector indx_ptrs; + for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(index_arrays[Is].ptr(rect)[idx]...); + outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } } else { #pragma omp parallel for 
schedule(static) diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index 79476e1e2..d4b34a787 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -43,6 +43,8 @@ struct ZipImpl { #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); +#else + bool dense = false; #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { @@ -53,12 +55,9 @@ struct ZipImpl { dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); } -#ifdef LEGION_BOUNDS_CHECKS - bool dense = false; -#endif ZipImplBody()(out, index_arrays, - index_rect, + out_rect, pitches, dense, args.key_dim, diff --git a/tests/index_routines.py b/tests/index_routines.py index 1e2669659..aa1275534 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -97,6 +97,21 @@ def advanced_indexing(): res_num = z_num[:, indx0_num, indx1_num] assert np.array_equal(res, res_num) + # 2 index arrays passed in a sparse way: + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[:, [0, 1], :, [0, 1]] + res_num = x_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = x[[0, 1], :, [0, 1], 1:] + res_num = x_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + # 2 arrays with broadcasting indx0 = np.array([1, 1]) indx1 = np.array([[1, 0], [1, 0]]) @@ -181,6 +196,32 @@ def advanced_indexing(): res_num = z_num[ind0_num, :, -1] assert np.array_equal(res, res_num) + res = z[ind0, :, [False, True, False, True]] + res_num = z_num[ind0_num, :, [False, True, False, True]] + assert np.array_equal(res, res_num) + + res = z[ind0, :, ind0] + res_num = z_num[ind0_num, :, ind0_num] + assert np.array_equal(res, res_num) + + res = z[ind0, :, 1:3] + res_num = z_num[ind0_num, :, 1:3] + assert np.array_equal(res, res_num) + + res = z[1, :, ind0] + res_num = z_num[1, :, ind0_num] + assert np.array_equal(res, res_num) + + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[[0, 1], [0, 1], :, 2] + res_num = x_num[[0, 1], [0, 1], :, 2] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], 2] + res_num = x_num[..., [0, 1], 2] + assert np.array_equal(res, res_num) + # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case # y = np.array([0, -1, -2, -3, -4, -5]) @@ -238,6 +279,24 @@ def advanced_indexing(): idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] idx_arr_num = num.array(idx_arr_np) assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal( + np_array[:, idx_arr_np], num_array[:, idx_arr_num] + ) + if ndim > 2: + assert np.array_equal( + np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] + ) + assert np.array_equal( + np_array[:, idx_arr_np, idx_arr_np], + num_array[:, idx_arr_num, idx_arr_num], + ) + if ndim > 3: + assert np.array_equal( + np_array[:, idx_arr_np, :, idx_arr_np], + num_array[:, idx_arr_num, :, idx_arr_num], + ) return From 7672be165d40d5b005c45579ca74026073033dea Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 28 Mar 2022 13:08:11 -0700 Subject: [PATCH 12/33] adding support for advanced indexing in-place assignment with views to original data --- cunumeric/deferred.py | 99 ++++++++++++------- 
src/cunumeric/index/advanced_indexing.cc | 59 ++++++++---
src/cunumeric/index/advanced_indexing.cu | 33 ++++++-
src/cunumeric/index/advanced_indexing.h | 1 +
src/cunumeric/index/advanced_indexing_omp.cc | 62 +++++++++---
.../index/advanced_indexing_template.inl | 27 ++++-
tests/index_routines.py | 85 ++++++++++------
7 files changed, 269 insertions(+), 97 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 364d7a92a..526292887 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -431,8 +431,9 @@ def _zip_indices(self, start_index, arrays):
return output_arr

- def _create_indexing_array(self, key):
+ def _create_indexing_array(self, key, is_set=False):
store = self.base
+ rhs = self
# the index where the first index_array is passed to the [] operator
start_index = -1
if isinstance(key, tuple):
@@ -496,8 +497,6 @@ def _create_indexing_array(self, key):
"Unsupported entry type passed to advanced",
"indexing operation",
)
- # if len(tuple_of_arrays) == 1:
- # transpose_needed = False
if transpose_needed:
copy_needed = True
start_index = 0
@@ -521,7 +520,7 @@ def _create_indexing_array(self, key):
inputs=[store_to_copy],
)
store_copy.copy(store_to_copy, deep=True)
- self = store_copy
+ rhs = store_copy
store = store_copy.base
else:
assert isinstance(key, NumPyThunk)
@@ -536,13 +535,19 @@ def _create_indexing_array(self, key):
# Handle the boolean array case
if key.dtype == np.bool:
if key.shape == self.shape:
- out = self.runtime.create_unbound_thunk(self.dtype)
+ out_dtype = self.dtype
+ if is_set:
+ N = self.ndim
+ out_dtype = self.runtime.add_point_type(N)
+
+ out = self.runtime.create_unbound_thunk(out_dtype)
task = self.context.create_task(
CuNumericOpCode.ADVANCED_INDX
)
task.add_output(out.base)
task.add_input(self.base)
task.add_input(key.base)
+ task.add_scalar_arg(is_set, bool)
task.add_alignment(self.base, key.base)
task.add_broadcast(
self.base, axes=tuple(range(1, len(self.shape)))
@@ -562,16 +567,13 @@ def _create_indexing_array(self, key):
else:
tuple_of_arrays = (self.runtime.to_deferred_array(key),)
- if len(tuple_of_arrays) > self.ndim:
+ if len(tuple_of_arrays) > rhs.ndim:
raise TypeError("Advanced indexing dimension mismatch")
- if (len(tuple_of_arrays) == self.ndim and self.ndim > 1) or (
- len(tuple_of_arrays) < self.ndim and self.ndim > 1
- ):
-
- output_arr = self._zip_indices(start_index, tuple_of_arrays)
+ if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
+ output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
return True, store, output_arr
- elif len(tuple_of_arrays) == 1 and self.ndim == 1:
+ elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
return True, store, tuple_of_arrays[0]
else:
raise ValueError("Advanced indexing dimension mismatch")
@@ -676,35 +678,66 @@ def set_item(self, key, rhs):
assert self.dtype == rhs.dtype
# Check to see if this is advanced indexing or not
if self._is_advanced_indexing(key):
+ view_copy = False
# Create the indexing array
- store, index_array = self._create_indexing_array(key)
- # if index_array.shape != rhs.shape:
- # raise ValueError(
- # "Advanced indexing array does not match source shape"
- # )
- # if self.ndim != index_array.ndim:
- # raise NotImplementedError(
- # "need support for indirect partitioning"
- # )
+ copy_needed, store, index_array = self._create_indexing_array(
+ key, True
+ )
+ if copy_needed:
+ if self.base.transform.bottom:
+ lhs = self
+ else:
+ # if store is transformed we need to return a copy of
+ # the store since Copy operation can't be done on
+ # the 
store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + + lhs = store_copy + view_copy = True + else: + lhs = self + view_copy = False + if rhs.ndim == 0: - shape = store.shape - val = rhs - rhs = self.runtime.create_empty_thunk( - shape, + rhs_tmp = self.runtime.create_empty_thunk( + index_array.base.shape, self.dtype, - inputs=[self], + inputs=[index_array], ) - rhs.fill(val) - copy = self.context.create_copy() + task = self.context.create_task(CuNumericOpCode.FILL) + task.add_output(rhs_tmp.base) + task.add_input(rhs.base) + task.add_scalar_arg(False, bool) + task.execute() + rhs = rhs_tmp.base + else: + if rhs.shape != index_array.shape: + rhs = rhs._broadcast(index_array.base.shape) + else: + rhs = rhs.base - copy.add_input(store) + copy = self.context.create_copy() + copy.add_input(rhs) copy.add_target_indirect(index_array.base) - copy.add_output(self.base) - - # copy.add_alignment(index_array.base, rhs.base) - + copy.add_output(lhs.base) copy.execute() + if view_copy: + print("IRINA DEBUG", self.shape, lhs.shape) + print(self.base.transform.bottom) + print(self) + self.copy(lhs, deep=True) + else: view = self._get_view(key) diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc index 6b9224338..74882e0e3 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -22,11 +22,52 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; - size_t operator()(Buffer& out, + void compute_output(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume) const + { + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + } + + void compute_output(Buffer>& out, + const AccessorRO&, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume) const + { + int64_t out_idx = 0; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = p_input; + out_idx++; + } + } + } + + template + size_t operator()(Buffer& out, const AccessorRO& input, const AccessorRO& index, const Pitches& pitches_input, @@ -45,17 +86,9 @@ struct AdvancedIndexingImplBody { if (index[p] == true) { size++; } } - out = create_buffer(size, Memory::Kind::SYSTEM_MEM); + out = create_buffer(size, Memory::Kind::SYSTEM_MEM); - int64_t out_idx = 0; - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); - if (index[p] == true) { - out[out_idx] = input[p_input]; - out_idx++; - } - } + compute_output(out, input, index, pitches_input, rect_input, pitches_index, rect_index, volume); return size; } }; diff --git 
a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index f818579ed..fdce0f2e1 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -71,8 +71,32 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) out[offset] = in[point_input]; } } -template -struct AdvancedIndexingImplBody { + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + advanced_indexing_kernel(size_t volume, + AccessorRO in, + AccessorRO index, + Buffer> out, + Pitches pitches_input, + Point origin_input, + Pitches pitches_index, + Point origin_index, + Buffer offsets) +{ + // FIXME works only when DIM1==DIM2 + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= volume) return; + auto point = pitches_index.unflatten(tid, origin_index); + auto point_input = pitches_input.unflatten(tid, origin_input); + if (index[point] == true) { + int64_t offset = offsets[tid]; + out[offset] = point_input; + } +} + +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; int64_t compute_size(const AccessorRO& in, @@ -103,7 +127,8 @@ struct AdvancedIndexingImplBody { return size.read(); } - size_t operator()(Buffer& out, + template + size_t operator()(Buffer& out, const AccessorRO& input, const AccessorRO& index, const Pitches& pitches_input, @@ -123,7 +148,7 @@ struct AdvancedIndexingImplBody { auto offsets = create_buffer(volume, Memory::Kind::GPU_FB_MEM); size = compute_size(index, pitches_index, rect_index, volume, stream, offsets); - out = create_buffer(size, Memory::Kind::GPU_FB_MEM); + out = create_buffer(size, Memory::Kind::GPU_FB_MEM); // populate output if (size > 0) { const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; diff --git a/src/cunumeric/index/advanced_indexing.h b/src/cunumeric/index/advanced_indexing.h index ec0c92681..e375d2a72 100644 --- a/src/cunumeric/index/advanced_indexing.h +++ b/src/cunumeric/index/advanced_indexing.h @@ -24,6 +24,7 @@ struct AdvancedIndexingArgs { Array& output; const Array& input_array; const Array& indexing_array; + const bool is_set; }; class AdvancedIndexingTask : public CuNumericTask { diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 192b898b1..0568b3fd1 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -24,11 +24,54 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; - size_t operator()(Buffer& out, + void compute_output(Buffer& out, + const AccessorRO& input, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume, + int64_t out_idx) const + { +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches_index.unflatten(idx, rect_index.lo); + auto p_input = pitches_input.unflatten(idx, rect_input.lo); + if (index[p] == true) { + out[out_idx] = input[p_input]; + out_idx++; + } + } + } + + void compute_output(Buffer>& out, + const AccessorRO&, + const AccessorRO& index, + const Pitches& pitches_input, + const Rect& rect_input, + const Pitches& pitches_index, + const Rect& rect_index, + int volume, + int64_t out_idx) const + { +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; 
++idx) {
+ auto p = pitches_index.unflatten(idx, rect_index.lo);
+ auto p_input = pitches_input.unflatten(idx, rect_input.lo);
+ if (index[p] == true) {
+ out[out_idx] = p_input;
+ out_idx++;
+ }
+ }
+ }
+
+ template
+ size_t operator()(Buffer& out,
+ const AccessorRO& input,
+ const AccessorRO& index,
+ const Pitches& pitches_input,
@@ -66,21 +109,14 @@ struct AdvancedIndexingImplBody {
Memory::Kind kind =
CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM;
- out = create_buffer(size, kind);
+ out = create_buffer(size, kind);
#pragma omp parallel
{
const int tid = omp_get_thread_num();
int64_t out_idx = offsets[tid];
-#pragma omp for schedule(static)
- for (size_t idx = 0; idx < volume; ++idx) {
- auto point = pitches_index.unflatten(idx, rect_index.lo);
- auto point_input = pitches_input.unflatten(idx, rect_input.lo);
- if (index[point] == true) {
- out[out_idx] = input[point_input];
- ++out_idx;
- }
- }
+ compute_output(
+ out, input, index, pitches_input, rect_input, pitches_index, rect_index, volume, out_idx);
}
return size;
diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index ed88ac996..d4869e07b 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -21,7 +21,7 @@ namespace cunumeric {
using namespace Legion;
using namespace legate;
-template
+template
struct AdvancedIndexingImplBody;
template
@@ -34,6 +34,7 @@ struct AdvancedIndexingImpl {
auto input_arr = args.input_array.read_accessor(input_rect);
Pitches input_pitches;
Buffer output_arr;
+ Buffer> output_arr_set;
size_t volume1 = input_pitches.flatten(input_rect);
auto index_rect = args.indexing_array.shape();
@@ -49,13 +50,27 @@ struct AdvancedIndexingImpl {
int64_t size = 0;
if (DIM1 == DIM2) {
- size = AdvancedIndexingImplBody{}(
- output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect);
+ if (args.is_set) {
+ size = AdvancedIndexingImplBody{}(output_arr_set,
+ input_arr,
+ index_arr,
+ input_pitches,
+ input_rect,
+ index_pitches,
+ index_rect);
+ } else {
+ size = AdvancedIndexingImplBody{}(
+ output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect);
+ }
} else {
// should never go here, not implemented
assert(false);
}
- args.output.return_data(output_arr, size);
+ if (args.is_set) {
+ args.output.return_data(output_arr_set, size);
+ } else {
+ args.output.return_data(output_arr, size);
+ }
}
};
@@ -71,7 +86,9 @@ struct AdvancedIndexingHelper {
template
static void advanced_indexing_template(TaskContext& context)
{
- AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1]};
+ // is_set flag is used to fill Point field for in-place assignment operation
+ bool is_set = context.scalars()[0].value();
+ AdvancedIndexingArgs args{context.outputs()[0], context.inputs()[0], context.inputs()[1], is_set};
double_dispatch(
args.input_array.dim(), args.input_array.code(), AdvancedIndexingHelper{}, args);
}
diff --git a/tests/index_routines.py b/tests/index_routines.py
index aa1275534..3ba1f2fa1 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -134,6 +134,12 @@ def advanced_indexing():
index_num = num.array(index)
assert np.array_equal(y[index], y_num[index_num])
+ # test in-place assignment for the case when idx arr
+ # is a 1d bool array:
+ y[index] = 3
+ y_num[index_num] = 3
+ assert np.array_equal(y, y_num)
+
# test for bool array of the same dimension 2D 
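# a boolean mask with the same shape as the indexed array behaves as
# in NumPy: it selects the positions where the mask is True and
# flattens them to 1-D, so z[mask].shape == (int(mask.sum()),)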
print("advanced indexing test 6") indx_bool = np.array( @@ -155,6 +161,12 @@ def advanced_indexing(): res_num = z_num[indx_bool_num] assert np.array_equal(res, res_num) + # test in-place assignment fir the case when idx arr + # is 2d bool array: + z[indx_bool] = 1 + z_num[indx_bool] = 1 + assert np.array_equal(z, z_num) + # test mixed data print("advanced indexing test 7") res = z[:, -1] @@ -175,6 +187,12 @@ def advanced_indexing(): res_np = z[indx0, indx1, indx2] assert np.array_equal(res, res_np) + # test in-place assignment fir the case when + # several index arrays passed + z_num[indx0_num, indx1_num, indx2_num] = -2 + z[indx0, indx1, indx2] = -2 + assert np.array_equal(z, z_num) + # indices with broadcast: print("advanced indexing test 9") indx0 = np.array([[0, 1], [1, 0], [0, 0]]) @@ -224,37 +242,38 @@ def advanced_indexing(): # In-Place & Augmented Assignments via Advanced Indexing # simple 1d case - # y = np.array([0, -1, -2, -3, -4, -5]) - # y_num = num.array(y) - # index = np.array([2, 4, 0, 4, 4, 4]) - # index_num = num.array(index) - # print (y[index]) - # print(y_num[index]) - # y[index] = 0 - # y_num[index_num] =0 - # print (y_num) + y = np.array([0, -1, -2, -3, -4, -5]) + y_num = num.array(y) + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + y[index] = 0 + y_num[index_num] = 0 + assert np.array_equal(y, y_num) + + y[index] = np.array([1, 2, 3, 4, 5, 6]) + y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) + print(y) + print(y_num) + # Order on which data is updated in case when indexing array points to the + # same daya in the original array is not guaranteed, so we can't call + # assert np.array_equal(y, y_num) here # 2D test - # x = np.array( - # [ - # [0.38, -0.16, 0.38, -0.41, -0.04], - # [-0.47, -0.01, -0.18, -0.5, -0.49], - # [0.02, 0.4, 0.33, 0.33, -0.13], - # ] - # ) - # indx0 = np.array([0, 1]) - # indx1 = np.array([1, 2]) - # x_num = num.array(x) - # indx0_num = num.array(indx0) - # indx1_num = num.array(indx1) - # print(x[indx0, indx1]) - # FIXME 0: - # print (x_num[indx0_num,indx1_num]) - # assert np.array_equal(x[indx0, indx1], x_num[indx0_num, indx1_num]) - # print (x_num[indx0_num, indx1_num]) - # x[indx0, indx1] = 0.0 - # print(x) - # x_num[indx0_num, indx1_num] =0.0 + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + indx0 = np.array([0, 1]) + indx1 = np.array([1, 2]) + x_num = num.array(x) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + x[indx0, indx1] = 2.0 + x_num[indx0_num, indx1_num] = 2.0 + assert np.array_equal(x, x_num) # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array @@ -279,11 +298,19 @@ def advanced_indexing(): idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] idx_arr_num = num.array(idx_arr_np) assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + # test in-place assignment + np_array[idx_arr_np] = 2 + num_array[idx_arr_num] = 2 + assert np.array_equal(num_array, np_array) idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) idx_arr_num = num.array(idx_arr_np) assert np.array_equal( np_array[:, idx_arr_np], num_array[:, idx_arr_num] ) + # test in-place assignment + np_array[:, idx_arr_np] = 3 + num_array[:, idx_arr_num] = 3 + assert np.array_equal(num_array, np_array) if ndim > 2: assert np.array_equal( np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] From d5e044687df16c1def17d400e1cddcfeb96015a2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko 
Date: Tue, 5 Apr 2022 12:57:51 -0600 Subject: [PATCH 13/33] registering all PointN types during the Runtime initialization --- cunumeric/config.py | 16 +++++++++++++++- cunumeric/deferred.py | 6 +++--- cunumeric/runtime.py | 28 +++++++++++++++++++--------- src/cunumeric/cunumeric_c.h | 12 ++++++++++++ 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/cunumeric/config.py b/cunumeric/config.py index 2ec560d50..f369bfb09 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -78,7 +78,7 @@ def destroy(self): # Match these to CuNumericOpCode in cunumeric_c.h @unique class CuNumericOpCode(IntEnum): - ADVANCED_INDX = _cunumeric.CUNUMERIC_ADVANCED_INDEXING + ADVANCED_INDEXING = _cunumeric.CUNUMERIC_ADVANCED_INDEXING ARANGE = _cunumeric.CUNUMERIC_ARANGE BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED @@ -244,3 +244,17 @@ class CuNumericTunable(IntEnum): NUM_PROCS = _cunumeric.CUNUMERIC_TUNABLE_NUM_PROCS MAX_EAGER_VOLUME = _cunumeric.CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM + + +# Match these to CuNumericTypeCOdes in cunumeric_c.h +@unique +class CuNumericTypeCodes(IntEnum): + CUNUMERIC_TYPE_POINT1 = _cunumeric.CUNUMERIC_TYPE_POINT1 + CUNUMERIC_TYPE_POINT2 = _cunumeric.CUNUMERIC_TYPE_POINT2 + CUNUMERIC_TYPE_POINT3 = _cunumeric.CUNUMERIC_TYPE_POINT3 + CUNUMERIC_TYPE_POINT4 = _cunumeric.CUNUMERIC_TYPE_POINT4 + CUNUMERIC_TYPE_POINT5 = _cunumeric.CUNUMERIC_TYPE_POINT5 + CUNUMERIC_TYPE_POINT6 = _cunumeric.CUNUMERIC_TYPE_POINT6 + CUNUMERIC_TYPE_POINT7 = _cunumeric.CUNUMERIC_TYPE_POINT7 + CUNUMERIC_TYPE_POINT8 = _cunumeric.CUNUMERIC_TYPE_POINT8 + CUNUMERIC_TYPE_POINT9 = _cunumeric.CUNUMERIC_TYPE_POINT9 diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 526292887..b57655715 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -398,7 +398,7 @@ def _zip_indices(self, start_index, arrays): # of that dtype, so long as we don't try to convert it to a # NumPy array. 
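# For example, for a 3-D input the store created below holds Point<3>
# values such as (i, j, k), one per element of the output shape; the
# copy operation later consumes these points as its indirection field.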
N = self.ndim
- pointN_dtype = self.runtime.add_point_type(N)
+ pointN_dtype = self.runtime.get_point_type(N)
store = self.context.create_store(
pointN_dtype, shape=out_shape, optimize_scalar=True
)
@@ -538,11 +538,11 @@ def _create_indexing_array(self, key, is_set=False):
out_dtype = self.dtype
if is_set:
N = self.ndim
- out_dtype = self.runtime.add_point_type(N)
+ out_dtype = self.runtime.get_point_type(N)

out = self.runtime.create_unbound_thunk(out_dtype)
task = self.context.create_task(
- CuNumericOpCode.ADVANCED_INDX
+ CuNumericOpCode.ADVANCED_INDEXING
)
task.add_output(out.base)
task.add_input(self.base)
diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py
index a6d840356..14e5c5d4e 100644
--- a/cunumeric/runtime.py
+++ b/cunumeric/runtime.py
@@ -27,6 +27,7 @@
CuNumericOpCode,
CuNumericRedopCode,
CuNumericTunable,
+ CuNumericTypeCodes,
cunumeric_context,
cunumeric_lib,
)
@@ -96,6 +97,24 @@ def _register_dtypes(self):
for numpy_type, core_type in _supported_dtypes.items():
type_system.make_alias(np.dtype(numpy_type), core_type)
+ for n in range(1, LEGATE_MAX_DIM + 1):
+ self._register_point_type(n)
+
+ def _register_point_type(self, n):
+ type_system = self.legate_context.type_system
+ point_type = "" + str(n)
+ if point_type not in type_system:
+ code = CuNumericTypeCodes.CUNUMERIC_TYPE_POINT1 + n - 1
+ size_in_bytes = 8 * n
+ type_system.add_type(point_type, size_in_bytes, code)
+
+ def get_point_type(self, n):
+ type_system = self.legate_context.type_system
+ point_type = "" + str(n)
+ if point_type not in type_system:
+ raise ValueError(f"there is no point type registered fro {n}")
+ return point_type
+
def _parse_command_args(self):
try:
# Prune it out so the application does not see it
@@ -175,15 +194,6 @@ def get_arg_dtype(self, value_dtype):
dtype.register_reduction_op(redop, redop_id)
return arg_dtype
- def add_point_type(self, n):
- type_system = self.legate_context.type_system
- point_type = "point" + str(n)
- if point_type not in type_system:
- code = type_system[ty.int64].code
- size_in_bytes = 8 * n
- type_system.add_type(point_type, size_in_bytes, code)
- return point_type
-
def _report_coverage(self):
total = len(self.api_calls)
implemented = sum(int(impl) for (_, _, impl) in self.api_calls)
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index 0e8106ff1..a73a69eb8 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -187,6 +187,18 @@ enum CuNumericBounds {
CUNUMERIC_MAX_TASKS = 1048576,
};
+enum CuNumericTypeCodes {
+ CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1,
+ CUNUMERIC_TYPE_POINT2,
+ CUNUMERIC_TYPE_POINT3,
+ CUNUMERIC_TYPE_POINT4,
+ CUNUMERIC_TYPE_POINT5,
+ CUNUMERIC_TYPE_POINT6,
+ CUNUMERIC_TYPE_POINT7,
+ CUNUMERIC_TYPE_POINT8,
+ CUNUMERIC_TYPE_POINT9,
+};
+
#ifdef __cplusplus
extern "C" {
#endif

From 6e1b4f17ffc70d25fb6f2e47cf26830457cee585 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 5 Apr 2022 14:22:51 -0600
Subject: [PATCH 14/33] fixing logic for transpose operation in advanced
 indexing

---
cunumeric/deferred.py | 57 +++++++++++++++++++++------------------
tests/index_routines.py | 8 ++++++
2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index b57655715..42676dfaa 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -449,15 +449,43 @@ def _create_indexing_array(self, key, is_set=False):
# the transformation, we need to return a copy
copy_needed = False
tuple_of_arrays = ()
+ index_map = []
+ # First, we need to check if transpose is needed
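+ # e.g. for x[ind0, :, ind1] the index arrays sit at dims 0 and 2;
+ # they have to be handled as adjacent dimensions, so the store is
+ # transposed with (0, 2, 1) before the indices are zipped together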
for dim, k in enumerate(key):
+ if np.isscalar(k) or isinstance(k, NumPyThunk):
+ if start_index == -1:
+ start_index = dim
+ transpose_indices += (dim,)
+ transpose_needed = transpose_needed or (
+ (dim - last_index) > 1
+ )
+ last_index = dim
+
+ if transpose_needed:
+ copy_needed = True
+ start_index = 0
+ post_indices = tuple(
+ i for i in range(store.ndim) if i not in transpose_indices
+ )
+ transpose_indices += post_indices
+ store = store.transpose(transpose_indices)
+ index_map = list(transpose_indices)
+ count = 0
+ for i in transpose_indices:
+ index_map[i] = count
+ count += 1
+ else:
+ index_map = tuple(range(len(key)))
+
+ for d, k in enumerate(key):
+ dim = index_map[d]
if np.isscalar(k):
if k < 0:
k += store.shape[dim + shift]
store = store.project(dim + shift, k)
shift -= 1
copy_needed = True
- last_index = dim + shift
elif k is np.newaxis:
store = store.promote(dim + shift, 1)
copy_needed = True
@@ -466,22 +494,8 @@ def _create_indexing_array(self, key, is_set=False):
if k != slice(None):
copy_needed = True
elif isinstance(k, NumPyThunk):
- # the very first time we get cunumeric array, record
- # start_index
- if start_index == -1:
- start_index = dim + shift
- if (start_index - last_index) > 1:
- transpose_needed = True
- last_index = dim + shift
- transpose_indices += (dim + shift,)
- else:
- transpose_needed = transpose_needed or (
- (dim + shift - last_index) > 1
- )
- transpose_indices += (dim + shift,)
- last_index = dim + shift
if k.dtype == np.bool:
- if k.shape[0] != self.shape[dim]:
+ if k.shape[0] != store.shape[dim]:
raise ValueError(
"boolean index did not match "
"indexed array along dimension "
)
@@ -497,14 +511,7 @@ def _create_indexing_array(self, key, is_set=False):
"Unsupported entry type passed to advanced",
"indexing operation",
)
- if transpose_needed:
- copy_needed = True
- start_index = 0
- post_indices = tuple(
- i for i in range(store.ndim) if i not in transpose_indices
- )
- transpose_indices += post_indices
- store = store.transpose(transpose_indices)
+
if copy_needed:
# after store is transformed we need to return a copy of
# the store since Copy operation can't be done on
# the store with transformation
@@ -562,7 +569,7 @@ def _create_indexing_array(self, key, is_set=False):
# output regions when ND output regions are available
tuple_of_arrays = key.nonzero()
elif key.ndim < store.ndim:
- output_arr = self._zip_indices(start_index, (key,))
+ output_arr = rhs._zip_indices(start_index, (key,))
return True, store, output_arr
else:
tuple_of_arrays = (self.runtime.to_deferred_array(key),)
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 3ba1f2fa1..4e4845fa9 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -240,6 +240,14 @@ def advanced_indexing():
res_num = x_num[..., [0, 1], 2]
assert np.array_equal(res, res_num)
+ res = x[:, [0, 1], :, -1]
+ res_num = x_num[:, [0, 1], :, -1]
+ assert np.array_equal(res, res_num)
+
+ res = x[:, [0, 1], :, 1:]
+ res_num = x_num[:, [0, 1], :, 1:]
+ assert np.array_equal(res, res_num)
+
# In-Place & Augmented Assignments via Advanced Indexing
# simple 1d case
y = np.array([0, -1, -2, -3, -4, -5])

From 76c1ae50e61716e299efa319a6cc0c36a6d78ee7 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 6 Apr 2022 11:20:57 -0600
Subject: [PATCH 15/33] fixing an issue when advanced indexing operation is
 performed on transformed store

---
cunumeric/deferred.py | 53 ++++++++++++++++-------------------
tests/index_routines.py | 10 ++++++++
2 files changed, 38 insertions(+), 25 
deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 42676dfaa..372443ad6 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -431,6 +431,20 @@ def _zip_indices(self, start_index, arrays):
return output_arr

+ def copy_store(self, store):
+ store_to_copy = DeferredArray(
+ self.runtime,
+ base=store,
+ dtype=self.dtype,
+ )
+ store_copy = self.runtime.create_empty_thunk(
+ store_to_copy.shape,
+ self.dtype,
+ inputs=[store_to_copy],
+ )
+ store_copy.copy(store_to_copy, deep=True)
+ return store_copy, store_copy.base
+
def _create_indexing_array(self, key, is_set=False):
store = self.base
rhs = self
@@ -511,26 +525,15 @@ def _create_indexing_array(self, key, is_set=False):
"Unsupported entry type passed to advanced",
"indexing operation",
)
-
- if copy_needed:
+ if copy_needed or (not store._transform.bottom):
# after store is transformed we need to return a copy of
# the store since Copy operation can't be done on
# the store with transformation
- store_to_copy = DeferredArray(
- self.runtime,
- base=store,
- dtype=self.dtype,
- )
- store_copy = self.runtime.create_empty_thunk(
- store_to_copy.shape,
- self.dtype,
- inputs=[store_to_copy],
- )
- store_copy.copy(store_to_copy, deep=True)
- rhs = store_copy
- store = store_copy.base
+ rhs, store = self.copy_store(store)
else:
assert isinstance(key, NumPyThunk)
+ if not store._transform.bottom:
+ rhs, store = self.copy_store(store)
# the use case when index array ndim > 1 and input array ndim == 1
if key.ndim > store.ndim:
if store.ndim != 1:
@@ -541,23 +544,23 @@ def _create_indexing_array(self, key, is_set=False):
# Handle the boolean array case
if key.dtype == np.bool:
- if key.shape == self.shape:
- out_dtype = self.dtype
+ if key.shape == rhs.shape:
+ out_dtype = rhs.dtype
if is_set:
- N = self.ndim
- out_dtype = self.runtime.get_point_type(N)
+ N = rhs.ndim
+ out_dtype = rhs.runtime.get_point_type(N)

- out = self.runtime.create_unbound_thunk(out_dtype)
- task = self.context.create_task(
+ out = rhs.runtime.create_unbound_thunk(out_dtype)
+ task = rhs.context.create_task(
CuNumericOpCode.ADVANCED_INDEXING
)
task.add_output(out.base)
- task.add_input(self.base)
+ task.add_input(rhs.base)
task.add_input(key.base)
task.add_scalar_arg(is_set, bool)
- task.add_alignment(self.base, key.base)
+ task.add_alignment(rhs.base, key.base)
task.add_broadcast(
- self.base, axes=tuple(range(1, len(self.shape)))
+ rhs.base, axes=tuple(range(1, len(rhs.shape)))
)
task.add_broadcast(
key.base, axes=tuple(range(1, len(key.shape)))
@@ -572,7 +575,7 @@ def _create_indexing_array(self, key, is_set=False):
output_arr = rhs._zip_indices(start_index, (key,))
return True, store, output_arr
else:
- tuple_of_arrays = (self.runtime.to_deferred_array(key),)
+ tuple_of_arrays = (rhs.runtime.to_deferred_array(key),)
diff --git a/tests/index_routines.py b/tests/index_routines.py
index 4e4845fa9..3f187db84 100644
--- a/tests/index_routines.py
+++ b/tests/index_routines.py
@@ -283,6 +283,16 @@ def advanced_indexing():
x_num[indx0_num, indx1_num] = 2.0
assert np.array_equal(x, x_num)
+ # use case when advanced indexing is called on a transformed array:
+ print("advanced indexing test 11")
+ z = z[:, 1:]
+ z_num = z_num[:, 1:]
+ indx = np.array([1, 1])
+ indx_num = num.array(indx)
+ res = z[indx]
+ res_num = z_num[indx_num]
+ assert np.array_equal(res, res_num)
+
# we do less than LEGATE_MAX_DIM because the dimension will be increased by
# 1 when passing a 2d index array
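# (for instance, if LEGATE_MAX_DIM == 4, indexing a 3d array with a
# 2d index array already yields a 4d intermediate Point field)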
for ndim in range(2, LEGATE_MAX_DIM):

From acdae9da8ecb384feb01ae44eba0c009b700fee3 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 7 Apr 2022 10:53:54 -0600
Subject: [PATCH 16/33] adapting to the output region API change

---
src/cunumeric/index/advanced_indexing_template.inl | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl
index d4869e07b..d324f5ccc 100644
--- a/src/cunumeric/index/advanced_indexing_template.inl
+++ b/src/cunumeric/index/advanced_indexing_template.inl
@@ -44,7 +44,7 @@ struct AdvancedIndexingImpl {
if (volume1 == 0 || volume2 == 0) {
auto empty = create_buffer(0);
- args.output.return_data(empty, 0);
+ args.output.return_data(empty, Point<1>(0));
return;
}
@@ -67,9 +67,9 @@ struct AdvancedIndexingImpl {
assert(false);
}
if (args.is_set) {
- args.output.return_data(output_arr_set, size);
+ args.output.return_data(output_arr_set, Point<1>(size));
} else {
- args.output.return_data(output_arr, size);
+ args.output.return_data(output_arr, Point<1>(size));
}
}
};

From 6cab1ca1a70414083a654d1b0625d7644b06ddb2 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Mon, 11 Apr 2022 16:52:19 -0600
Subject: [PATCH 17/33] addressing PR comments

---
cunumeric/config.py | 2 +-
cunumeric/runtime.py | 6 +++---
src/cunumeric/cunumeric_c.h | 1 +
src/cunumeric/index/advanced_indexing.cc | 2 +-
src/cunumeric/index/advanced_indexing.cu | 2 +-
src/cunumeric/index/advanced_indexing_omp.cc | 2 +-
src/cunumeric/index/zip.cc | 10 ++++++----
src/cunumeric/index/zip.cu | 5 ++++-
src/cunumeric/index/zip_omp.cc | 10 ++++++----
src/cunumeric/index/zip_template.inl | 8 ++++----
10 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/cunumeric/config.py b/cunumeric/config.py
index f369bfb09..9c29bbe64 100644
--- a/cunumeric/config.py
+++ b/cunumeric/config.py
@@ -246,7 +246,7 @@ class CuNumericTunable(IntEnum):
HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM

-# Match these to CuNumericTypeCOdes in cunumeric_c.h
+# Match these to CuNumericTypeCodes in cunumeric_c.h
@unique
class CuNumericTypeCodes(IntEnum):
CUNUMERIC_TYPE_POINT1 = _cunumeric.CUNUMERIC_TYPE_POINT1
diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py
index 14e5c5d4e..fd3d2070b 100644
--- a/cunumeric/runtime.py
+++ b/cunumeric/runtime.py
@@ -102,7 +102,7 @@ def _register_dtypes(self):
def _register_point_type(self, n):
type_system = self.legate_context.type_system
- point_type = "" + str(n)
+ point_type = "Point" + str(n)
if point_type not in type_system:
code = CuNumericTypeCodes.CUNUMERIC_TYPE_POINT1 + n - 1
size_in_bytes = 8 * n
@@ -110,9 +110,9 @@ def _register_point_type(self, n):
def get_point_type(self, n):
type_system = self.legate_context.type_system
- point_type = "" + str(n)
+ point_type = "Point" + str(n)
if point_type not in type_system:
- raise ValueError(f"there is no point type registered fro {n}")
+ raise ValueError(f"there is no point type registered for {n}")
return point_type

def _parse_command_args(self):
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index a73a69eb8..f270cc247 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -187,6 +187,7 @@ enum CuNumericBounds {
CUNUMERIC_MAX_TASKS = 1048576,
};

+// Match these to CuNumericTypeCodes in config.py
enum CuNumericTypeCodes {
CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1,
CUNUMERIC_TYPE_POINT2,
diff --git a/src/cunumeric/index/advanced_indexing.cc 
b/src/cunumeric/index/advanced_indexing.cc index 74882e0e3..bc2b1870a 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -75,7 +75,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index fdce0f2e1..c454d0860 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -136,7 +136,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 0568b3fd1..aad31b6b4 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -79,7 +79,7 @@ struct AdvancedIndexingImplBody { const Pitches& pitches_index, const Rect& rect_index) const { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index b167d2f62..9d055fef3 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -39,9 +39,8 @@ struct ZipImplBody { if (index_arrays.size() == N) { const size_t volume = rect.volume(); if (dense) { - std::vector indx_ptrs; - for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); - auto outptr = out.ptr(rect); + std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; + auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); } @@ -51,7 +50,10 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif const size_t volume = rect.volume(); for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 88b999776..3a3ae0243 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -106,7 +106,10 @@ struct ZipImplBody { zip_kernel<<>>( out, idx_arr, rect, pitches, volume, std::make_index_sequence()); } - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif DeferredBuffer, 1> idx_arr(Memory::Kind::Z_COPY_MEM, Rect<1>(0, index_arrays.size() - 1)); for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx]; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 9276c3450..d4f961777 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -39,9 +39,8 @@ struct ZipImplBody { const size_t volume = rect.volume(); if (index_arrays.size() == N) { if (dense) { - std::vector indx_ptrs; - for (auto a : index_arrays) indx_ptrs.push_back(a.ptr(rect)); - auto outptr = out.ptr(rect); + std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; + auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx 
= 0; idx < volume; ++idx) { outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); @@ -53,7 +52,10 @@ struct ZipImplBody { out[p] = Legion::Point(index_arrays[Is][p]...); } } // else - } else if (index_arrays.size() < N) { + } else { +#ifdef DEBUG_CUNUMERIC + assert(index_arrays.size() < N); +#endif #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index d4b34a787..fd536ff35 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,18 +37,18 @@ struct ZipImpl { size_t volume = pitches.flatten(out_rect); if (volume == 0) return; -#ifdef CUNUMERIC_DEBUG - assert(out_rect == index_rect) +#ifdef DEBUG_CUNUMERIC + assert(out_rect == index_rect); #endif #ifndef LEGION_BOUNDS_CHECKS - bool dense = out.accessor.is_dense_row_major(out_rect); + bool dense = out.accessor.is_dense_row_major(out_rect); #else bool dense = false; #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { -#ifdef CUNUMERIC_DEBUG +#ifdef DEBUG_CUNUMERIC assert(index_rect == args.inputs[i].shape()); #endif index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); From 9480cc0aee279128ea5bd74e25b0fb71e67c98f2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 12 Apr 2022 10:14:34 -0600 Subject: [PATCH 18/33] some code clean-up + more tests --- cunumeric/deferred.py | 3 -- src/cunumeric/index/zip.cc | 2 +- src/cunumeric/index/zip.cu | 2 +- src/cunumeric/index/zip_omp.cc | 2 +- src/cunumeric/index/zip_template.inl | 23 ++++++++++ tests/index_routines.py | 69 ++++++++++++++++++++++++++-- 6 files changed, 90 insertions(+), 11 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 372443ad6..5d5dfed5a 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -743,9 +743,6 @@ def set_item(self, key, rhs): copy.execute() if view_copy: - print("IRINA DEBUG", self.shape, lhs.shape) - print(self.base.transform.bottom) - print(self) self.copy(lhs, deep=True) else: diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index 9d055fef3..a1bce3a5f 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -63,7 +63,7 @@ struct ZipImplBody { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + index_arrays.size()); i < N; i++) { - int64_t j = key_dim + i - 1 - (index_arrays.size() - 1); + int64_t j = key_dim + i - index_arrays.size(); new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 3a3ae0243..abf4914de 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -68,7 +68,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < narrays; i++) { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + narrays); i < N; i++) { - int64_t j = key_dim + i - 1 - (narrays - 1); + int64_t j = key_dim + i - narrays; new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index d4f961777..e4a5d5764 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -65,7 +65,7 @@ struct ZipImplBody { new_point[start_index + i] = index_arrays[i][p]; } for (size_t i = (start_index + index_arrays.size()); i < N; i++) { - int64_t 
j = key_dim + i - 1 - (index_arrays.size() - 1); + int64_t j = key_dim + i - index_arrays.size(); new_point[i] = p[j]; } out[p] = new_point; diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index fd536ff35..e1e2c9004 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -69,6 +69,29 @@ struct ZipImpl { template static void zip_template(TaskContext& context) { + // Here `N` is the number of dimenstions of the input array and the number + // of dimensions of the Point field + // key_dim - is the number of dimensions of the index arrays before + // they were broadcasted to the shape of the input array (shape of + // all index arrays should be the same)) + // start index - is the index from wich first index array was passed + // DIM - dimension of the output array + // + // for the example: + // x.shape = (2,3,4,5) + // ind1.shape = (6,7,8) + // ind2.shape = (6,7,8) + // y = x[:,ind1,ind2,:] + // y.shape == (2,6,7,8,5) + // out.shape == (2,6,7,8,5) + // index_arrays = [ind1', ind2'] + // ind1' == ind1 promoted to (2,6,7,8,5) + // ind2' == ind2 promoted to (2,6,7,8,5) + // DIM = 5 + // N = 4 + // key_dim = 3 + // start_index = 1 + int64_t N = context.scalars()[0].value(); int64_t key_dim = context.scalars()[1].value(); int64_t start_index = context.scalars()[2].value(); diff --git a/tests/index_routines.py b/tests/index_routines.py index 3f187db84..087126c32 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -35,6 +35,13 @@ def advanced_indexing(): res_num = x_num[indx_num] assert np.array_equal(res, res_num) + # after transformation: + x = x[1:] + x_num = x_num[1:] + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + # advanced indexing test when a.ndim ==1 , indx.ndim >1 print("advanced indexing test 2") y = np.array([0, -1, -2, -3, -4, -5]) @@ -43,6 +50,12 @@ def advanced_indexing(): index_num = num.array(index) assert np.array_equal(y[index], y_num[index_num]) + # simple 2D case + print("advanced indexing test 3") + index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) + index_2d_num = num.array(index_2d) + assert np.array_equal(y[index_2d], y_num[index_2d_num]) + z = np.array( [ [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], @@ -51,11 +64,20 @@ def advanced_indexing(): ) z_num = num.array(z) - # simple 2D case - print("advanced indexing test 3") - index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) - index_2d_num = num.array(index_2d) - assert np.array_equal(y[index_2d], y_num[index_2d_num]) + zt = z.transpose( + ( + 1, + 0, + 2, + ) + ) + zt_num = z_num.transpose( + ( + 1, + 0, + 2, + ) + ) # mismatch dimesion case: print("advanced indexing test 4") @@ -65,14 +87,26 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + res = zt[indx] + res_num = zt_num[indx_num] + assert np.array_equal(res, res_num) + res = z[:, :, indx] res_num = z_num[:, :, indx_num] assert np.array_equal(res, res_num) + res = zt[:, :, indx] + res_num = zt_num[:, :, indx_num] + assert np.array_equal(res, res_num) + res = z[:, indx, :] res_num = z_num[:, indx_num, :] assert np.array_equal(res, res_num) + res = zt[:, indx, :] + res_num = zt_num[:, indx_num, :] + assert np.array_equal(res, res_num) + # 2d: indx = np.array([[1, 1], [1, 0]]) indx_num = num.array(indx) @@ -80,10 +114,18 @@ def advanced_indexing(): res_num = z_num[indx_num] assert np.array_equal(res, res_num) + res = zt[indx] + res_num = zt_num[indx_num] + assert np.array_equal(res, 
+
res = z[:, indx]
res_num = z_num[:, indx_num]
assert np.array_equal(res, res_num)
+ res = zt[:, indx]
+ res_num = zt_num[:, indx_num]
+ assert np.array_equal(res, res_num)
+
# 2 arrays passed to 3d array
indx0 = np.array([1, 1])
indx1 = np.array([1, 0])
@@ -93,10 +135,18 @@ def advanced_indexing():
res_num = z_num[indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[indx0, indx1]
+ res_num = zt_num[indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
res = z[:, indx0, indx1]
res_num = z_num[:, indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[:, indx0, indx1]
+ res_num = zt_num[:, indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
# 2 index arrays passed in a sparse way:
x = mk_seq_array(np, (3, 4, 5, 6))
x_num = mk_seq_array(num, (3, 4, 5, 6))
@@ -121,6 +171,10 @@ def advanced_indexing():
res_num = z_num[indx0_num, indx1_num]
assert np.array_equal(res, res_num)
+ res = zt[indx0, indx1]
+ res_num = zt_num[indx0_num, indx1_num]
+ assert np.array_equal(res, res_num)
+
# mismatch dimension case bool:
indx_bool = np.array([True, False])
indx_bool_num = num.array(indx_bool)
@@ -293,6 +347,11 @@ def advanced_indexing():
res_num = z_num[indx_num]
assert np.array_equal(res, res_num)
+ # in-place assignment
+ z[indx] = 10
+ z_num[indx_num] = 10
+ assert np.array_equal(z, z_num)
+
# we do less than LEGATE_MAX_DIM because the dimension will be increased by
# 1 when passing a 2d index array
for ndim in range(2, LEGATE_MAX_DIM):

From 02864d5d22caaf8f71b0b7b85aa0353106037714 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 13 Apr 2022 10:56:51 -0600
Subject: [PATCH 19/33] addressing PR comments for AdvancedIndexing task

---
src/cunumeric/index/advanced_indexing.cc | 16 +++++-----
src/cunumeric/index/advanced_indexing.cu | 16 +++++-----
src/cunumeric/index/advanced_indexing_omp.cc | 30 ++++++++++---------
.../index/advanced_indexing_template.inl | 29 ++++++++++--------
src/cunumeric/omp_help.h | 9 +++++-
5 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc
index bc2b1870a..fea649bb3 100644
--- a/src/cunumeric/index/advanced_indexing.cc
+++ b/src/cunumeric/index/advanced_indexing.cc
@@ -22,8 +22,8 @@ namespace cunumeric {
using namespace Legion;
using namespace legate;
-template
-struct AdvancedIndexingImplBody {
+template
+struct AdvancedIndexingImplBody {
using VAL = legate_type_of;
void compute_output(Buffer& out,
@@ -33,13 +33,13 @@ struct AdvancedIndexingImplBody {
const Rect& rect_input,
const Pitches& pitches_index,
const Rect& rect_index,
- int volume) const
+ const size_t volume) const
{
int64_t out_idx = 0;
for (size_t idx = 0; idx < volume; ++idx) {
- auto p = pitches_index.unflatten(idx, rect_index.lo);
- auto p_input = pitches_input.unflatten(idx, rect_input.lo);
+ auto p = pitches_index.unflatten(idx, rect_index.lo);
if (index[p] == true) {
+ auto p_input = pitches_input.unflatten(idx, 
rect_input.lo); out[out_idx] = p_input; out_idx++; } diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index c454d0860..1eb320aa4 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -35,14 +35,14 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) size_t iters, Buffer offsets) { - int64_t value = 0; + size_t value = 0; for (size_t idx = 0; idx < iters; idx++) { const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - auto val = static_cast(index[point]); + auto val = static_cast(index[point]); offsets[offset] = val; - SumReduction::fold(value, val); + SumReduction::fold(value, val); } } // Every thread in the thread block must participate in the exchange to get correct results @@ -95,8 +95,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) } } -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; int64_t compute_size(const AccessorRO& in, @@ -106,10 +106,10 @@ struct AdvancedIndexingImplBody { cudaStream_t stream, Buffer& offsets) const { - DeferredReduction> size; + DeferredReduction> size; const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t); + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(size_t); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; @@ -140,7 +140,7 @@ struct AdvancedIndexingImplBody { // in this case shapes for input and index arrays should be the same assert(rect_input == rect_index); #endif - int64_t size = 0; + size_t size = 0; const bool* index_ptr = index.ptr(rect_index); const size_t volume = rect_index.volume(); cudaStream_t stream; diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index aad31b6b4..7c34cf8df 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -18,14 +18,17 @@ #include "cunumeric/index/advanced_indexing_template.inl" #include "cunumeric/omp_help.h" #include +#include +#include +#include namespace cunumeric { using namespace Legion; using namespace legate; -template -struct AdvancedIndexingImplBody { +template +struct AdvancedIndexingImplBody { using VAL = legate_type_of; void compute_output(Buffer& out, @@ -35,14 +38,14 @@ struct AdvancedIndexingImplBody { const Rect& rect_input, const Pitches& pitches_index, const Rect& rect_index, - int volume, + const size_t volume, int64_t out_idx) const { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); + auto p = pitches_index.unflatten(idx, rect_index.lo); if (index[p] == true) { + auto p_input = pitches_input.unflatten(idx, rect_input.lo); out[out_idx] = input[p_input]; out_idx++; } @@ -56,14 +59,14 @@ struct AdvancedIndexingImplBody { const Rect& rect_input, const Pitches& pitches_index, const Rect& rect_index, - int volume, + const size_t volume, int64_t out_idx) const { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches_index.unflatten(idx, rect_index.lo); - auto p_input = pitches_input.unflatten(idx, rect_input.lo); + auto p = 
pitches_index.unflatten(idx, rect_index.lo); if (index[p] == true) { + auto p_input = pitches_input.unflatten(idx, rect_input.lo); out[out_idx] = p_input; out_idx++; } @@ -85,12 +88,12 @@ struct AdvancedIndexingImplBody { #endif const size_t volume = rect_index.volume(); const auto max_threads = omp_get_max_threads(); - int64_t size = 0; + size_t size = 0; ThreadLocalStorage offsets(max_threads); { - ThreadLocalStorage sizes(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) sizes[idx] = 0; + ThreadLocalStorage sizes(max_threads); + thrust::fill(thrust::omp::par, sizes.begin(), sizes.end(), 0); #pragma omp parallel { const int tid = omp_get_thread_num(); @@ -101,10 +104,9 @@ struct AdvancedIndexingImplBody { } } - for (auto idx = 0; idx < max_threads; ++idx) size += sizes[idx]; + size = thrust::reduce(thrust::omp::par, sizes.begin(), sizes.end(), 0); - offsets[0] = 0; - for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; + thrust::exclusive_scan(thrust::omp::par, sizes.begin(), sizes.end(), offsets.begin()); } Memory::Kind kind = diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cunumeric/index/advanced_indexing_template.inl index d324f5ccc..bc7dc923b 100644 --- a/src/cunumeric/index/advanced_indexing_template.inl +++ b/src/cunumeric/index/advanced_indexing_template.inl @@ -21,7 +21,7 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template struct AdvancedIndexingImplBody; template @@ -43,23 +43,28 @@ struct AdvancedIndexingImpl { size_t volume2 = index_pitches.flatten(index_rect); if (volume1 == 0 || volume2 == 0) { - auto empty = create_buffer(0); - args.output.return_data(empty, Point<1>(0)); + if (args.is_set) { + auto empty = create_buffer>(0); + args.output.return_data(empty, Point<1>(0)); + } else { + auto empty = create_buffer(0); + args.output.return_data(empty, Point<1>(0)); + } return; } - int64_t size = 0; + size_t size = 0; if (DIM1 == DIM2) { if (args.is_set) { - size = AdvancedIndexingImplBody{}(output_arr_set, - input_arr, - index_arr, - input_pitches, - input_rect, - index_pitches, - index_rect); + size = AdvancedIndexingImplBody{}(output_arr_set, + input_arr, + index_arr, + input_pitches, + input_rect, + index_pitches, + index_rect); } else { - size = AdvancedIndexingImplBody{}( + size = AdvancedIndexingImplBody{}( output_arr, input_arr, index_arr, input_pitches, input_rect, index_pitches, index_rect); } } else { diff --git a/src/cunumeric/omp_help.h b/src/cunumeric/omp_help.h index 2cf3cb106..8d7440724 100644 --- a/src/cunumeric/omp_help.h +++ b/src/cunumeric/omp_help.h @@ -27,7 +27,10 @@ struct ThreadLocalStorage { static constexpr size_t CACHE_LINE_SIZE = 64; public: - ThreadLocalStorage(size_t num_threads) : storage_(CACHE_LINE_SIZE * num_threads) {} + ThreadLocalStorage(size_t num_threads) + : storage_(CACHE_LINE_SIZE * num_threads), num_threads_(num_threads) + { + } ~ThreadLocalStorage() {} public: @@ -36,8 +39,12 @@ struct ThreadLocalStorage { return *reinterpret_cast(storage_.data() + CACHE_LINE_SIZE * idx); } + VAL* begin() { return reinterpret_cast(storage_.data()); } + VAL* end() { return reinterpret_cast(storage_.data() + CACHE_LINE_SIZE * num_threads_); } + private: std::vector storage_; + size_t num_threads_; }; } // namespace cunumeric From c69897f8f6bc9c6a6c5a16c6368904fc8e40f9b9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 13 Apr 2022 21:33:05 -0600 Subject: [PATCH 20/33] addressing some PR comments on cunumeric/deferred --- 
cunumeric/deferred.py | 50 +++++++++----------- src/cunumeric/index/advanced_indexing.cc | 2 +- src/cunumeric/index/advanced_indexing.cu | 2 +- src/cunumeric/index/advanced_indexing_omp.cc | 2 +- tests/index_routines.py | 17 +++++++ 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index ed00f4ac0..a1cb77de9 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -531,7 +531,7 @@ def _create_indexing_array(self, key, is_set=False): # the store with transformation rhs, store = self.copy_store(store) else: - assert isinstance(key, NumPyThunk) + assert isinstance(key, DeferredArray) if not store._transform.bottom: rhs, store = self.copy_store(store) # the use case when index array ndim >1 and input array ndim ==1 @@ -688,41 +688,34 @@ def set_item(self, key, rhs): assert self.dtype == rhs.dtype # Check to see if this is advanced indexing or not if is_advanced_indexing(key): - view_copy = False # Create the indexing array copy_needed, store, index_array = self._create_indexing_array( key, True ) - if copy_needed: - if self.base.transform.bottom: - lhs = self - else: - # if store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - - lhs = store_copy - view_copy = True - else: + if self.base.transform.bottom: lhs = self - view_copy = False + else: + # if store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + store_to_copy = DeferredArray( + self.runtime, + base=store, + dtype=self.dtype, + ) + store_copy = self.runtime.create_empty_thunk( + store_to_copy.shape, + self.dtype, + inputs=[store_to_copy], + ) + store_copy.copy(store_to_copy, deep=True) + lhs = store_copy if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, self.dtype, - inputs=[index_array], + inputs=[], ) task = self.context.create_task(CuNumericOpCode.FILL) task.add_output(rhs_tmp.base) @@ -732,7 +725,8 @@ def set_item(self, key, rhs): rhs = rhs_tmp.base else: if rhs.shape != index_array.shape: - rhs = rhs._broadcast(index_array.base.shape) + rhs_tmp = rhs._broadcast(index_array.base.shape) + rhs_tmp, rhs = rhs.copy_store(rhs_tmp) else: rhs = rhs.base @@ -742,7 +736,7 @@ def set_item(self, key, rhs): copy.add_output(lhs.base) copy.execute() - if view_copy: + if lhs is not self: self.copy(lhs, deep=True) else: diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cunumeric/index/advanced_indexing.cc index fea649bb3..d5b1c2c16 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cunumeric/index/advanced_indexing.cc @@ -77,7 +77,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif const size_t volume = rect_index.volume(); size_t size = 0; diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index 1eb320aa4..c78fec4cf 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -138,7 +138,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // 
in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif size_t size = 0; const bool* index_ptr = index.ptr(rect_index); diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cunumeric/index/advanced_indexing_omp.cc index 7c34cf8df..8bffa2199 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cunumeric/index/advanced_indexing_omp.cc @@ -84,7 +84,7 @@ struct AdvancedIndexingImplBody { { #ifdef DEBUG_CUNUMERIC // in this case shapes for input and index arrays should be the same - assert(rect_input == rect_index); + assert(Domain(rect_input) == Domain(rect_index)); #endif const size_t volume = rect_index.volume(); const auto max_threads = omp_get_max_threads(); diff --git a/tests/index_routines.py b/tests/index_routines.py index 087126c32..c0a62d87e 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -320,6 +320,23 @@ def advanced_indexing(): # same daya in the original array is not guaranteed, so we can't call # assert np.array_equal(y, y_num) here + index = np.array([1, 4, 3, 2, 0, 5]) + index_num = num.array(index) + y[index] = np.array([1, 2, 3, 4, 5, 6]) + y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) + print(y) + print(y_num) + assert np.array_equal(y, y_num) + + # the case when broadcast is needed: + index = np.array([[1, 4, 3], [2, 0, 5]]) + index_num = num.array(index) + y[index] = np.array([[1, 2, 3]]) + y_num[index_num] = num.array([[1, 2, 3]]) + print(y) + print(y_num) + assert np.array_equal(y, y_num) + # 2D test x = np.array( [ From e2bbcb522c5bfb805d1f05c15e29978a83f99580 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 13 Apr 2022 22:42:08 -0600 Subject: [PATCH 21/33] addressing some PR comments on cunumeric/deferred 2 --- cunumeric/deferred.py | 62 +++++++++++------------------------------ tests/index_routines.py | 7 +++++ 2 files changed, 24 insertions(+), 45 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index a1cb77de9..139cdc1af 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -305,10 +305,6 @@ def get_scalar_array(self): result = np.frombuffer(buf, dtype=self.dtype, count=1) return result.reshape(()) - def broadcast_shapes(self, shapes): - arrays = [np.empty(x, dtype=[]) for x in shapes] - return np.broadcast(*arrays).shape - def _zip_indices(self, start_index, arrays): if not isinstance(arrays, tuple): raise TypeError("zip_indices expects tuple of arrays") @@ -326,7 +322,8 @@ def _zip_indices(self, start_index, arrays): # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) if len(arrays) > 1: - b_shape = self.broadcast_shapes(shapes) + # TODO: replace with cunumeric.broadcast_shapes, when available + b_shape = np.broadcast_shapes(*shapes) else: b_shape = arrays[0].shape @@ -431,7 +428,7 @@ def _zip_indices(self, start_index, arrays): return output_arr - def copy_store(self, store): + def _copy_store(self, store): store_to_copy = DeferredArray( self.runtime, base=store, @@ -461,7 +458,6 @@ def _create_indexing_array(self, key, is_set=False): transpose_indices = tuple() # since we can't call Copy operation on transformed Store, after # the transformation, we need to return a copy - copy_needed = False tuple_of_arrays = () index_map = [] @@ -477,7 +473,6 @@ def _create_indexing_array(self, key, is_set=False): last_index = dim if transpose_needed: - copy_needed = True start_index = 0 post_indices = tuple( i for i in 
range(store.ndim) if i not in transpose_indices @@ -499,14 +494,10 @@ def _create_indexing_array(self, key, is_set=False): k += store.shape[dim + shift] store = store.project(dim + shift, k) shift -= 1 - copy_needed = True elif k is np.newaxis: store = store.promote(dim + shift, 1) - copy_needed = True elif isinstance(k, slice): store = store.slice(dim + shift, k) - if k != slice(None): - copy_needed = True elif isinstance(k, NumPyThunk): if k.dtype == np.bool: if k.shape[0] != store.shape[dim]: @@ -525,23 +516,22 @@ def _create_indexing_array(self, key, is_set=False): "Unsupported entry type passed to advanced", "indexing operation", ) - if copy_needed or (not store._transform.bottom): + if store.transformed: # after store is transformed we need to to return a copy of # the store since Copy operation can't be done on # the store with transformation - rhs, store = self.copy_store(store) + rhs, store = self._copy_store(store) else: assert isinstance(key, DeferredArray) - if not store._transform.bottom: - rhs, store = self.copy_store(store) - # the use case when index array ndim >1 and input array ndim ==1 + # the use case when index array ndim >input array ndim if key.ndim > store.ndim: - if store.ndim != 1: - raise ValueError("Advance indexing dimention mismatch") diff = store.ndim - key.ndim for i in range(diff): store = store.promote(i + 1, store.shape[0]) + if store.transformed: + rhs, store = self._copy_store(store) + # Handle the boolean array case if key.dtype == np.bool: if key.shape == rhs.shape: @@ -566,14 +556,14 @@ def _create_indexing_array(self, key, is_set=False): key.base, axes=tuple(range(1, len(key.shape))) ) task.execute() - return False, store, out + return False, rhs, out else: # FIXME: replace `nonzero` case with the task with # output regions when ND output regions are available tuple_of_arrays = key.nonzero() elif key.ndim < store.ndim: output_arr = rhs._zip_indices(start_index, (key,)) - return True, store, output_arr + return True, rhs, output_arr else: tuple_of_arrays = (rhs.runtime.to_deferred_array(key),) @@ -582,9 +572,9 @@ def _create_indexing_array(self, key, is_set=False): if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1: output_arr = rhs._zip_indices(start_index, tuple_of_arrays) - return True, store, output_arr + return True, rhs, output_arr elif len(tuple_of_arrays) == 1 and rhs.ndim == 1: - return True, store, tuple_of_arrays[0] + return True, rhs, tuple_of_arrays[0] else: raise ValueError("Advance indexing dimention mismatch") @@ -648,7 +638,8 @@ def get_item(self, key): # Check to see if this is advanced indexing or not if is_advanced_indexing(key): # Create the indexing array - copy_needed, store, index_array = self._create_indexing_array(key) + copy_needed, rhs, index_array = self._create_indexing_array(key) + store = rhs.base if copy_needed: # Create a new array to be the result result = self.runtime.create_empty_thunk( @@ -689,28 +680,9 @@ def set_item(self, key, rhs): # Check to see if this is advanced indexing or not if is_advanced_indexing(key): # Create the indexing array - copy_needed, store, index_array = self._create_indexing_array( + copy_needed, lhs, index_array = self._create_indexing_array( key, True ) - if self.base.transform.bottom: - lhs = self - else: - # if store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - store_to_copy = DeferredArray( - self.runtime, - base=store, - dtype=self.dtype, - ) - store_copy = self.runtime.create_empty_thunk( - 
store_to_copy.shape, - self.dtype, - inputs=[store_to_copy], - ) - store_copy.copy(store_to_copy, deep=True) - lhs = store_copy - if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, @@ -726,7 +698,7 @@ def set_item(self, key, rhs): else: if rhs.shape != index_array.shape: rhs_tmp = rhs._broadcast(index_array.base.shape) - rhs_tmp, rhs = rhs.copy_store(rhs_tmp) + rhs_tmp, rhs = rhs._copy_store(rhs_tmp) else: rhs = rhs.base diff --git a/tests/index_routines.py b/tests/index_routines.py index c0a62d87e..92f055ba3 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -354,6 +354,13 @@ def advanced_indexing(): x_num[indx0_num, indx1_num] = 2.0 assert np.array_equal(x, x_num) + # shape mismatch: + indx = np.ones((2, 2, 2), dtype=int) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + # use case when advanced indexing is called on a transformed array: print("advanced indexing test 11") z = z[:, 1:] From a8bf6adc78aadacfa383aa4160170e7279061ac2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 15 Apr 2022 11:35:17 -0600 Subject: [PATCH 22/33] cleaning up deferred.py --- cunumeric/array.py | 17 ++- cunumeric/deferred.py | 287 +++++++++++++++++----------------------- tests/index_routines.py | 29 ++++ 3 files changed, 167 insertions(+), 166 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0af95e6f6..e6de88710 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -767,8 +767,21 @@ def _convert_key(self, key, first=True): elif isinstance(key, tuple) and first: return tuple(self._convert_key(k, first=False) for k in key) else: - # Otherwise convert it to a cuNumeric array and get the thunk - return convert_to_cunumeric_ndarray(key)._thunk + # Otherwise convert it to a cuNumeric array, check types + # and get the thunk + key = convert_to_cunumeric_ndarray(key) + if key.dtype != np.bool and not np.issubdtype( + key.dtype, np.integer + ): + raise TypeError("index arrays should be int or bool type") + if key.dtype != np.bool and key.dtype != np.int64: + runtime.warn( + "converting index array to int64 type", + category=RuntimeWarning, + ) + key = key.astype(np.int64) + + return key._thunk @add_boilerplate() def __getitem__(self, key): diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 139cdc1af..5a4a0ef93 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -313,11 +313,15 @@ def _zip_indices(self, start_index, arrays): if start_index == -1: start_index = 0 - arrays = tuple(self.runtime.to_deferred_array(a) for a in arrays) - # all arrays should have the same shape and type - data_type = arrays[0].dtype - if not np.issubdtype(data_type, np.integer): - raise TypeError("a array should be integer type") + new_arrays = tuple() + # check array's type and converting them to deferred arrays + for a in arrays: + a = self.runtime.to_deferred_array(a) + data_type = a.dtype + if data_type != np.int64: + raise TypeError("index arrays should be int64 type") + new_arrays += (a,) + arrays = new_arrays # find a broadcasted shape for all arrays passed as indices shapes = tuple(a.shape for a in arrays) @@ -331,22 +335,19 @@ def _zip_indices(self, start_index, arrays): key_dim = len(b_shape) out_shape = b_shape + # broadcast shapes + new_arrays = tuple() + for a in arrays: + if a.shape != b_shape: + new_arrays += (a._broadcast(b_shape),) + else: + new_arrays += (a.base,) + arrays = new_arrays + if len(arrays) < self.ndim: # the case when # of arrays 
passed is smaller than dimension of # the input array N = len(arrays) - # broadcast shapes - new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError( - "type of all index arrrays should be the same" - ) - if a.shape != b_shape: - new_arrays += (a._broadcast(b_shape),) - else: - new_arrays += (a.base,) - arrays = new_arrays # output shape out_shape = ( tuple(self.shape[i] for i in range(0, start_index)) @@ -364,24 +365,8 @@ def _zip_indices(self, start_index, arrays): a = a.promote(key_dim + i - N, self.shape[i]) new_arrays += (a,) arrays = new_arrays - - else: - # the use case when # of arrays passed is equal to the dimension - # of the input array - if len(arrays) > self.ndim: - raise ValueError("wrong number of index arrays passed") - new_arrays = tuple() - for a in arrays: - if data_type != a.dtype: - raise TypeError( - "type of all index arrrays should be the same" - ) - if a.shape != b_shape: - a = a._broadcast(b_shape) - else: - a = a.base - new_arrays = new_arrays + (a,) - arrays = new_arrays + elif len(arrays) > self.ndim: + raise ValueError("wrong number of index arrays passed") # create output array which will store Point field where # N is number of index arrays @@ -406,24 +391,12 @@ def _zip_indices(self, start_index, arrays): # call ZIP function to combine index arrays into a singe array task = self.context.create_task(CuNumericOpCode.ZIP) task.add_output(output_arr.base) - if len(arrays) < self.ndim: - task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point - task.add_scalar_arg(key_dim, ty.int64) # key_dim - task.add_scalar_arg(start_index, ty.int64) # start_index - for a in arrays: - task.add_input(a) - task.add_alignment(output_arr.base, a) - task.add_broadcast(a, axes=tuple(range(1, len(out_shape)))) - task.add_broadcast( - output_arr.base, axes=tuple(range(1, len(out_shape))) - ) - else: - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(self.ndim, ty.int64) - task.add_scalar_arg(start_index, ty.int64) - for index_arr in arrays: - task.add_input(index_arr) - task.add_alignment(output_arr.base, index_arr) + task.add_scalar_arg(self.ndim, ty.int64) # N of points in Point + task.add_scalar_arg(key_dim, ty.int64) # key_dim + task.add_scalar_arg(start_index, ty.int64) # start_index + for a in arrays: + task.add_input(a) + task.add_alignment(output_arr.base, a) task.execute() return output_arr @@ -447,128 +420,112 @@ def _create_indexing_array(self, key, is_set=False): rhs = self # the index where the first index_array is passed to the [] operator start_index = -1 - if isinstance(key, tuple): - key = self._unpack_ellipsis(key, self.ndim) - shift = 0 - last_index = self.ndim - # in case when index arrays are passed in the scaterred way, - # we need to transpose original array so all index arrays - # are close to each other - transpose_needed = False - transpose_indices = tuple() - # since we can't call Copy operation on transformed Store, after - # the transformation, we need to return a copy - tuple_of_arrays = () - index_map = [] - - # First, we need to check if transpose is needed - for dim, k in enumerate(key): - if np.isscalar(k) or isinstance(k, NumPyThunk): - if start_index == -1: - start_index = dim - transpose_indices += (dim,) - transpose_needed = transpose_needed or ( - (dim - last_index) > 1 - ) - last_index = dim + if ( + isinstance(key, NumPyThunk) + and key.dtype == np.bool + and key.shape == rhs.shape + ): + if not isinstance(key, DeferredArray): + key = self.runtime.to_deferred_array(key) + + out_dtype = 
rhs.dtype + if is_set: + N = rhs.ndim + out_dtype = rhs.runtime.get_point_type(N) + + out = rhs.runtime.create_unbound_thunk(out_dtype) + task = rhs.context.create_task(CuNumericOpCode.ADVANCED_INDEXING) + task.add_output(out.base) + task.add_input(rhs.base) + task.add_input(key.base) + task.add_scalar_arg(is_set, bool) + task.add_alignment(rhs.base, key.base) + task.execute() + return False, rhs, out - if transpose_needed: - start_index = 0 - post_indices = tuple( - i for i in range(store.ndim) if i not in transpose_indices - ) - transpose_indices += post_indices - store = store.transpose(transpose_indices) - index_map = list(transpose_indices) - count = 0 - for i in transpose_indices: - index_map[i] = count - count += 1 - else: - index_map = tuple(range(len(key))) - - for d, k in enumerate(key): - dim = index_map[d] - if np.isscalar(k): - if k < 0: - k += store.shape[dim + shift] - store = store.project(dim + shift, k) - shift -= 1 - elif k is np.newaxis: - store = store.promote(dim + shift, 1) - elif isinstance(k, slice): - store = store.slice(dim + shift, k) - elif isinstance(k, NumPyThunk): - if k.dtype == np.bool: - if k.shape[0] != store.shape[dim]: - raise ValueError( - "boolean index did not match " - "indexed array along dimension " - ) - # in case of the mixed indises we all nonzero - # for the bool array - k = k.nonzero() - tuple_of_arrays += k - else: - tuple_of_arrays += (self.runtime.to_deferred_array(k),) - else: - raise TypeError( - "Unsupported entry type passed to advanced", - "indexing operation", - ) - if store.transformed: - # after store is transformed we need to to return a copy of - # the store since Copy operation can't be done on - # the store with transformation - rhs, store = self._copy_store(store) - else: - assert isinstance(key, DeferredArray) + if isinstance(key, NumPyThunk): # the use case when index array ndim >input array ndim if key.ndim > store.ndim: diff = store.ndim - key.ndim for i in range(diff): store = store.promote(i + 1, store.shape[0]) - if store.transformed: - rhs, store = self._copy_store(store) + key = (key,) - # Handle the boolean array case - if key.dtype == np.bool: - if key.shape == rhs.shape: - out_dtype = rhs.dtype - if is_set: - N = rhs.ndim - out_dtype = rhs.runtime.get_point_type(N) + assert isinstance(key, tuple) + key = self._unpack_ellipsis(key, self.ndim) + shift = 0 + last_index = self.ndim + # in case when index arrays are passed in the scaterred way, + # we need to transpose original array so all index arrays + # are close to each other + transpose_needed = False + transpose_indices = tuple() + key_transpose_indices = tuple() + # since we can't call Copy operation on transformed Store, after + # the transformation, we need to return a copy + tuple_of_arrays = () + + # First, we need to check if transpose is needed + for dim, k in enumerate(key): + if np.isscalar(k) or isinstance(k, NumPyThunk): + if start_index == -1: + start_index = dim + transpose_indices += (dim,) + transpose_needed = transpose_needed or ((dim - last_index) > 1) + last_index = dim + + if transpose_needed: + start_index = 0 + post_indices = tuple( + i for i in range(store.ndim) if i not in transpose_indices + ) + key_transpose_indices = transpose_indices + transpose_indices += post_indices + post_indices = tuple( + i for i in range(len(key)) if i not in key_transpose_indices + ) + key_transpose_indices += post_indices + store = store.transpose(transpose_indices) - out = rhs.runtime.create_unbound_thunk(out_dtype) - task = rhs.context.create_task( - 
CuNumericOpCode.ADVANCED_INDEXING - ) - task.add_output(out.base) - task.add_input(rhs.base) - task.add_input(key.base) - task.add_scalar_arg(is_set, bool) - task.add_alignment(rhs.base, key.base) - task.add_broadcast( - rhs.base, axes=tuple(range(1, len(rhs.shape))) - ) - task.add_broadcast( - key.base, axes=tuple(range(1, len(key.shape))) - ) - task.execute() - return False, rhs, out + key = tuple(key[i] for i in key_transpose_indices) + + for d, k in enumerate(key): + dim = d + if np.isscalar(k): + if k < 0: + k += store.shape[dim + shift] + store = store.project(dim + shift, k) + shift -= 1 + elif k is np.newaxis: + store = store.promote(dim + shift, 1) + elif isinstance(k, slice): + store = store.slice(dim + shift, k) + elif isinstance(k, NumPyThunk): + if not isinstance(key, DeferredArray): + k = self.runtime.to_deferred_array(k) + if k.dtype == np.bool: + if k.shape[0] != store.shape[dim + shift]: + raise ValueError( + "shape of boolean index did not match " + "indexed array " + ) + # in case of the mixed indises we all nonzero + # for the bool array + k = k.nonzero() + tuple_of_arrays += k else: - # FIXME: replace `nonzero` case with the task with - # output regions when ND output regions are available - tuple_of_arrays = key.nonzero() - elif key.ndim < store.ndim: - output_arr = rhs._zip_indices(start_index, (key,)) - return True, rhs, output_arr + tuple_of_arrays += (k,) else: - tuple_of_arrays = (rhs.runtime.to_deferred_array(key),) - - if len(tuple_of_arrays) > rhs.ndim: - raise TypeError("Advanced indexing dimension mismatch") + raise TypeError( + "Unsupported entry type passed to advanced ", + "indexing operation", + ) + if store.transformed: + # after store is transformed we need to to return a copy of + # the store since Copy operation can't be done on + # the store with transformation + rhs, store = self._copy_store(store) if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1: output_arr = rhs._zip_indices(start_index, tuple_of_arrays) @@ -683,6 +640,8 @@ def set_item(self, key, rhs): copy_needed, lhs, index_array = self._create_indexing_array( key, True ) + # TODO: remove rhs.ndim ==0 logic when issue with scalars not being + # type of Store is addressed if rhs.ndim == 0: rhs_tmp = self.runtime.create_empty_thunk( index_array.base.shape, diff --git a/tests/index_routines.py b/tests/index_routines.py index 92f055ba3..0ebafaeef 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -376,6 +376,35 @@ def advanced_indexing(): z_num[indx_num] = 10 assert np.array_equal(z, z_num) + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((4,), True) + ind_num = num.array(ind) + res = x[:, ind] + res_num = x_num[:, ind_num] + assert np.array_equal(res, res_num) + + if LEGATE_MAX_DIM > 7: + x = np.ones((2, 3, 4, 5, 3, 4)) + ind1 = np.full((3, 4), True) + ind2 = np.full((3, 4), True) + x_num = num.array(x) + ind1_num = num.array(ind1) + ind2_num = num.array(ind2) + res = x[:, ind1, :ind2] + res_num = x[:, ind1_num, :ind2_num] + res = x[ind1, :ind2] + res_num = x[ind1_num, :ind2_num] + assert np.array_equal(res, res_num) + + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((3,), 1, dtype=np.int32) + ind_num = num.array(ind) + res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array for ndim in range(2, LEGATE_MAX_DIM): From 04a3cc48008b486e1280b541701922ccbbaecef5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: 
Fri, 15 Apr 2022 15:47:46 -0600 Subject: [PATCH 23/33] making the zip task compile when Legion is built with support for large dimensions --- src/cunumeric/index/zip.cc | 8 ++++++-- src/cunumeric/index/zip.cu | 8 ++++++-- src/cunumeric/index/zip_omp.cc | 8 ++++++-- tests/index_routines.py | 8 +++----- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/cunumeric/index/zip.cc b/src/cunumeric/index/zip.cc index a1bce3a5f..e755634ce 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cunumeric/index/zip.cc @@ -42,12 +42,16 @@ struct ZipImplBody { std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; auto outptr = out.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = indx_ptrs[i][idx]; } + outptr[idx] = new_point; } } else { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } } } else { diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index abf4914de..c130b9bb4 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -34,7 +34,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } template @@ -47,7 +49,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= volume) return; - out[idx] = Legion::Point(index_arrays[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][idx]; } + out[idx] = new_point; } template diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index e4a5d5764..51eb4c04f 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -43,13 +43,17 @@ struct ZipImplBody { auto outptr = out.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { - outptr[idx] = Legion::Point(indx_ptrs[Is][idx]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = indx_ptrs[i][idx]; } + outptr[idx] = new_point; } } else { #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - out[p] = Legion::Point(index_arrays[Is][p]...); + Legion::Point new_point; + for (size_t i = 0; i < N; i++) { new_point[i] = index_arrays[i][p]; } + out[p] = new_point; } } // else } else { diff --git a/tests/index_routines.py b/tests/index_routines.py index 0ebafaeef..abd3b9536 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -391,10 +391,8 @@ def advanced_indexing(): x_num = num.array(x) ind1_num = num.array(ind1) ind2_num = num.array(ind2) - res = x[:, ind1, :ind2] - res_num = x[:, ind1_num, :ind2_num] - res = x[ind1, :ind2] - res_num = x[ind1_num, :ind2_num] + res = x[:, ind1, :, ind2] + res_num = x[:, ind1_num, :, ind2_num] assert np.array_equal(res, res_num) x = np.ones((3, 4)) @@ -406,7 +406,7 @@ def advanced_indexing(): # we do less 
than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array for ndim in range(2, LEGATE_MAX_DIM): - a_shape = tuple(random.randint(2, 9) for i in range(ndim)) + a_shape = tuple(random.randint(2, 5) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) # check when N of index arrays == N of dims From 173f47fccabdfd068e7572327acbae0f120fa512 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 18 Apr 2022 10:17:47 -0600 Subject: [PATCH 24/33] Removing the check for output_rect == input_rect When input arrays (index arrays) are broadcast, Legate will not always partition them (it will sometimes just broadcast them), so the rectangles may legitimately differ. This doesn't affect correctness. --- src/cunumeric/index/zip_template.inl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/cunumeric/index/zip_template.inl b/src/cunumeric/index/zip_template.inl index e1e2c9004..3f7fcfe69 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cunumeric/index/zip_template.inl @@ -37,10 +37,6 @@ struct ZipImpl { size_t volume = pitches.flatten(out_rect); if (volume == 0) return; -#ifdef DEBUG_CUNUMERIC - assert(out_rect == index_rect); -#endif - #ifndef LEGION_BOUNDS_CHECKS bool dense = out.accessor.is_dense_row_major(out_rect); #else @@ -48,10 +44,7 @@ struct ZipImpl { #endif std::vector> index_arrays; for (int i = 0; i < args.inputs.size(); i++) { -#ifdef DEBUG_CUNUMERIC - assert(index_rect == args.inputs[i].shape()); -#endif - index_arrays.push_back(args.inputs[i].read_accessor(index_rect)); + index_arrays.push_back(args.inputs[i].read_accessor(args.inputs[i].shape())); dense = dense && index_arrays[i].accessor.is_dense_row_major(out_rect); } From 3601cdbe6773de69e0099b189e57dfb0feda0b2d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 18 Apr 2022 14:53:03 -0600 Subject: [PATCH 25/33] cleaning-up tests + fixing logic for transformed rhs and index arrays --- cunumeric/deferred.py | 14 +- src/cunumeric/cunumeric_c.h | 2 +- tests/advanced_indexing.py | 591 ++++++++++++++++++++++++++++++++++++ tests/index_routines.py | 434 -------------------------- 4 files changed, 598 insertions(+), 443 deletions(-) create mode 100644 tests/advanced_indexing.py diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5a4a0ef93..948f83186 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -444,12 +444,6 @@ def _create_indexing_array(self, key, is_set=False): return False, rhs, out if isinstance(key, NumPyThunk): - # the use case when index array ndim >input array ndim - if key.ndim > store.ndim: - diff = store.ndim - key.ndim - for i in range(diff): - store = store.promote(i + 1, store.shape[0]) - key = (key,) assert isinstance(key, tuple) @@ -487,7 +481,6 @@ def _create_indexing_array(self, key, is_set=False): ) key_transpose_indices += post_indices store = store.transpose(transpose_indices) - key = tuple(key[i] for i in key_transpose_indices) @@ -531,7 +524,10 @@ def _create_indexing_array(self, key, is_set=False): output_arr = rhs._zip_indices(start_index, tuple_of_arrays) return True, rhs, output_arr elif len(tuple_of_arrays) == 1 and rhs.ndim == 1: - return True, rhs, tuple_of_arrays[0] + key = tuple_of_arrays[0] + if key.base.transformed: + key, key_store = key._copy_store(key.base) + return True, rhs, key else: raise ValueError("Advance indexing dimention mismatch") @@ -659,6 +655,8 @@ def set_item(self, key, rhs): rhs_tmp = rhs._broadcast(index_array.base.shape) 
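For reference, the zip task whose shape checks are relaxed above has simple semantics: it bundles N int64 index arrays, element by element, into a single field of N-dimensional points that a Copy can then use as its indirection. A rough Python model of that output, using plain tuples as a stand-in for Legion::Point (the function name zip_indices here is illustrative, not the runtime API):

    import numpy as np

    def zip_indices(*index_arrays):
        # broadcast all index arrays to a common shape, then bundle the
        # coordinates at each position into one N-tuple ("point")
        shape = np.broadcast_shapes(*(a.shape for a in index_arrays))
        arrays = [np.broadcast_to(a, shape) for a in index_arrays]
        out = np.empty(shape, dtype=object)
        for pos in np.ndindex(shape):
            out[pos] = tuple(int(a[pos]) for a in arrays)
        return out

    i0 = np.array([[0, 1], [1, 0], [0, 0]])
    i1 = np.array([[3, 2], [1, 0], [3, 2]])
    points = zip_indices(i0, i1)
    assert points[1, 0] == (1, 1)  # the coordinates (i0[1, 0], i1[1, 0])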
rhs_tmp, rhs = rhs._copy_store(rhs_tmp) else: + if rhs.base.transformed: + rhs, rhs_base = rhs._copy_store(rhs.base) rhs = rhs.base copy = self.context.create_copy() diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index f270cc247..fc95446c4 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -189,7 +189,7 @@ enum CuNumericBounds { // Match these to CuNumericTypeCodes in config.py enum CuNumericTypeCodes { - CUNUMERIC_TYPE_POINT1 = LEGION_TYPE_TOTAL + 1, + CUNUMERIC_TYPE_POINT1 = MAX_TYPE_NUMBER + 1, CUNUMERIC_TYPE_POINT2, CUNUMERIC_TYPE_POINT3, CUNUMERIC_TYPE_POINT4, diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py new file mode 100644 index 000000000..f0e478eb9 --- /dev/null +++ b/tests/advanced_indexing.py @@ -0,0 +1,591 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import random + +import numpy as np +from test_tools.generators import mk_seq_array + +import cunumeric as num +from legate.core import LEGATE_MAX_DIM + + +def test(): + + # tests on 1D input array: + print("advanced indexing test 1") + + # a: simple 1D test + x = np.array([1, 2, 3, 4, 5, 6, 7]) + indx = np.array([1, 3, 5]) + res = x[indx] + x_num = num.array(x) + indx_num = num.array(indx) + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # b: after base array transformation: + xt = x[1:] + xt_num = x_num[1:] + res = xt[indx] + res_num = xt_num[indx_num] + assert np.array_equal(res, res_num) + + # c: after index array transformation: + indxt = indx[1:] + indxt_num = indx_num[1:] + res = x[indxt] + res_num = x_num[indxt_num] + assert np.array_equal(res, res_num) + + # d: test in-place assignment with scalar: + x[indx] = 13 + x_num[indx_num] = 13 + assert np.array_equal(x, x_num) + + # e: test in-place assignment with array: + xt[indx] = np.array([3, 5, 7]) + xt_num[indx_num] = num.array([3, 5, 7]) + assert np.array_equal(xt, xt_num) + assert np.array_equal(x, x_num) + + # f: test in-place assignment with transformed rhs array: + b = np.array([3, 5, 7, 8]) + b_num = num.array([3, 5, 7, 8]) + bt = b[1:] + bt_num = b_num[1:] + x[indx] = bt + x_num[indx_num] = bt_num + assert np.array_equal(x, x_num) + + # g: test in-place assignment with transformed + # rhs and lhs arrays: + b = np.array([3, 5, 7, 8]) + b_num = num.array([3, 5, 7, 8]) + b1 = b[1:] + b1_num = b_num[1:] + xt[indx] = b1 + xt_num[indx_num] = b1_num + assert np.array_equal(xt, xt_num) + assert np.array_equal(x, x_num) + + # h: in-place assignment with transformed index array: + b = np.array([5, 7]) + b_num = num.array([5, 7]) + x[indxt] = b + x_num[indxt_num] = b_num + assert np.array_equal(x, x_num) + + # i: the case when index.ndim > input.ndim: + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(x[index], x_num[index_num]) + + # j: test for bool array of the same dimension + index = np.array([True, False, False, True, True, False, True]) + index_num = num.array(index) + 
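The boolean case j being set up here relies on a NumPy rule worth restating: a boolean index with the same shape as the array selects the True positions and flattens them into a 1-D result, which is exactly the case the ADVANCED_INDEXING task with an unbound output store handles. In plain NumPy:

    import numpy as np

    x = np.array([1, 2, 3, 4, 5, 6, 7])
    mask = np.array([True, False, False, True, True, False, True])
    assert x[mask].shape == (int(mask.sum()),)  # one element per True entry
    assert np.array_equal(x[mask], np.array([1, 4, 5, 7]))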
assert np.array_equal(x[index], x_num[index_num]) + + # k: test in-place assignment fir the case when idx arr + # is 1d bool array: + x[index] = 3 + x_num[index_num] = 3 + assert np.array_equal(x, x_num) + + # l: test when type of a base array is different from int: + x_float = x.astype(float) + x_num_float = x_num.astype(float) + index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) + index_num = num.array(index) + assert np.array_equal(x_float[index], x_num_float[index_num]) + + # m: test when type of the index array is not int64 + index = np.array([1, 3, 5], dtype=np.int16) + index_num = num.array(index) + assert np.array_equal(x[index], x_num[index_num]) + + # n: the case when rhs is a different type + x[index] = 3.5 + x_num[index_num] = 3.5 + assert np.array_equal(x, x_num) + + # o: the case when rhs is an array of different type + b = np.array([2.1, 3.3, 7.2]) + b_num = num.array(b) + x[index] = b + x_num[index_num] = b_num + assert np.array_equal(x, x_num) + + # p: in-place assignment where some indices point to the + # same location: + index = np.array([2, 4, 0, 4, 4, 4]) + index_num = num.array(index) + x[index] = 0 + x_num[index_num] = 0 + assert np.array_equal(x, x_num) + + # q: in-place assignment in the case when broadcast is needed: + index = np.array([[1, 4, 3], [2, 0, 5]]) + index_num = num.array(index) + x[index] = np.array([[1, 2, 3]]) + x_num[index_num] = num.array([[1, 2, 3]]) + assert np.array_equal(x, x_num) + + # Nd cases + print("advanced indexing test 2") + + x = mk_seq_array(np, (2, 3, 4, 5)) + x_num = mk_seq_array(num, (2, 3, 4, 5)) + xt = x.transpose( + ( + 1, + 0, + 2, + 3, + ) + ) + xt_num = x_num.transpose( + ( + 1, + 0, + 2, + 3, + ) + ) + + # a: 1d index array passed to a different indices: + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + res = xt[indx] + res_num = xt_num[indx_num] + assert np.array_equal(res, res_num) + + res = x[:, :, indx] + res_num = x_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = xt[:, :, indx] + res_num = xt_num[:, :, indx_num] + assert np.array_equal(res, res_num) + + res = x[:, :, :, indx] + res_num = x_num[:, :, :, indx_num] + assert np.array_equal(res, res_num) + + res = xt[:, :, :, indx] + res_num = xt_num[:, :, :, indx_num] + assert np.array_equal(res, res_num) + + res = x[:, indx, :] + res_num = x_num[:, indx_num, :] + assert np.array_equal(res, res_num) + + res = xt[:, indx, :] + res_num = xt_num[:, indx_num, :] + assert np.array_equal(res, res_num) + + # b : 2 1d index arrays passed + indx0 = np.array([1, 1]) + indx1 = np.array([1, 0]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = x[indx0, indx1] + res_num = x_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[indx0, indx1] + res_num = xt_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[:, indx0, indx1] + res_num = x_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[:, indx0, indx1] + res_num = xt_num[:, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + # c: 2 index arrays passed in a sparse way: + res = x[:, [0, 1], :, [0, 1]] + res_num = x_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = xt[:, [0, 1], :, [0, 1]] + res_num = xt_num[:, [0, 1], :, [0, 1]] + assert np.array_equal(res, res_num) + + res = x[[0, 1], :, [0, 1], 1:] + res_num = x_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = 
xt[[0, 1], :, [0, 1], 1:] + res_num = xt_num[[0, 1], :, [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + res = xt[:, [0, 1], :, 1:] + res_num = xt_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + z = x + z_num = x_num + z[[0, 1], [0, 1]] = 11 + z_num[[0, 1], [0, 1]] = 11 + assert np.array_equal(z, z_num) + + # d: newaxis is passed along with array: + + res = x[..., [1, 0]] + res_num = x_num[..., [1, 0]] + assert np.array_equal(res, res_num) + + res = xt[..., [0, 1], 1:] + res_num = xt_num[..., [0, 1], 1:] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], [1, 1]] + res_num = x_num[..., [0, 1], [1, 1]] + assert np.array_equal(res, res_num) + + # e: index arrays that have different shape: + indx0 = np.array([1, 1]) + indx1 = np.array([[1, 0], [1, 0]]) + indx0_num = num.array(indx0) + indx1_num = num.array(indx1) + res = x[indx0, indx1] + res_num = x_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[indx0, indx1] + res_num = xt_num[indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[indx0, indx1, indx0, indx1] + res_num = x_num[indx0_num, indx1_num, indx0_num, indx1_num] + assert np.array_equal(res, res_num) + + res = x[indx0, :, indx1] + res_num = x_num[indx0_num, :, indx1_num] + assert np.array_equal(res, res_num) + + res = xt[:, indx0, indx1, 1:] + res_num = xt_num[:, indx0_num, indx1_num, 1:] + assert np.array_equal(res, res_num) + + # f: single boolean array passed: + indx_bool = np.array([True, False]) + indx_bool_num = num.array(indx_bool) + res = x[indx_bool] + res_num = x_num[indx_bool_num] + assert np.array_equal(res, res_num) + + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + res = x[:, indx_bool] + res_num = x_num[:, indx_bool_num] + assert np.array_equal(res, res_num) + + # on the transposed base + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + res = xt[indx_bool] + res_num = xt_num[indx_bool_num] + assert np.array_equal(res, res_num) + + indx_bool = np.array([True, False, True, False, False]) + indx_bool_num = num.array(indx_bool) + res = x[..., indx_bool] + res_num = x_num[..., indx_bool_num] + assert np.array_equal(res, res_num) + + print("IRINA DEBUG 1") + indx1_bool = np.array([True, False]) + indx1_bool_num = num.array(indx1_bool) + indx2_bool = np.array([True, False, True, True]) + indx2_bool_num = num.array(indx2_bool) + res = x[indx1_bool, :, indx2_bool] + print(res.shape) + print(res) + res_num = x_num[indx1_bool_num, :, indx2_bool_num] + print(res_num.shape) + print(res_num) + assert np.array_equal(res, res_num) + + print("IRINA DEBUG 2") + res = x[indx1_bool, 1, indx2_bool] + # res_num = x_num[indx1_bool_num, 1, indx2_bool_num] + # print(res.shape) + # print(res_num.shape) + # assert np.array_equal(res, res_num) + + # g: boolean array with the same shape is passed to x: + indx = x % 2 + indx = indx.astype(bool) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + # h: inplace assignment with bool arays + z = x + z_num = x_num + z[indx] = 1 + z_num[indx_num] = 1 + assert np.array_equal(z, z_num) + print("IRINA DEBUG 3") + + indx_bool = np.array([True, False, True]) + indx_bool_num = num.array(indx_bool) + z[:, indx_bool] = 5 + z_num[:, indx_bool_num] = 5 + assert np.array_equal(z, z_num) + + print("IRINA DEBUG 4") + # i: two bool array of the same shape are 
passed: + x = mk_seq_array( + np, + ( + 3, + 4, + 3, + 4, + ), + ) + x_num = mk_seq_array( + num, + ( + 3, + 4, + 3, + 4, + ), + ) + indx = np.array( + [ + [True, False, False, False], + [False, False, False, False], + [False, False, False, True], + ] + ) + indx_num = num.array(indx) + res = x[indx, indx] + print("IRINA DEBUG res = ", res.shape) + # res_num = x_num[indx_num, indx_num] + # assert np.array_equal(res, res_num) + + # j: 2 bool arrays should be broadcasted: + # res = x[idx, [True,False,False]] + # res_num = x_num[idx_num, [True,False,False]] + + # 2d bool array not at the first index: + indx = np.full((4, 3), True) + indx_num = num.array(indx) + res = x[:, indx] + # res_num = x_num[:, indx] + # assert np.array_equal(res, res_num) + + # 3: testing mixed type of the arguments passed: + + # a: bool and index arrays + x = mk_seq_array( + np, + ( + 2, + 3, + 4, + 5, + ), + ) + x_num = mk_seq_array( + num, + ( + 2, + 3, + 4, + 5, + ), + ) + res = x[[1, 1], [False, True, False]] + # res_num = x_num[[1,1], [False, True,False]] + # assert np.array_equal(res, res_num) + + res = x[[1, 1], :, [False, True, False, True]] + res_num = x_num[[1, 1], :, [False, True, False, True]] + assert np.array_equal(res, res_num) + + # b: combining basic and advanced indexing schemes + ind0 = np.array([1, 1]) + ind0_num = num.array(ind0) + res = x[ind0, :, -1] + res_num = x_num[ind0_num, :, -1] + assert np.array_equal(res, res_num) + + res = x[ind0, :, 1:3] + res_num = x_num[ind0_num, :, 1:3] + assert np.array_equal(res, res_num) + + res = x[1, :, ind0] + res_num = x_num[1, :, ind0_num] + assert np.array_equal(res, res_num) + + x = mk_seq_array(np, (3, 4, 5, 6)) + x_num = mk_seq_array(num, (3, 4, 5, 6)) + res = x[[0, 1], [0, 1], :, 2] + res_num = x_num[[0, 1], [0, 1], :, 2] + assert np.array_equal(res, res_num) + + res = x[..., [0, 1], 2] + res_num = x_num[..., [0, 1], 2] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, -1] + res_num = x_num[:, [0, 1], :, -1] + assert np.array_equal(res, res_num) + + res = x[:, [0, 1], :, 1:] + res_num = x_num[:, [0, 1], :, 1:] + assert np.array_equal(res, res_num) + + # c: transformed base: + z = x[:, 1:] + z_num = x_num[:, 1:] + indx = np.array([1, 1]) + indx_num = num.array(indx) + res = z[indx] + res_num = z_num[indx_num] + assert np.array_equal(res, res_num) + + # d: shape mismatch case: + x = np.array( + [ + [0.38, -0.16, 0.38, -0.41, -0.04], + [-0.47, -0.01, -0.18, -0.5, -0.49], + [0.02, 0.4, 0.33, 0.33, -0.13], + ] + ) + x_num = num.array(x) + + indx = np.ones((2, 2, 2), dtype=int) + indx_num = num.array(indx) + res = x[indx] + res_num = x_num[indx_num] + assert np.array_equal(res, res_num) + + x = np.ones( + ( + 3, + 4, + ), + dtype=int, + ) + x_num = num.array(x) + ind = np.full((4,), True) + ind_num = num.array(ind) + res = x[:, ind] + res_num = x_num[:, ind_num] + assert np.array_equal(res, res_num) + + if LEGATE_MAX_DIM > 7: + x = np.ones((2, 3, 4, 5, 3, 4)) + ind1 = np.full((3, 4), True) + ind2 = np.full((3, 4), True) + x_num = num.array(x) + ind1_num = num.array(ind1) + ind2_num = num.array(ind2) + res = x[:, ind1, :, ind2] + res_num = x[:, ind1_num, :, ind2_num] + assert np.array_equal(res, res_num) + + # e: type mismatch case: + x = np.ones((3, 4)) + x_num = num.array(x) + ind = np.full((3,), 1, dtype=np.int32) + ind_num = num.array(ind) + res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + + x = np.ones((3, 4), dtype=float) + x_num = num.array(x) + ind = np.full((3,), 1) + ind_num = num.array(ind) + 
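What the type-mismatch case e checks, in plain NumPy terms: the selection is independent of the integer index width, which is why the _convert_key change earlier in the series can safely cast non-int64 keys to int64 behind a RuntimeWarning. A small sketch of the equivalence being relied on:

    import numpy as np

    x = np.arange(12, dtype=float).reshape(3, 4)
    i32 = np.full((3,), 1, dtype=np.int32)
    i64 = i32.astype(np.int64)
    # same selection either way; only the index dtype differs
    assert np.array_equal(x[i32, i32], x[i64, i64])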
res = x[ind, ind] + res_num = x_num[ind_num, ind_num] + assert np.array_equal(res, res_num) + + x[ind, ind] = 5 + x_num[ind_num, ind_num] = 5 + assert np.array_equal(x, x_num) + + # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by + # 1 when passig 2d index array + for ndim in range(2, LEGATE_MAX_DIM): + a_shape = tuple(random.randint(2, 5) for i in range(ndim)) + np_array = mk_seq_array(np, a_shape) + num_array = mk_seq_array(num, a_shape) + # check when N of index arrays == N of dims + num_tuple_of_indices = tuple() + np_tuple_of_indices = tuple() + for i in range(ndim): + i_shape = (2, 4) + idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i] + idx_arr_num = num.array(idx_arr_np) + np_tuple_of_indices += (idx_arr_np,) + num_tuple_of_indices += (idx_arr_num,) + assert np.array_equal( + np_array[np_tuple_of_indices], num_array[num_tuple_of_indices] + ) + # check when N of index arrays == N of dims + i_shape = (2, 2) + idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) + # test in-place assignment + np_array[idx_arr_np] = 2 + num_array[idx_arr_num] = 2 + assert np.array_equal(num_array, np_array) + idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) + idx_arr_num = num.array(idx_arr_np) + assert np.array_equal( + np_array[:, idx_arr_np], num_array[:, idx_arr_num] + ) + # test in-place assignment + np_array[:, idx_arr_np] = 3 + num_array[:, idx_arr_num] = 3 + assert np.array_equal(num_array, np_array) + if ndim > 2: + assert np.array_equal( + np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] + ) + assert np.array_equal( + np_array[:, idx_arr_np, idx_arr_np], + num_array[:, idx_arr_num, idx_arr_num], + ) + if ndim > 3: + assert np.array_equal( + np_array[:, idx_arr_np, :, idx_arr_np], + num_array[:, idx_arr_num, :, idx_arr_num], + ) + + +if __name__ == "__main__": + test() diff --git a/tests/index_routines.py b/tests/index_routines.py index abd3b9536..3f5344df5 100644 --- a/tests/index_routines.py +++ b/tests/index_routines.py @@ -24,438 +24,6 @@ from legate.core import LEGATE_MAX_DIM -def advanced_indexing(): - # simple advanced indexing: - print("advanced indexing test 1") - x = np.array([1, 2, 3, 4, 5, 6, 7]) - indx = np.array([1, 3, 5]) - res = x[indx] - x_num = num.array(x) - indx_num = num.array(indx) - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # after transformation: - x = x[1:] - x_num = x_num[1:] - res = x[indx] - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # advanced indexing test when a.ndim ==1 , indx.ndim >1 - print("advanced indexing test 2") - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([[1, 0, 1, 3, 0, 0], [2, 4, 0, 4, 4, 4]]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - - # simple 2D case - print("advanced indexing test 3") - index_2d = np.array([[1, 2, 0], [5, 5, 5], [2, 3, 4]]) - index_2d_num = num.array(index_2d) - assert np.array_equal(y[index_2d], y_num[index_2d_num]) - - z = np.array( - [ - [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], - [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]], - ] - ) - z_num = num.array(z) - - zt = z.transpose( - ( - 1, - 0, - 2, - ) - ) - zt_num = z_num.transpose( - ( - 1, - 0, - 2, - ) - ) - - # mismatch dimesion case: - print("advanced indexing test 4") - indx = np.array([1, 1]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert 
np.array_equal(res, res_num) - - res = zt[indx] - res_num = zt_num[indx_num] - assert np.array_equal(res, res_num) - - res = z[:, :, indx] - res_num = z_num[:, :, indx_num] - assert np.array_equal(res, res_num) - - res = zt[:, :, indx] - res_num = zt_num[:, :, indx_num] - assert np.array_equal(res, res_num) - - res = z[:, indx, :] - res_num = z_num[:, indx_num, :] - assert np.array_equal(res, res_num) - - res = zt[:, indx, :] - res_num = zt_num[:, indx_num, :] - assert np.array_equal(res, res_num) - - # 2d: - indx = np.array([[1, 1], [1, 0]]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert np.array_equal(res, res_num) - - res = zt[indx] - res_num = zt_num[indx_num] - assert np.array_equal(res, res_num) - - res = z[:, indx] - res_num = z_num[:, indx_num] - assert np.array_equal(res, res_num) - - res = zt[:, indx] - res_num = zt_num[:, indx_num] - assert np.array_equal(res, res_num) - - # 2 arrays passed to 3d array - indx0 = np.array([1, 1]) - indx1 = np.array([1, 0]) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - res = z[indx0, indx1] - res_num = z_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[indx0, indx1] - res_num = zt_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = z[:, indx0, indx1] - res_num = z_num[:, indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[:, indx0, indx1] - res_num = zt_num[:, indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - # 2 index arrays passed in a sparse way: - x = mk_seq_array(np, (3, 4, 5, 6)) - x_num = mk_seq_array(num, (3, 4, 5, 6)) - res = x[:, [0, 1], :, [0, 1]] - res_num = x_num[:, [0, 1], :, [0, 1]] - assert np.array_equal(res, res_num) - - res = x[[0, 1], :, [0, 1], 1:] - res_num = x_num[[0, 1], :, [0, 1], 1:] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, 1:] - res_num = x_num[:, [0, 1], :, 1:] - assert np.array_equal(res, res_num) - - # 2 arrays with broadcasting - indx0 = np.array([1, 1]) - indx1 = np.array([[1, 0], [1, 0]]) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - res = z[indx0, indx1] - res_num = z_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - res = zt[indx0, indx1] - res_num = zt_num[indx0_num, indx1_num] - assert np.array_equal(res, res_num) - - # mismatch dimesion case bool: - indx_bool = np.array([True, False]) - indx_bool_num = num.array(indx_bool) - res = z[indx_bool] - res_num = z_num[indx_bool_num] - assert np.array_equal(res, res_num) - - # test for bool array of the same dimension - print("advanced indexing test 5") - index = np.array([True, False, False, True, True, False]) - index_num = num.array(index) - assert np.array_equal(y[index], y_num[index_num]) - - # test in-place assignment fir the case when idx arr - # is 1d bool array: - y[index] = 3 - y_num[index_num] = 3 - assert np.array_equal(y, y_num) - - # test for bool array of the same dimension 2D - print("advanced indexing test 6") - indx_bool = np.array( - [ - [ - [False, True, False, False], - [True, True, False, False], - [True, False, True, False], - ], - [ - [False, True, False, False], - [True, True, False, False], - [True, False, True, False], - ], - ] - ) - indx_bool_num = num.array(indx_bool) - res = z[indx_bool] - res_num = z_num[indx_bool_num] - assert np.array_equal(res, res_num) - - # test in-place assignment fir the case when idx arr - # is 2d bool array: - z[indx_bool] = 1 - z_num[indx_bool] = 1 - assert np.array_equal(z, z_num) - - # test mixed data - 
print("advanced indexing test 7") - res = z[:, -1] - res_num = z_num[:, -1] - assert np.array_equal(res, res_num) - - # case when multiple number of arays is passed - print("advanced indexing test 8") - indx0 = np.array([[0, 1], [1, 0], [0, 0]]) - indx1 = np.array([[0, 1], [2, 0], [1, 2]]) - indx2 = np.array([[3, 2], [1, 0], [3, 2]]) - - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - indx2_num = num.array(indx2) - - res = z_num[indx0_num, indx1_num, indx2_num] - res_np = z[indx0, indx1, indx2] - assert np.array_equal(res, res_np) - - # test in-place assignment fir the case when - # several index arrays passed - z_num[indx0_num, indx1_num, indx2_num] = -2 - z[indx0, indx1, indx2] = -2 - assert np.array_equal(z, z_num) - - # indices with broadcast: - print("advanced indexing test 9") - indx0 = np.array([[0, 1], [1, 0], [0, 0]]) - indx1 = np.array([[0, 1]]) - indx2 = np.array([[3, 2], [1, 0], [3, 2]]) - - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - indx2_num = num.array(indx2) - res = z_num[indx0_num, indx1_num, indx2_num] - res_np = z[indx0, indx1, indx2] - assert np.array_equal(res, res_np) - - # Combining Basic and Advanced Indexing Schemes: - print("advanced indexing test 10") - ind0 = np.array([1, 1]) - ind0_num = num.array(ind0) - res = z[ind0, :, -1] - res_num = z_num[ind0_num, :, -1] - assert np.array_equal(res, res_num) - - res = z[ind0, :, [False, True, False, True]] - res_num = z_num[ind0_num, :, [False, True, False, True]] - assert np.array_equal(res, res_num) - - res = z[ind0, :, ind0] - res_num = z_num[ind0_num, :, ind0_num] - assert np.array_equal(res, res_num) - - res = z[ind0, :, 1:3] - res_num = z_num[ind0_num, :, 1:3] - assert np.array_equal(res, res_num) - - res = z[1, :, ind0] - res_num = z_num[1, :, ind0_num] - assert np.array_equal(res, res_num) - - x = mk_seq_array(np, (3, 4, 5, 6)) - x_num = mk_seq_array(num, (3, 4, 5, 6)) - res = x[[0, 1], [0, 1], :, 2] - res_num = x_num[[0, 1], [0, 1], :, 2] - assert np.array_equal(res, res_num) - - res = x[..., [0, 1], 2] - res_num = x_num[..., [0, 1], 2] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, -1] - res_num = x_num[:, [0, 1], :, -1] - assert np.array_equal(res, res_num) - - res = x[:, [0, 1], :, 1:] - res_num = x_num[:, [0, 1], :, 1:] - assert np.array_equal(res, res_num) - - # In-Place & Augmented Assignments via Advanced Indexing - # simple 1d case - y = np.array([0, -1, -2, -3, -4, -5]) - y_num = num.array(y) - index = np.array([2, 4, 0, 4, 4, 4]) - index_num = num.array(index) - y[index] = 0 - y_num[index_num] = 0 - assert np.array_equal(y, y_num) - - y[index] = np.array([1, 2, 3, 4, 5, 6]) - y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) - print(y) - print(y_num) - # Order on which data is updated in case when indexing array points to the - # same daya in the original array is not guaranteed, so we can't call - # assert np.array_equal(y, y_num) here - - index = np.array([1, 4, 3, 2, 0, 5]) - index_num = num.array(index) - y[index] = np.array([1, 2, 3, 4, 5, 6]) - y_num[index_num] = num.array([1, 2, 3, 4, 5, 6]) - print(y) - print(y_num) - assert np.array_equal(y, y_num) - - # the case when broadcast is needed: - index = np.array([[1, 4, 3], [2, 0, 5]]) - index_num = num.array(index) - y[index] = np.array([[1, 2, 3]]) - y_num[index_num] = num.array([[1, 2, 3]]) - print(y) - print(y_num) - assert np.array_equal(y, y_num) - - # 2D test - x = np.array( - [ - [0.38, -0.16, 0.38, -0.41, -0.04], - [-0.47, -0.01, -0.18, -0.5, -0.49], - [0.02, 0.4, 0.33, 0.33, -0.13], - ] - ) 
- indx0 = np.array([0, 1]) - indx1 = np.array([1, 2]) - x_num = num.array(x) - indx0_num = num.array(indx0) - indx1_num = num.array(indx1) - x[indx0, indx1] = 2.0 - x_num[indx0_num, indx1_num] = 2.0 - assert np.array_equal(x, x_num) - - # shape mismatch: - indx = np.ones((2, 2, 2), dtype=int) - indx_num = num.array(indx) - res = x[indx] - res_num = x_num[indx_num] - assert np.array_equal(res, res_num) - - # use case when advanced indexing is called on a transformed array: - print("advanced indexing test 11") - z = z[:, 1:] - z_num = z_num[:, 1:] - indx = np.array([1, 1]) - indx_num = num.array(indx) - res = z[indx] - res_num = z_num[indx_num] - assert np.array_equal(res, res_num) - - # in-place assignment - z[indx] = 10 - z_num[indx_num] = 10 - assert np.array_equal(z, z_num) - - x = np.ones((3, 4)) - x_num = num.array(x) - ind = np.full((4,), True) - ind_num = num.array(ind) - res = x[:, ind] - res_num = x_num[:, ind_num] - assert np.array_equal(res, res_num) - - if LEGATE_MAX_DIM > 7: - x = np.ones((2, 3, 4, 5, 3, 4)) - ind1 = np.full((3, 4), True) - ind2 = np.full((3, 4), True) - x_num = num.array(x) - ind1_num = num.array(ind1) - ind2_num = num.array(ind2) - res = x[:, ind1, :, ind2] - res_num = x[:, ind1_num, :, ind2_num] - assert np.array_equal(res, res_num) - - x = np.ones((3, 4)) - x_num = num.array(x) - ind = np.full((3,), 1, dtype=np.int32) - ind_num = num.array(ind) - res = x[ind, ind] - res_num = x_num[ind_num, ind_num] - assert np.array_equal(res, res_num) - - # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by - # 1 when passig 2d index array - for ndim in range(2, LEGATE_MAX_DIM): - a_shape = tuple(random.randint(2, 5) for i in range(ndim)) - np_array = mk_seq_array(np, a_shape) - num_array = mk_seq_array(num, a_shape) - # check when N of index arrays == N of dims - num_tuple_of_indices = tuple() - np_tuple_of_indices = tuple() - for i in range(ndim): - i_shape = (2, 4) - idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[i] - idx_arr_num = num.array(idx_arr_np) - np_tuple_of_indices += (idx_arr_np,) - num_tuple_of_indices += (idx_arr_num,) - assert np.array_equal( - np_array[np_tuple_of_indices], num_array[num_tuple_of_indices] - ) - # check when N of index arrays == N of dims - i_shape = (2, 2) - idx_arr_np = mk_seq_array(np, i_shape) % np_array.shape[0] - idx_arr_num = num.array(idx_arr_np) - assert np.array_equal(np_array[idx_arr_np], num_array[idx_arr_num]) - # test in-place assignment - np_array[idx_arr_np] = 2 - num_array[idx_arr_num] = 2 - assert np.array_equal(num_array, np_array) - idx_arr_np = np.array([[1, 0, 1], [1, 1, 0]]) - idx_arr_num = num.array(idx_arr_np) - assert np.array_equal( - np_array[:, idx_arr_np], num_array[:, idx_arr_num] - ) - # test in-place assignment - np_array[:, idx_arr_np] = 3 - num_array[:, idx_arr_num] = 3 - assert np.array_equal(num_array, np_array) - if ndim > 2: - assert np.array_equal( - np_array[1, :, idx_arr_np], num_array[1, :, idx_arr_num] - ) - assert np.array_equal( - np_array[:, idx_arr_np, idx_arr_np], - num_array[:, idx_arr_num, idx_arr_num], - ) - if ndim > 3: - assert np.array_equal( - np_array[:, idx_arr_np, :, idx_arr_np], - num_array[:, idx_arr_num, :, idx_arr_num], - ) - - return - - def test(): # -------------------------------------------------------------- # choose operator @@ -624,8 +192,6 @@ def test(): fn = np.diag(en, k=k) assert np.array_equal(f, fn) - advanced_indexing() - return From 502a4b25a9d8769a2d1e68f068ea590765de8317 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 
From 502a4b25a9d8769a2d1e68f068ea590765de8317 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 09:40:30 -0600
Subject: [PATCH 26/33] removing unnecessary call to the FILL task

---
 cunumeric/deferred.py | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 948f83186..a8edd089d 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -636,28 +636,14 @@ def set_item(self, key, rhs):
             copy_needed, lhs, index_array = self._create_indexing_array(
                 key, True
             )
-            # TODO: remove rhs.ndim ==0 logic when issue with scalars not being
-            # type of Store is addressed
-            if rhs.ndim == 0:
-                rhs_tmp = self.runtime.create_empty_thunk(
-                    index_array.base.shape,
-                    self.dtype,
-                    inputs=[],
-                )
-                task = self.context.create_task(CuNumericOpCode.FILL)
-                task.add_output(rhs_tmp.base)
-                task.add_input(rhs.base)
-                task.add_scalar_arg(False, bool)
-                task.execute()
-                rhs = rhs_tmp.base
+            rhs = self.runtime.to_deferred_array(rhs)
+            if rhs.shape != index_array.shape:
+                rhs_tmp = rhs._broadcast(index_array.base.shape)
+                rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
             else:
-                if rhs.shape != index_array.shape:
-                    rhs_tmp = rhs._broadcast(index_array.base.shape)
-                    rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
-                else:
-                    if rhs.base.transformed:
-                        rhs, rhs_base = rhs._copy_store(rhs.base)
-                    rhs = rhs.base
+                if rhs.base.transformed:
+                    rhs, rhs_base = rhs._copy_store(rhs.base)
+                rhs = rhs.base

             copy = self.context.create_copy()
             copy.add_input(rhs)
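A NumPy-level sketch of the set_item path after this patch (illustrative only, with ndarrays standing in for deferred stores): instead of a FILL task materializing a scalar right-hand side, the right-hand side is broadcast to the shape of the index array and the broadcast view is packed into a real buffer before it feeds the scatter copy. np.broadcast_to and np.ascontiguousarray stand in for _broadcast and _copy_store here:

    import numpy as np

    index_shape = (2, 3)
    rhs_scalar = np.float64(5.0)

    src = np.broadcast_to(rhs_scalar, index_shape)  # no data copied yet
    src = np.ascontiguousarray(src)                 # packed copy, like _copy_store

    assert src.shape == index_shape and (src == 5.0).all()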
From 95ae53d0c8d328f0e42423760f667d30a61bf0c8 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 14:24:08 -0600
Subject: [PATCH 27/33] fixed the logic for transposing the base array when
 bool arrays are passed as indices

---
 cunumeric/deferred.py      |  18 ++++--
 tests/advanced_indexing.py | 126 +++++++++++++++++++++++++++--------
 2 files changed, 113 insertions(+), 31 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index a8edd089d..33a3a3b4d 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -465,8 +465,18 @@ def _create_indexing_array(self, key, is_set=False):
             if np.isscalar(k) or isinstance(k, NumPyThunk):
                 if start_index == -1:
                     start_index = dim
-                transpose_indices += (dim,)
+                key_transpose_indices += (dim,)
                 transpose_needed = transpose_needed or ((dim - last_index) > 1)
+                if (
+                    isinstance(k, NumPyThunk)
+                    and k.dtype == np.bool
+                    and k.ndim >= 2
+                ):
+                    for i in range(dim, dim + k.ndim):
+                        transpose_indices += (shift + i,)
+                    shift += k.ndim - 1
+                else:
+                    transpose_indices += (dim,)
                 last_index = dim

         if transpose_needed:
@@ -474,7 +484,6 @@ def _create_indexing_array(self, key, is_set=False):
             post_indices = tuple(
                 i for i in range(store.ndim) if i not in transpose_indices
             )
-            key_transpose_indices = transpose_indices
             transpose_indices += post_indices
             post_indices = tuple(
                 i for i in range(len(key)) if i not in key_transpose_indices
@@ -483,8 +492,8 @@ def _create_indexing_array(self, key, is_set=False):
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)

-        for d, k in enumerate(key):
-            dim = d
+        shift = 0
+        for dim, k in enumerate(key):
             if np.isscalar(k):
                 if k < 0:
                     k += store.shape[dim + shift]
@@ -506,6 +515,7 @@ def _create_indexing_array(self, key, is_set=False):
                     # in case of the mixed indices we call nonzero
                     # for the bool array
                     k = k.nonzero()
+                    shift += len(k) - 1
                     tuple_of_arrays += k
                 else:
                     tuple_of_arrays += (k,)
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index f0e478eb9..2bd2c2c16 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -246,11 +246,9 @@ def test():
     res_num = xt_num[:, [0, 1], :, 1:]
     assert np.array_equal(res, res_num)

-    z = x
-    z_num = x_num
-    z[[0, 1], [0, 1]] = 11
-    z_num[[0, 1], [0, 1]] = 11
-    assert np.array_equal(z, z_num)
+    x[[0, 1], [0, 1]] = 11
+    x_num[[0, 1], [0, 1]] = 11
+    assert np.array_equal(x, x_num)

     # d: newaxis is passed along with array:

@@ -258,6 +256,22 @@ def test():
     res = x[..., [1, 0]]
     res_num = x_num[..., [1, 0]]
     assert np.array_equal(res, res_num)

+    xt = x.transpose(
+        (
+            1,
+            3,
+            0,
+            2,
+        )
+    )
+    xt_num = x_num.transpose(
+        (
+            1,
+            3,
+            0,
+            2,
+        )
+    )
     res = xt[..., [0, 1], 1:]
     res_num = xt_num[..., [0, 1], 1:]
     assert np.array_equal(res, res_num)
@@ -317,25 +331,17 @@ def test():
     res_num = x_num[..., indx_bool_num]
     assert np.array_equal(res, res_num)

-    print("IRINA DEBUG 1")
     indx1_bool = np.array([True, False])
     indx1_bool_num = num.array(indx1_bool)
     indx2_bool = np.array([True, False, True, True])
     indx2_bool_num = num.array(indx2_bool)
     res = x[indx1_bool, :, indx2_bool]
-    print(res.shape)
-    print(res)
     res_num = x_num[indx1_bool_num, :, indx2_bool_num]
-    print(res_num.shape)
-    print(res_num)
     assert np.array_equal(res, res_num)

-    print("IRINA DEBUG 2")
     res = x[indx1_bool, 1, indx2_bool]
-    # res_num = x_num[indx1_bool_num, 1, indx2_bool_num]
-    # print(res.shape)
-    # print(res_num.shape)
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[indx1_bool_num, 1, indx2_bool_num]
+    assert np.array_equal(res, res_num)

     # g: boolean array with the same shape is passed to x:
     indx = x % 2
@@ -351,7 +357,6 @@ def test():
     z[indx] = 1
     z_num[indx_num] = 1
     assert np.array_equal(z, z_num)
-    print("IRINA DEBUG 3")

     indx_bool = np.array([True, False, True])
     indx_bool_num = num.array(indx_bool)
@@ -359,7 +364,6 @@ def test():
     z_num[:, indx_bool_num] = 5
     assert np.array_equal(z, z_num)

-    print("IRINA DEBUG 4")
     # i: two bool array of the same shape are passed:
     x = mk_seq_array(
         np,
@@ -388,20 +392,66 @@ def test():
     )
     indx_num = num.array(indx)
     res = x[indx, indx]
-    print("IRINA DEBUG res = ", res.shape)
-    # res_num = x_num[indx_num, indx_num]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[indx_num, indx_num]
+    assert np.array_equal(res, res_num)

+    if LEGATE_MAX_DIM > 4:
+        x = mk_seq_array(
+            np,
+            (
+                3,
+                4,
+                5,
+                3,
+                4,
+            ),
+        )
+        x_num = mk_seq_array(
+            num,
+            (
+                3,
+                4,
+                5,
+                3,
+                4,
+            ),
+        )
+        res = x[indx, 1, indx]
+        res_num = x_num[indx_num, 1, indx_num]
+        assert np.array_equal(res, res_num)
+
+        res = x[indx, :, indx]
+        res_num = x_num[indx_num, :, indx_num]
+        assert np.array_equal(res, res_num)

     # j: 2 bool arrays should be broadcasted:
-    # res = x[idx, [True,False,False]]
-    # res_num = x_num[idx_num, [True,False,False]]
+    x = mk_seq_array(
+        np,
+        (
+            3,
+            4,
+            3,
+            4,
+        ),
+    )
+    x_num = mk_seq_array(
+        num,
+        (
+            3,
+            4,
+            3,
+            4,
+        ),
+    )
+    res = x[indx, [True, False, False]]
+    res_num = x_num[indx_num, [True, False, False]]
+    assert np.array_equal(res, res_num)

     # 2d bool array not at the first index:
     indx = np.full((4, 3), True)
     indx_num = num.array(indx)
     res = x[:, indx]
-    # res_num = x_num[:, indx]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[:, indx_num]
+    assert np.array_equal(res, res_num)

     # 3: testing mixed type of the arguments passed:

@@ -425,8 +475,8 @@ def test():
         ),
     )
     res = x[[1, 1], [False, True, False]]
-    # res_num = x_num[[1,1], [False, True,False]]
-    # assert np.array_equal(res, res_num)
+    res_num = x_num[[1, 1], [False, True, False]]
+    assert np.array_equal(res, res_num)

     res = x[[1, 1], :, [False, True, False, True]]
     res_num = x_num[[1, 1], :, [False, True, False, True]]
@@ -465,7 +515,7 @@ def test():
     res_num = x_num[:, [0, 1], :, 1:]
     assert np.array_equal(res, res_num)

-    # c: transformed base:
+    # c: transformed base or index or rhs:
     z = x[:, 1:]
     z_num = x_num[:, 1:]
     indx = np.array([1, 1])
@@ -474,6 +524,22 @@ def test():
     res_num = z_num[indx_num]
     assert np.array_equal(res, res_num)

+    indx = np.array([1, 1, 0])
+    indx_num = num.array(indx)
+    indx = indx[1:]
+    indx_num = indx_num[1:]
+    res = z[1, indx]
+    res_num = z_num[1, indx_num]
+    assert np.array_equal(res, res_num)
+
+    b = np.ones((2, 3, 6, 5))
+    b_num = num.array(b)
+    b = b.transpose((0, 1, 3, 2))
+    b_num = b_num.transpose((0, 1, 3, 2))
+    z[indx] = b
+    z_num[indx_num] = b_num
+    assert np.array_equal(z, z_num)
+
     # d: shape mismatch case:
     x = np.array(
         [
@@ -536,6 +602,12 @@ def test():
     x_num[ind_num, ind_num] = 5
     assert np.array_equal(x, x_num)

+    b = np.array([1, 2, 3], dtype=np.int16)
+    b_num = num.array(b)
+    x[ind, ind] = b
+    x_num[ind_num, ind_num] = b_num
+    assert np.array_equal(x, x_num)
+
     # we do less than LEGATE_MAX_DIM because the dimension will be increased by
     # 1 when passing 2d index array
     for ndim in range(2, LEGATE_MAX_DIM):
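A small NumPy-level sketch of the indexing rule the transpose logic in patch 27 relies on (illustrative only; the real code works on partitioned Legate stores, not ndarrays): when index arrays are separated by a slice or a scalar, the dimensions they produce move to the front of the result, which is exactly what one gets by first transposing the base array so all indexed dimensions are leading:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    i0 = np.array([0, 1])
    i1 = np.array([1, 2])

    # index arrays separated by ':' -- the broadcast index dims come first
    direct = x[i0, :, i1]                            # shape (2, 3)

    # same result, computed the way the deferred code arranges it:
    # transpose the indexed dims to the front, then index them together
    via_transpose = x.transpose(0, 2, 1)[i0, i1, :]  # shape (2, 3)

    assert np.array_equal(direct, via_transpose)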
From a3979dc41b8a0c784146a0e97a7a7a90da16be8c Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 19 Apr 2022 22:41:29 -0600
Subject: [PATCH 28/33] adding logic for the set_item when the base array was
 transposed internally

---
 cunumeric/deferred.py      | 32 +++++++++++++++++++++++++-------
 tests/advanced_indexing.py |  4 ++++
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 33a3a3b4d..674143438 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -441,7 +441,7 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
         task.execute()
-        return False, rhs, out
+        return False, rhs, out, None

     if isinstance(key, NumPyThunk):
         key = (key,)
@@ -491,6 +491,8 @@ def _create_indexing_array(self, key, is_set=False):
             key_transpose_indices += post_indices
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)
+        else:
+            transpose_indices = None

         shift = 0
         for dim, k in enumerate(key):
@@ -532,12 +534,12 @@ def _create_indexing_array(self, key, is_set=False):

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
-            return True, rhs, output_arr
+            return True, rhs, output_arr, transpose_indices
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
-            return True, rhs, key
+            return True, rhs, key, transpose_indices
         else:
             raise ValueError("Advance indexing dimention mismatch")

@@ -601,7 +603,12 @@ def get_item(self, key):
         # Check to see if this is advanced indexing or not
         if is_advanced_indexing(key):
             # Create the indexing array
-            copy_needed, rhs, index_array = self._create_indexing_array(key)
+            (
+                copy_needed,
+                rhs,
+                index_array,
+                transpose_indices,
+            ) = self._create_indexing_array(key)
             store = rhs.base
             if copy_needed:
                 # Create a new array to be the result
@@ -643,9 +650,12 @@ def set_item(self, key, rhs):
         # Check to see if this is advanced indexing or not
         if is_advanced_indexing(key):
             # Create the indexing array
-            copy_needed, lhs, index_array = self._create_indexing_array(
-                key, True
-            )
+            (
+                copy_needed,
+                lhs,
+                index_array,
+                transpose_indices,
+            ) = self._create_indexing_array(key, True)
             rhs = self.runtime.to_deferred_array(rhs)
             if rhs.shape != index_array.shape:
                 rhs_tmp = rhs._broadcast(index_array.base.shape)
@@ -661,7 +671,15 @@ def set_item(self, key, rhs):
             copy.add_output(lhs.base)
             copy.execute()

+            # todo this copy will be removed when affine copies are
+            # supported in Legion/Realm
             if lhs is not self:
+                # if lhs was transposed in _create_indexing_array
+                # we need to transpose self as well
+                if transpose_indices is not None:
+                    store = self.base
+                    store = store.transpose(transpose_indices)
+                    self = DeferredArray(self.runtime, store, self.dtype)
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index 2bd2c2c16..d0662fbde 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -250,6 +250,10 @@ def test():
     x_num[[0, 1], [0, 1]] = 11
     assert np.array_equal(x, x_num)

+    x[[0, 1], :, [0, 1]] = 11
+    x_num[[0, 1], :, [0, 1]] = 11
+    assert np.array_equal(x, x_num)
+
     # d: newaxis is passed along with array:

     res = x[..., [1, 0]]
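The write-back that patch 28 adds can be pictured with a short NumPy sketch (for intuition only): NumPy's transpose returns a view, so the scatter on the transposed array writes through for free; for deferred stores the same effect needs the explicit transpose of `self` plus a deep copy that the patch performs:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    expected = x.copy()
    expected[[0, 1], :, [0, 1]] = 11     # what the user asked for

    xt = x.transpose(0, 2, 1)            # indexed dims made adjacent
    xt[[0, 1], [0, 1], :] = 11           # scatter on the transposed view
    assert np.array_equal(x, expected)   # the view writes through to x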
From f7990adc9c124dc4b48130117326f0175ff4c4ca Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 10:30:23 -0600
Subject: [PATCH 29/33] making set_item work for the case when any
 transformations are done to the base array internally

---
 cunumeric/deferred.py      | 25 +++++++++++--------------
 tests/advanced_indexing.py | 16 ++++++++++++++--
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 674143438..b34e6cb1f 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -441,7 +441,7 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
         task.execute()
-        return False, rhs, out, None
+        return False, rhs, out, self

     if isinstance(key, NumPyThunk):
         key = (key,)
@@ -491,8 +491,6 @@ def _create_indexing_array(self, key, is_set=False):
             key_transpose_indices += post_indices
             store = store.transpose(transpose_indices)
             key = tuple(key[i] for i in key_transpose_indices)
-        else:
-            transpose_indices = None

         shift = 0
         for dim, k in enumerate(key):
@@ -527,6 +525,11 @@ def _create_indexing_array(self, key, is_set=False):
                     "indexing operation",
                 )
         if store.transformed:
+            # in case this operation is called for the set_item, we need
+            # to apply all the transformations to self as well before
+            # creating a copy
+            if is_set:
+                self = DeferredArray(self.runtime, store, self.dtype)
             # after store is transformed we need to return a copy of
             # the store since Copy operation can't be done on
             # the store with transformation
@@ -534,12 +537,12 @@ def _create_indexing_array(self, key, is_set=False):

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
-            return True, rhs, output_arr, transpose_indices
+            return True, rhs, output_arr, self
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
-            return True, rhs, key, transpose_indices
+            return True, rhs, key, self
         else:
             raise ValueError("Advance indexing dimention mismatch")

@@ -607,7 +610,7 @@ def get_item(self, key):
                 copy_needed,
                 rhs,
                 index_array,
-                transpose_indices,
+                self,
             ) = self._create_indexing_array(key)
             store = rhs.base
             if copy_needed:
@@ -654,7 +657,7 @@ def set_item(self, key, rhs):
                 copy_needed,
                 lhs,
                 index_array,
-                transpose_indices,
+                self,
             ) = self._create_indexing_array(key, True)
             rhs = self.runtime.to_deferred_array(rhs)
             if rhs.shape != index_array.shape:
@@ -673,13 +676,7 @@ def set_item(self, key, rhs):

             # todo this copy will be removed when affine copies are
             # supported in Legion/Realm
-            if lhs is not self:
-                # if lhs was transposed in _create_indexing_array
-                # we need to transpose self as well
-                if transpose_indices is not None:
-                    store = self.base
-                    store = store.transpose(transpose_indices)
-                    self = DeferredArray(self.runtime, store, self.dtype)
+            if lhs is not self or self.base.transformed:
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index d0662fbde..b70ca512e 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -250,8 +250,16 @@ def test():
     x_num[[0, 1], [0, 1]] = 11
     assert np.array_equal(x, x_num)

-    x[[0, 1], :, [0, 1]] = 11
-    x_num[[0, 1], :, [0, 1]] = 11
+    x[[0, 1], :, [0, 1]] = 12
+    x_num[[0, 1], :, [0, 1]] = 12
     assert np.array_equal(x, x_num)

+    x[[0, 1], 1:3, [0, 1]] = 3.5
+    x_num[[0, 1], 1:3, [0, 1]] = 3.5
+    assert np.array_equal(x, x_num)
+
+    x[1:2, :, [0, 1]] = 7
+    x_num[1:2, :, [0, 1]] = 7
+    assert np.array_equal(x, x_num)
+
     # d: newaxis is passed along with array:

@@ -260,6 +268,10 @@ def test():
     res = x[..., [1, 0]]
     res_num = x_num[..., [1, 0]]
     assert np.array_equal(res, res_num)

+    x[..., [1, 0]] = 8
+    x_num[..., [1, 0]] = 8
+    assert np.array_equal(x, x_num)
+
     xt = x.transpose(
         (
             1,
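Patch 29 above replaces the transpose bookkeeping with a simpler rule: if `store` picked up any transformation, rebuild `self` from the transformed store and let the final deep copy write the result back. A rough sketch of that copy-back pattern, with a hypothetical helper name and plain ndarrays standing in for stores:

    import numpy as np

    def scatter_via_temp(view, idx, value):
        # a transformed store cannot be the target of a Copy operation,
        # so scatter into a packed temporary and copy it back explicitly
        tmp = np.ascontiguousarray(view)
        tmp[idx] = value
        view[...] = tmp

    z = np.arange(12).reshape(3, 4)
    zv = z[:, 1:]                        # a transformed view of the base
    scatter_via_temp(zv, np.array([1, 1]), 10)
    assert (z[1, 1:] == 10).all()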
From a444581894a951e6b4717df1d20b4f3ad7d936bf Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 10:58:32 -0600
Subject: [PATCH 30/33] some code clean-up + documentation

---
 cunumeric/deferred.py      | 30 +++++++++++++------------------
 tests/advanced_indexing.py |  2 ++
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index b34e6cb1f..774a910bf 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -314,7 +314,7 @@ def _zip_indices(self, start_index, arrays):
             start_index = 0

         new_arrays = tuple()
-        # check array's type and converting them to deferred arrays
+        # check array's type and convert them to deferred arrays
         for a in arrays:
             a = self.runtime.to_deferred_array(a)
             data_type = a.dtype
@@ -429,6 +429,9 @@ def _create_indexing_array(self, key, is_set=False):
             key = self.runtime.to_deferred_array(key)
             out_dtype = rhs.dtype
+            # in case this operation is called for the set_item, we
+            # return Point type field that is later used for
+            # indirect copy operation
             if is_set:
                 N = rhs.ndim
                 out_dtype = rhs.runtime.get_point_type(N)
@@ -456,8 +459,6 @@ def _create_indexing_array(self, key, is_set=False):
         transpose_needed = False
         transpose_indices = tuple()
         key_transpose_indices = tuple()
-        # since we can't call Copy operation on transformed Store, after
-        # the transformation, we need to return a copy
         tuple_of_arrays = ()

         # First, we need to check if transpose is needed
@@ -507,11 +508,12 @@ def _create_indexing_array(self, key, is_set=False):
                 if not isinstance(key, DeferredArray):
                     k = self.runtime.to_deferred_array(k)
                 if k.dtype == np.bool:
-                    if k.shape[0] != store.shape[dim + shift]:
-                        raise ValueError(
-                            "shape of boolean index did not match "
-                            "indexed array "
-                        )
+                    for i in range(k.ndim):
+                        if k.shape[i] != store.shape[dim + i + shift]:
+                            raise ValueError(
+                                "shape of boolean index did not match "
+                                "indexed array "
+                            )
                     # in case of the mixed indices we call nonzero
                     # for the bool array
                     k = k.nonzero()
@@ -525,9 +527,9 @@ def _create_indexing_array(self, key, is_set=False):
                     "indexing operation",
                 )
         if store.transformed:
-            # in case this operation is called for the set_item, we need
-            # to apply all the transformations to self as well before
-            # creating a copy
+            # in the case this operation is called for the set_item, we need
+            # to apply all the transformations done to `store` to `self`
+            # as well before creating a copy
             if is_set:
                 self = DeferredArray(self.runtime, store, self.dtype)
             # after store is transformed we need to return a copy of
             # the store since Copy operation can't be done on
             # the store with transformation

         if len(tuple_of_arrays) <= rhs.ndim and rhs.ndim > 1:
             output_arr = rhs._zip_indices(start_index, tuple_of_arrays)
             return True, rhs, output_arr, self
         elif len(tuple_of_arrays) == 1 and rhs.ndim == 1:
             key = tuple_of_arrays[0]
+            # when key is transformed, we need to return a copy so that
+            # it can be used as an indirection in the copy operation
             if key.base.transformed:
                 key, key_store = key._copy_store(key.base)
             return True, rhs, key, self
         else:
-            raise ValueError("Advance indexing dimention mismatch")
+            raise ValueError("Advanced indexing dimention mismatch")

     @staticmethod
     def _unpack_ellipsis(key, ndim):
@@ -674,7 +678,7 @@ def set_item(self, key, rhs):
             copy.add_output(lhs.base)
             copy.execute()

-            # todo this copy will be removed when affine copies are
+            # TODO this copy will be removed when affine copies are
             # supported in Legion/Realm
             if lhs is not self or self.base.transformed:
                 self.copy(lhs, deep=True)

         else:
diff --git a/tests/advanced_indexing.py b/tests/advanced_indexing.py
index b70ca512e..4ef35de70 100644
--- a/tests/advanced_indexing.py
+++ b/tests/advanced_indexing.py
@@ -431,10 +431,12 @@ def test():
             4,
         ),
     )
+    # 2 bool arrays separated by scalar
     res = x[indx, 1, indx]
     res_num = x_num[indx_num, 1, indx_num]
     assert np.array_equal(res, res_num)

+    # 2 bool arrays separated by :
     res = x[indx, :, indx]
     res_num = x_num[indx_num, :, indx_num]
     assert np.array_equal(res, res_num)

From 8796301b18a51199daeb9765b7df02d8974d99f6 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 20 Apr 2022 21:37:20 -0600
Subject: [PATCH 31/33] fixing some small issues

---
 cunumeric/deferred.py      | 14 ++++++++++----
 src/cunumeric/index/zip.cu | 10 +++++-----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 33fa69ebb..1c6d39266 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -443,6 +443,12 @@ def _create_indexing_array(self, key, is_set=False):
         task.add_input(key.base)
         task.add_scalar_arg(is_set, bool)
         task.add_alignment(rhs.base, key.base)
+        task.add_broadcast(
+            key.base, axes=tuple(range(1, len(key.base.shape)))
+        )
+        task.add_broadcast(
+            rhs.base, axes=tuple(range(1, len(rhs.base.shape)))
+        )
         task.execute()
         return False, rhs, out, self

@@ -477,7 +483,7 @@ def _create_indexing_array(self, key, is_set=False):
                         transpose_indices += (shift + i,)
                     shift += k.ndim - 1
                 else:
-                    transpose_indices += (dim,)
+                    transpose_indices += ((dim + shift),)
                 last_index = dim

@@ -548,7 +554,7 @@ def _create_indexing_array(self, key, is_set=False):
                 key, key_store = key._copy_store(key.base)
             return True, rhs, key, self
         else:
-            raise ValueError("Advanced indexing dimention mismatch")
+            raise ValueError("Advanced indexing dimension mismatch")

     @staticmethod
     def _unpack_ellipsis(key, ndim):
@@ -663,7 +669,7 @@ def set_item(self, key, rhs):
                 index_array,
                 self,
             ) = self._create_indexing_array(key, True)
-            rhs = self.runtime.to_deferred_array(rhs)
+
             if rhs.shape != index_array.shape:
                 rhs_tmp = rhs._broadcast(index_array.base.shape)
                 rhs_tmp, rhs = rhs._copy_store(rhs_tmp)
@@ -680,7 +686,7 @@ def set_item(self, key, rhs):

             # TODO this copy will be removed when affine copies are
             # supported in Legion/Realm
-            if lhs is not self or self.base.transformed:
+            if lhs is not self:
                 self.copy(lhs, deep=True)

         else:
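On the two add_broadcast calls in patch 31 above: assuming they follow the usual Legate convention where the listed axes are kept whole, the boolean-mask task's key and input stores may then only be partitioned along their first dimension, so corresponding chunks stay together. A tiny sketch of the axes computation (hypothetical standalone function, same expression as in the diff):

    def broadcast_axes(ndim):
        # keep every axis except the first unpartitioned
        return tuple(range(1, ndim))

    assert broadcast_axes(3) == (1, 2)
    assert broadcast_axes(1) == ()

The zip.cu changes that follow in patches 31 through 33 then widen the kernel size arguments, switch the kernels to the task's cached CUDA stream with a debug-build stream check, and adjust how the staging buffers in zero-copy memory are created.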
diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index c130b9bb4..f748b95d8 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -28,7 +28,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
              const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
-             int volume,
+             size_t volume,
              std::index_sequence<Is...>)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -44,7 +44,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel_dense(Point<N>* out,
                    const DeferredBuffer<const int64_t*, 1> index_arrays,
                    const Rect<DIM> rect,
-                   int volume,
+                   size_t volume,
                    std::index_sequence<Is...>)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,9 +61,9 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
              int narrays,
-             int volume,
-             int key_dim,
-             int start_index)
+             size_t volume,
+             int64_t key_dim,
+             int64_t start_index)
 {
   const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= volume) return;

From 30539a92a57191a2fb0a2d9ac7dbd0c2f2ac8924 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 21 Apr 2022 10:22:07 -0600
Subject: [PATCH 32/33] adding debugging checks for cuda task variants

---
 src/cunumeric/index/advanced_indexing.cu |  8 ++---
 src/cunumeric/index/zip.cu               | 38 +++++++++++++-----------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu
index c78fec4cf..ebfca1971 100644
--- a/src/cunumeric/index/advanced_indexing.cu
+++ b/src/cunumeric/index/advanced_indexing.cu
@@ -143,10 +143,9 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM> {
     size_t size           = 0;
     const bool* index_ptr = index.ptr(rect_index);
     const size_t volume   = rect_index.volume();
-    cudaStream_t stream;
-    cudaStreamCreate(&stream);
-    auto offsets = create_buffer<int64_t>(volume, Memory::Kind::GPU_FB_MEM);
-    size         = compute_size(index, pitches_index, rect_index, volume, stream, offsets);
+    auto stream  = get_cached_stream();
+    auto offsets = create_buffer<int64_t>(volume, Memory::Kind::GPU_FB_MEM);
+    size         = compute_size(index, pitches_index, rect_index, volume, stream, offsets);

     out = create_buffer<VAL>(size, Memory::Kind::GPU_FB_MEM);
     // populate output
@@ -162,6 +161,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM> {
                                                rect_index.lo,
                                                offsets);
     }
+    CHECK_CUDA_STREAM(stream);
     return size;
   }
 };
diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index f748b95d8..5a18f776d 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -25,7 +25,7 @@ using namespace Legion;
 template <int N, int DIM, size_t... Is>
 __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel(const AccessorWO<Point<N>, DIM> out,
-             const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
+             const Buffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
              size_t volume,
@@ -42,7 +42,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel_dense(Point<N>* out,
-                   const DeferredBuffer<const int64_t*, 1> index_arrays,
+                   const Buffer<const int64_t*, 1> index_arrays,
                    const Rect<DIM> rect,
                    size_t volume,
                    std::index_sequence<Is...>)
@@ -57,7 +57,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   zip_kernel(const AccessorWO<Point<N>, DIM> out,
-             const DeferredBuffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
+             const Buffer<AccessorRO<int64_t, DIM>, 1> index_arrays,
              const Rect<DIM> rect,
              const Pitches<DIM - 1> pitches,
@@ -92,35 +92,37 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
                   const int64_t start_index,
                   std::index_sequence<Is...>) const
   {
+    auto stream         = get_cached_stream();
     const size_t volume = rect.volume();
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     if (index_arrays.size() == N) {
       if (dense) {
-        DeferredBuffer<const int64_t*, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                  Rect<1>(0, index_arrays.size() - 1));
+        auto index_buf = create_buffer<const int64_t*, 1>(
+          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) {
-          idx_arr[idx] = index_arrays[idx].ptr(rect);
+          index_buf[idx] = index_arrays[idx].ptr(rect);
         }
-        zip_kernel_dense<<<blocks, THREADS_PER_BLOCK>>>(
-          out.ptr(rect), idx_arr, rect, volume, std::make_index_sequence<N>());
+        zip_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+          out.ptr(rect), index_buf, rect, volume, std::make_index_sequence<N>());
       } else {
-        DeferredBuffer<AccessorRO<int64_t, DIM>, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                            Rect<1>(0, index_arrays.size() - 1));
-        for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx];
-        zip_kernel<<<blocks, THREADS_PER_BLOCK>>>(
-          out, idx_arr, rect, pitches, volume, std::make_index_sequence<N>());
+        auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
+          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
+        zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+          out, index_buf, rect, pitches, volume, std::make_index_sequence<N>());
       }
     } else {
 #ifdef DEBUG_CUNUMERIC
       assert(index_arrays.size() < N);
 #endif
-      DeferredBuffer<AccessorRO<int64_t, DIM>, 1> idx_arr(Memory::Kind::Z_COPY_MEM,
-                                                          Rect<1>(0, index_arrays.size() - 1));
-      for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) idx_arr[idx] = index_arrays[idx];
+      auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
+        index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+      for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
       int num_arrays = index_arrays.size();
-      zip_kernel<<<blocks, THREADS_PER_BLOCK>>>(
-        out, idx_arr, rect, pitches, num_arrays, volume, key_dim, start_index);
+      zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        out, index_buf, rect, pitches, num_arrays, volume, key_dim, start_index);
     }
+    CHECK_CUDA_STREAM(stream);
   }
 };

From 8704cb02ff139ee2a3c57568f073ec710da2df3a Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Thu, 21 Apr 2022 11:16:28 -0600
Subject: [PATCH 33/33] removing explicit alignment from the buffers

---
 src/cunumeric/index/zip.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index 5a18f776d..ae82e7d10 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -97,16 +97,16 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     if (index_arrays.size() == N) {
       if (dense) {
-        auto index_buf = create_buffer<const int64_t*, 1>(
-          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        auto index_buf =
+          create_buffer<const int64_t*, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) {
           index_buf[idx] = index_arrays[idx].ptr(rect);
         }
         zip_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
           out.ptr(rect), index_buf, rect, volume, std::make_index_sequence<N>());
       } else {
-        auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
-          index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+        auto index_buf =
+          create_buffer<AccessorRO<int64_t, DIM>, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
         for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
         zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
           out, index_buf, rect, pitches, volume, std::make_index_sequence<N>());
@@ -115,8 +115,8 @@ struct ZipImplBody<VariantKind::GPU, DIM, N> {
 #ifdef DEBUG_CUNUMERIC
       assert(index_arrays.size() < N);
 #endif
-      auto index_buf = create_buffer<AccessorRO<int64_t, DIM>, 1>(
-        index_arrays.size(), Memory::Kind::Z_COPY_MEM, 128 /*alignment*/);
+      auto index_buf =
+        create_buffer<AccessorRO<int64_t, DIM>, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
       for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
       int num_arrays = index_arrays.size();
       zip_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(