Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slicing-based np.block implementation #306

Merged
merged 4 commits into from
May 16, 2022
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 95 additions & 36 deletions cunumeric/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import math
import re
from collections import Counter
from functools import wraps
from functools import reduce as functools_reduce, wraps
from inspect import signature
from itertools import chain
from operator import iconcat
from typing import Optional, Set

import numpy as np
Expand Down Expand Up @@ -1230,9 +1231,10 @@ def check_list_depth(arr, prefix=(0,)):
f"List at arrays{convert_to_array_form(prefix)} cannot be empty"
)

depths = [
depths = list(
magnatelee marked this conversation as resolved.
Show resolved Hide resolved
check_list_depth(each, prefix + (idx,)) for idx, each in enumerate(arr)
]
)

if len(set(depths)) != 1: # this should be one
# If we're here elements don't have the same depth
first_depth = depths[0]
Expand Down Expand Up @@ -1285,26 +1287,71 @@ def check_shape_dtype(
return converted, ArrayInfo(ndim, shape, dtype)


def _block(arr, cur_depth, depth):
def _block_collect_slices(arr, cur_depth, depth):
# collects slices for each array in `arr`
# the last outcome will be slices on every dimension of the output array
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"last outcome" => "outcome"

# for each array in `arr`
if cur_depth < depth:
inputs = list(_block(each, cur_depth + 1, depth) for each in arr)
sublist_results = list(
_block_collect_slices(each, cur_depth + 1, depth) for each in arr
)
arrays, outshape_list, slices = zip(*sublist_results)
magnatelee marked this conversation as resolved.
Show resolved Hide resolved
max_ndim = _builtin_max(
1 + (depth - cur_depth), *(len(each) for each in outshape_list)
)
outshape_list = list(
((1,) * (max_ndim - len(each)) + tuple(each))
for each in outshape_list
)
leading_dim = _builtin_sum(
each[-1 + (cur_depth - depth)] for each in outshape_list
)

# flatten the arrays from sublists
arrays = functools_reduce(iconcat, arrays)
# prepares the out_shape of the current list
out_shape = list(outshape_list[0])
out_shape[-1 + cur_depth - depth] = leading_dim
offset = 0
updated_slices = []
# update the dimension in each slice for the current axis
for shape, slice_list in zip(outshape_list, slices):
cur_dim = shape[-1 + cur_depth - depth]
updated_slices.append(
list(
(slice(offset, offset + cur_dim),) + each
for each in slice_list
)
)
offset += cur_dim
# flatten the slices
slices = functools_reduce(iconcat, updated_slices)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess you could use itertools.chain (e.g. chain.from_iterable) instead.

else:
inputs = list(convert_to_cunumeric_ndarray(inp) for inp in arr)
arrays = list(convert_to_cunumeric_ndarray(inp) for inp in arr)
if len(arr) > 1:
arrays, common_info = check_shape_dtype(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you had a chance to do this shape checking and input conversion in check_list_depth. Why are you doing this checking again here? Is it simply because you want to reuse check_shape_dtype to construct an ArrayInfo?

arrays, block.__name__, axis=-1
)
else:
common_info = ArrayInfo(
arrays[0].ndim, arrays[0].shape, arrays[0].dtype
)
# the initial slices for each arr on arr.shape[-1]
out_shape, slices = _concatenate(
arrays, axis=-1, common_info=common_info, slicing_only=True
)
return arrays, out_shape, slices

# this reshape of elements could be replaced
# w/ np.atleast_*d when they're implemented
# Computes the maximum number of dimensions for the concatenation
max_ndim = _builtin_max(
1 + (depth - cur_depth), *(inp.ndim for inp in inputs)
)
# Append leading 1's to make elements to have the same 'ndim'
reshaped = list(
inp.reshape((1,) * (max_ndim - inp.ndim) + inp.shape)
if max_ndim > inp.ndim
else inp
for inp in inputs
)
return concatenate(reshaped, axis=-1 + (cur_depth - depth))

def _block_slicing(arr, depth):
Copy link
Contributor

@magnatelee magnatelee May 12, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd use arrays instead of arr as in the caller. arr sounds like a single ndarray, although it's really a list of arrays.

# collects the final slices of input arrays and assign them at once
arrays, out_shape, slices = _block_collect_slices(arr, 1, depth)
out_array = ndarray(shape=out_shape, inputs=arrays)

for dest, inp in zip(slices, arrays):
out_array[(Ellipsis,) + tuple(dest)] = inp

return out_array


def _concatenate(
Expand All @@ -1314,33 +1361,45 @@ def _concatenate(
dtype=None,
casting="same_kind",
common_info=None,
slicing_only=False,
):
if axis < 0:
axis += len(common_info.shape)
magnatelee marked this conversation as resolved.
Show resolved Hide resolved
leading_dim = _builtin_sum(arr.shape[axis] for arr in inputs)
out_shape = list(common_info.shape)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, common_info seems to be optional, but you always take the shape of it. If it is never a None, then please turn it into a positional argument to document the behavior.

out_shape[axis] = leading_dim

out_array = ndarray(
shape=out_shape, dtype=common_info.dtype, inputs=inputs
)
if slicing_only:
slices = []
out_array = (out_shape, slices)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is both a very bad naming (since out_array isn't an array!) and also untyped coding, as _concatenate returns either a pair of a shape and a list or an ndarray. I think you should extract out the common code and separate the function for slicing_only=True from that for slicing_only=False.

else:
if out is None:
out_array = ndarray(
shape=out_shape, dtype=common_info.dtype, inputs=inputs
)
else:
out = convert_to_cunumeric_ndarray(out)
if not isinstance(out, ndarray):
raise TypeError("out should be ndarray")
elif list(out.shape) != out_shape:
raise ValueError(
f"out.shape({out.shape}) is not matched "
f"to the result shape of concatenation ({out_shape})"
)
out_array = out

# Copy the values over from the inputs
offset = 0
idx_arr = []
for i in range(0, axis):
idx_arr.append(slice(out_shape[i]))

idx_arr.append(0)

for i in range(axis + 1, common_info.ndim):
idx_arr.append(slice(out_shape[i]))

post_idx = (slice(None, None, None),) * len(out_shape[axis + 1 :])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

slice(None) would be sufficient.

# Copy the values over from the inputs
for inp in inputs:
if inp.size > 0:
idx_arr[axis] = slice(offset, offset + inp.shape[axis])
out_array[tuple(idx_arr)] = inp
idx_arr = (slice(offset, offset + inp.shape[axis]),) + post_idx
if slicing_only:
slices.append(idx_arr)
else:
out_array[(Ellipsis,) + idx_arr] = inp
offset += inp.shape[axis]

return out_array


Expand Down Expand Up @@ -1437,7 +1496,7 @@ def block(arrays):
# check if the 'arrays' is a balanced tree
depth = check_list_depth(arrays)

result = _block(arrays, 1, depth)
result = _block_slicing(arrays, depth)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think we should always use the slicing implementation? I know you were doing some experiments to figure out a reasonable threshold to toggle the behavior and if so, please leave a TODO comment.

return result


Expand Down