Support nested SDFGs in distributed lowering

This allows us to support reductions with their initialization states. The idea is that nested SDFGs are required to be scheduled such that there is no communication within them. The user passes the schedules for each map, and the implied communication constraints are then checked for consistency. Keeping communication out of the nested SDFGs means that there is no communication between things like reduction buffer initialization, and also means that all global communication is kept top-level, where it is easier to optimize. Pull Request: #123
spcl · Aug 20, 2022 · e0bc48b · e0bc48b
1 parent 201c7dc
commit e0bc48b
Show file tree

Hide file tree

Showing 8 changed files with 547 additions and 160 deletions.
diff --git a/.github/workflows/cpu-ci.yml b/.github/workflows/cpu-ci.yml
@@ -58,7 +58,7 @@ jobs:
     - name: Test with pytest
       env:
         ORT_RELEASE: ${{ github.workspace }}/onnxruntime-daceml-patched
-        PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc -m "not fpga and not xilinx and not gpu and not onnx" --timeout=500
+        PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc -m "not fpga and not xilinx and not gpu and not onnx and not mpi" --timeout=500
       run: make test
 
     - name: Test with doctest
@@ -95,7 +95,7 @@ jobs:
 
     - name: Test with pytest
       env:
-        PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc -m "not fpga and not xilinx and not gpu and not onnx" --timeout=500 --skip-ort
+        PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc -m "not fpga and not xilinx and not gpu and not onnx and not mpi" --timeout=500 --skip-ort
       run: make test
 
     - name: Upload coverage

diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
@@ -29,7 +29,7 @@ jobs:
 
       - name: Test with pytest
         env:
-          PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc --gpu-only -m "not slow and not fpga and not xilinx and not onnx" --timeout=500
+          PYTEST_ARGS: --cov=daceml --cov-report=term --cov-report xml --cov-config=.coveragerc --gpu-only -m "not slow and not fpga and not xilinx and not onnx and not mpi" --timeout=500
         run: make test
 
       - name: Upload coverage

diff --git a/daceml/distributed/communication/subarrays.py b/daceml/distributed/communication/subarrays.py
@@ -137,6 +137,7 @@ def compute_scatter_color(parent_grid_variables: List[symbolic.symbol],
 def try_construct_subarray(
         sdfg: SDFG, state: SDFGState, pgrid_name: str, global_desc: data.Data,
         subset: subsets.Range, grid_variables: List[symbolic.symbol],
+        scatter: bool,
         dry_run: bool) -> Optional[Tuple[str, str, Optional[str]]]:
     """
     Try to convert the given end of the distributed memlet to a subarray,
@@ -151,6 +152,7 @@ def try_construct_subarray(
     :param subset: The end of the distributed memlet to convert.
     :param grid_variables: The process grid corresponding to the computation to
                            which this is either an input or an output.
+    :param scatter: True if this is for a scatter.
     :param dry_run: If True, don't actually create the grids and subarray.
                     Instead, return None.
     :return: The name of the subarray, the name of the scatter grid and the
@@ -182,13 +184,15 @@ def try_construct_subarray(
         bcast_shape = subgrid_shape(bcast_color)
 
         if not dry_run:
-            scatter_grid_name = sdfg.add_pgrid(shape=scatter_shape,
-                                               parent_grid=pgrid_name,
-                                               color=scatter_color)
+            scatter_grid_name = sdfg.add_pgrid(
+                shape=scatter_shape,
+                parent_grid=pgrid_name,
+                color=scatter_color,
+                exact_grid=None if scatter else 0)
+
             bcast_grid_name = sdfg.add_pgrid(shape=bcast_shape,
                                              parent_grid=pgrid_name,
                                              color=bcast_color)
-
             for name, shape in ((scatter_grid_name, scatter_shape),
                                 (bcast_grid_name, bcast_shape)):
                 distr_utils.initialize_fields(state, [
@@ -280,6 +284,7 @@ def can_be_applied(self, state: SDFGState, *_, **__):
                                        garr,
                                        node.src_subset,
                                        src_vars,
+                                       False,
                                        dry_run=True)
 
             if node.dst_pgrid is not None:
@@ -290,6 +295,7 @@ def can_be_applied(self, state: SDFGState, *_, **__):
                                        garr,
                                        node.dst_subset,
                                        dst_vars,
+                                       False,
                                        dry_run=True)
         except CommunicationSolverException:
             return False
@@ -329,7 +335,14 @@ def expansion(node: 'DistributedMemlet', state: SDFGState, sdfg: SDFG):
                 rvars = src_vars
 
             subarray_name, scatter_grid, bcast_grid = try_construct_subarray(
-                sdfg, state, pgrid_name, garr, subset, rvars, dry_run=False)
+                sdfg,
+                state,
+                pgrid_name,
+                garr,
+                subset,
+                rvars,
+                scatter,
+                dry_run=False)
 
             if scatter:
                 expansion = mpi.BlockScatter(node.label,
@@ -339,7 +352,7 @@ def expansion(node: 'DistributedMemlet', state: SDFGState, sdfg: SDFG):
             else:
                 expansion = mpi.BlockGather(node.label,
                                             subarray_type=subarray_name,
-                                            gather_grid=pgrid_name,
+                                            gather_grid=scatter_grid,
                                             reduce_grid=bcast_grid)
 
             # clean up connectors to match the new node