[microNPU] Add various options to the cascader (apache#10509)
* [microNPU] Added options to Cascader

* Added an option to toggle multi-dimensional striping; it is disabled
  by default because it has a very high computational cost.
  Single-dimension striping provides most of the benefit at a greatly
  reduced cost.
* Added multiple developer/debugging options prefixed with 'dev_'.
  Also added these options to tvmc.
* Added cascader logging; if enabled, it dumps information about the
  cascader proposals to a 'cascader_log.json' file.

Co-authored-by: Matthew Barrett <[email protected]>
Change-Id: I2ec59ae0bd84b73b2cc4bc56d39e3831b0aeec27

* Updated memory_reduction testcases

Also added enable_striping to plan_generator.h

Change-Id: I496b30ed6af6f0730087329cd81a69c5040a5e4d

Co-authored-by: Matthew Barrett <[email protected]>
2 people authored and Sergey Shtin committed May 17, 2022
1 parent 1d8177a commit 4f4abac
Showing 25 changed files with 640 additions and 80 deletions.
1 change: 1 addition & 0 deletions python/tvm/contrib/ethosu/cascader/__init__.py
@@ -37,4 +37,5 @@
from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig
from .plan import Plan
from .scheduler import apply_proposal, cascade, extract_memory_info
from .logging import Logging
from .cascader_options import CascaderOptions
12 changes: 12 additions & 0 deletions python/tvm/contrib/ethosu/cascader/block_config.py
@@ -55,5 +55,17 @@ def compute_cycles(self) -> int:
def output_cycles(self) -> int:
return int(self._output_cycles)

def __ge__(self, other: "BlockConfig"):
if len(self.output_shape) != len(other.output_shape):
return False

return all(a >= b for a, b in zip(self.output_shape, other.output_shape))

def __lt__(self, other: "BlockConfig"):
if len(self.output_shape) != len(other.output_shape):
return False

return other >= self

def __repr__(self) -> str:
return f"BlockConfig(output_shape={self.output_shape})"
24 changes: 24 additions & 0 deletions python/tvm/contrib/ethosu/cascader/cascader_options.py
@@ -38,8 +38,20 @@ class CascaderOptions(Object):
How many striping factors to try per axis.
max_plan_size : int
The maximum number of Parts in a Plan.
max_open_plans : int
The maximum number of open Plans to keep after culling.
max_closed_plans : int
        The maximum number of closed Plans to keep after culling.
always_copy_size : int
The maximum size of a Tensor that will always be copied into the cascade region.
    disable_pareto_plans : bool
        Disable Pareto culling for Plans.
    disable_pareto_proposals : bool
        Disable Pareto culling for Proposals.
enable_multi_dimensional_striping : bool
Enable striping in multiple dimensions simultaneously.
disable_block_culling : bool
Disable culling of block configs.
enable_striping : bool
        A boolean option to enable striping.
@@ -51,7 +63,13 @@ def __init__(
max_proposals: int,
stripe_factors: int,
max_plan_size: int,
max_open_plans: int,
max_closed_plans: int,
always_copy_size: int,
disable_pareto_plans: bool = False,
disable_pareto_proposals: bool = False,
enable_multi_dimensional_striping: bool = False,
disable_block_culling: bool = True,
enable_striping: bool = False,
):
self.__init_handle_by_constructor__(
@@ -60,6 +78,12 @@
max_proposals,
stripe_factors,
max_plan_size,
max_open_plans,
max_closed_plans,
always_copy_size,
disable_pareto_plans,
disable_pareto_proposals,
enable_multi_dimensional_striping,
disable_block_culling,
enable_striping,
)
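For illustration (not part of the diff): a sketch of constructing CascaderOptions with the new parameters, assuming an Ethos-U-enabled TVM build. The first argument (the cascade MemoryRegion) is not shown in this hunk; here `sram` stands for a MemoryRegion built elsewhere (for example via extract_memory_info), and the numeric values are arbitrary.

from tvm.contrib.ethosu.cascader import CascaderOptions

options = CascaderOptions(
    sram,  # MemoryRegion to place cascading buffers into, built elsewhere
    max_proposals=64,
    stripe_factors=5,
    max_plan_size=10,
    max_open_plans=8,
    max_closed_plans=32,
    always_copy_size=1024,
    disable_pareto_plans=False,
    disable_pareto_proposals=False,
    enable_multi_dimensional_striping=False,  # single-dimension striping only
    disable_block_culling=True,
    enable_striping=True,
)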
83 changes: 69 additions & 14 deletions python/tvm/contrib/ethosu/cascader/device_config.py
@@ -15,12 +15,15 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
# pylint: disable=too-many-nested-blocks
"""Device config class to hold information about the target hardware"""
from typing import Tuple, List, Dict, Optional
from functools import reduce

import math
import numpy as np

import tvm
from . import BlockConfig
from . import StripeConfig
from . import Propagator
@@ -64,13 +67,14 @@ def as_list(self):
class EthosuDeviceConfig:
"""Arm(R) Ethos(TM)-U NPU config class"""

def __init__(self, device: str):
    def __init__(self, device: str, disable_block_culling: bool = False):
self._device = device
self._subkernel_limits = (8, 8)
self._output_cycles = (1, 2, 3, 4, 6)
self._split_depth = 16
self._max_block_shape = _Shape([1, 32, 64, 128])
self._bank_size_bytes = 1024
        self._disable_block_culling = disable_block_culling
if self._device == "ethos-u55-256":
self._micro_block = _Shape([1, 2, 2, 8])
self._input_micro_block = _Shape([1, 2, 2, 8])
@@ -508,6 +512,28 @@ def get_elementwise_block_config(
if activation == "LUT" and not self._lut_reserved:
banks_available -= 2

# Handle user-forced block config
options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
if options and options.dev_force_block_config:
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
assert len(block_config) == 3
if output_layout == "NHWC":
block_shape = [output_shape[0], block_config[0], block_config[1], block_config[2]]
else:
block_shape = [
output_shape[0],
block_config[0],
1 + ((block_config[2] - 1) // 16),
block_config[1],
16,
]
output_cycles = self._get_output_cycles(
op_type, op_str, ifm_dtype, ofm_dtype, activation
)
output_cycles *= reduce(lambda a, b: a * b, block_shape, 1)
output_cycles = int(math.ceil(output_cycles))
return [BlockConfig(block_shape, block_shape, 0, output_cycles)]

# Split the block in half until it fits into SHRAM
max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
if output_layout == "NHCWB16":
@@ -666,6 +692,21 @@ def get_valid_block_configs(
max_depth = min(ofm_channels, self._max_block_shape.depth)
min_depth = max(self._micro_block.depth, upscaling_factor)

heights = range(min_height, max_height + min_height, min_height)
widths = range(min_width, max_width + min_width, min_width)
depths = range(min_depth, max_depth + min_depth, min_depth)

# Handle user-forced block config
options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
forced = False
if options and options.dev_force_block_config:
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
assert len(block_config) == 3
heights = [block_config[0]]
widths = [block_config[1]]
depths = [block_config[2]]
forced = True

input_bytewidth = 1 if ifm_dtype == "int8" else 2
acc_bytewidth = self._get_accumulator_width(op_type, ifm_dtype)
banks_available = self._total_banks - self._reserved_banks
@@ -681,26 +722,24 @@
else:
input_block_depth = min(ifm_channels, 32)

for depth in range(min_depth, max_depth + min_depth, min_depth):
if (depth < output_shape.depth) and (depth % self._split_depth != 0):
for depth in reversed(depths):
if (depth < output_shape.depth) and (depth % self._split_depth != 0) and not forced:
# Block depth has to be less than full depth or a multiple of the split depth
continue

subkernel_propagator = self._get_subkernel_propagator(
op_attrs, ifm_propagator, input_layout, output_layout, depth
)

for width in range(min_width, max_width + min_width, min_width):
for height in range(min_height, max_height + min_height, min_height):
for width in reversed(widths):
for height in reversed(heights):
if output_layout == "NHCWB16":
output_block = (
1,
height,
1 + ((depth - 1) // 16),
width,
_round_up(
min(16, max(ofm_channels, min_depth)), self._micro_block.depth
),
min(16, _round_up(ofm_channels, self._micro_block.depth)),
)
order = [1, 2, 4, 3, 0]
else:
@@ -740,7 +779,7 @@ def get_valid_block_configs(
output_cycles = self._get_output_cycles(
op_type, op_str, ifm_dtype, ofm_dtype, activation
)
output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
output_cycles *= np.prod(output_block).tolist()
output_cycles = int(math.ceil(output_cycles))
compute_cycles = self._estimate_compute_cycles_per_block(
op_type,
@@ -755,11 +794,27 @@
block_config = BlockConfig(
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
)
valid_block_configs.append(block_config)
else:
# Block config does not fit into SHRAM
# Any Block config that is strictly larger than this one will also fail
break

if self._disable_block_culling:
# Block culling disabled - add all block configs that fit
valid_block_configs.append(block_config)
else:
# Add block config only if it's not dominated by an existing block.
                            # A block config is dominated by another if the other's output_shape
                            # is greater than or equal in every dimension.
dominated = False
for valid_block in valid_block_configs:
if block_config < valid_block:
dominated = True
break

if not dominated:
valid_block_configs.append(block_config)

# Every consecutive block in the innermost loop will be dominated by
# this one so break
break

return valid_block_configs

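For illustration (not part of the diff): both code paths above read the forced block config from the `dev_force_block_config` field of the "relay.ext.ethos-u.options" PassContext config as an "HxWxC" string, which is split on 'x' and, for NHWC output, interpreted as height, width and channels. A sketch of how that might be supplied; the surrounding build flow and the other option values shown here are assumptions.

import tvm

with tvm.transform.PassContext(
    opt_level=3,
    config={
        "relay.ext.ethos-u.options": {
            "accelerator_config": "ethos-u55-256",
            "dev_force_block_config": "16x16x8",  # height x width x channels
        }
    },
):
    # ... build the module as usual; every block config is forced to 16x16x8
    pass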
70 changes: 70 additions & 0 deletions python/tvm/contrib/ethosu/cascader/logging.py
@@ -0,0 +1,70 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""A class to hold logging information about the cascader"""
from typing import Tuple
import datetime
import json
import os
import math


class Logging:
"""Cascader logging class"""

def __init__(self):
self.min_memory_usage = 0
self.max_memory_usage = 0
self.min_cycles = 0
self.max_cycles = 0

self.selected_proposal_idx = -1
self.proposals = {}
self.cascader_runtime = 0

def add_proposal(self, idx: int, memory_usage: int, cycles: int):
self.proposals[idx] = {"memory_usage": memory_usage, "cycles": cycles}

def get_extreme_points(self) -> Tuple[int, int, int, int]:
min_cycles, min_mem_usage = math.inf, math.inf
max_cycles, max_mem_usage = 0, 0
for proposal in self.proposals.values():
min_mem_usage = min(proposal["memory_usage"], min_mem_usage)
max_mem_usage = max(proposal["memory_usage"], max_mem_usage)
min_cycles = min(proposal["cycles"], min_cycles)
max_cycles = max(proposal["cycles"], max_cycles)

return min_mem_usage, max_mem_usage, min_cycles, max_cycles

def dump_json(self):
min_mem_usage, max_mem_usage, min_cycles, max_cycles = self.get_extreme_points()
with open(os.getcwd() + "/cascader_log.json", "w") as json_file:
print(
json.dumps(
{
"date": f"{datetime.datetime.now()}",
"cascader_runtime": self.cascader_runtime,
"min_cycles": min_cycles,
"max_cycles": max_cycles,
"min_memory_usage": min_mem_usage,
"max_memory_usage": max_mem_usage,
"selected_proposal": self.selected_proposal_idx,
"proposals": self.proposals,
},
indent=2,
),
file=json_file,
)
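For illustration (not part of the diff): a minimal sketch of how the Logging helper could be used while evaluating cascader proposals; the indices, memory and cycle figures are made up.

from tvm.contrib.ethosu.cascader import Logging

log = Logging()
log.add_proposal(0, memory_usage=200_000, cycles=1_200_000)
log.add_proposal(1, memory_usage=150_000, cycles=1_500_000)
log.selected_proposal_idx = 1
log.cascader_runtime = 3.2  # time spent in the cascader

# Writes 'cascader_log.json' to the current working directory
log.dump_json()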
6 changes: 4 additions & 2 deletions python/tvm/contrib/ethosu/cascader/pareto.py
@@ -35,5 +35,7 @@ def _thin_vector(vec: List[Object], max_size: int) -> List[Object]:
return list(_ffi_api.ThinVector(vec, max_size))


def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]:
return list(_ffi_api.ParetoCullPlans(plans, max_plans))
def _pareto_cull_plans(
plans: List[Plan], max_plans: int, disable_pareto_metric: bool
) -> List[Plan]:
return list(_ffi_api.ParetoCullPlans(plans, max_plans, disable_pareto_metric))
8 changes: 6 additions & 2 deletions python/tvm/contrib/ethosu/cascader/plan_generator.py
@@ -27,9 +27,13 @@


def _generate_output_stripe_configs(
part: Part, stripe_factors: int, enable_striping: bool
part: Part, stripe_factors: int, enable_striping: bool, multi_dimensional: bool
) -> List[StripeConfig]:
return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors, enable_striping))
return list(
_ffi_api.GenerateOutputStripeConfigs(
part, stripe_factors, enable_striping, multi_dimensional
)
)


def _generate_single_plans(