diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py index 51f5e58a47ce..1d608c04ff6e 100644 --- a/python/tvm/contrib/ethosu/cascader/__init__.py +++ b/python/tvm/contrib/ethosu/cascader/__init__.py @@ -37,4 +37,5 @@ from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig from .plan import Plan from .scheduler import apply_proposal, cascade, extract_memory_info +from .logging import Logging from .cascader_options import CascaderOptions diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py index f246918cf490..b90de753f679 100644 --- a/python/tvm/contrib/ethosu/cascader/block_config.py +++ b/python/tvm/contrib/ethosu/cascader/block_config.py @@ -55,5 +55,17 @@ def compute_cycles(self) -> int: def output_cycles(self) -> int: return int(self._output_cycles) + def __ge__(self, other: "BlockConfig"): + if len(self.output_shape) != len(other.output_shape): + return False + + return all(a >= b for a, b in zip(self.output_shape, other.output_shape)) + + def __lt__(self, other: "BlockConfig"): + if len(self.output_shape) != len(other.output_shape): + return False + + return other >= self + def __repr__(self) -> str: return f"BlockConfig(output_shape={self.output_shape})" diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py index ade04bdde9b0..aeca7fcdcb14 100644 --- a/python/tvm/contrib/ethosu/cascader/cascader_options.py +++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py @@ -38,8 +38,20 @@ class CascaderOptions(Object): How many striping factors to try per axis. max_plan_size : int The maximum number of Parts in a Plan. + max_open_plans : int + The maximum number of open Plans to keep after culling. + max_closed_plans : int + The maximum number of closed Plans to keep after culling. 
always_copy_size : int The maximum size of a Tensor that will always be copied into the cascade region. + disable_pareto_plans : bool + Disable pareto culling for Plans. + disable_pareto_proposals : bool + Disable pareto culling for Proposals. + enable_multi_dimensional_striping : bool + Enable striping in multiple dimensions simultaneously. + disable_block_culling : bool + Disable culling of block configs. enable_striping : bool A boolean option to enable striping @@ -51,7 +63,13 @@ def __init__( max_proposals: int, stripe_factors: int, max_plan_size: int, + max_open_plans: int, + max_closed_plans: int, always_copy_size: int, + disable_pareto_plans: bool = False, + disable_pareto_proposals: bool = False, + enable_multi_dimensional_striping: bool = False, + disable_block_culling: bool = True, enable_striping: bool = False, ): self.__init_handle_by_constructor__( @@ -60,6 +78,12 @@ def __init__( max_proposals, stripe_factors, max_plan_size, + max_open_plans, + max_closed_plans, always_copy_size, + disable_pareto_plans, + disable_pareto_proposals, + enable_multi_dimensional_striping, + disable_block_culling, enable_striping, ) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index bf6ac48cf904..5f5a937628da 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -15,12 +15,15 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name +# pylint: disable=too-many-nested-blocks """Device config class to hold information about the target hardware""" from typing import Tuple, List, Dict, Optional from functools import reduce import math +import numpy as np +import tvm from . import BlockConfig from . import StripeConfig from . 
import Propagator @@ -64,13 +67,14 @@ def as_list(self): class EthosuDeviceConfig: """Arm(R) Ethos(TM)-U NPU config class""" - def __init__(self, device: str): + def __init__(self, device: str, disable_block_bulling: bool = False): self._device = device self._subkernel_limits = (8, 8) self._output_cycles = (1, 2, 3, 4, 6) self._split_depth = 16 self._max_block_shape = _Shape([1, 32, 64, 128]) self._bank_size_bytes = 1024 + self._disable_block_culling = disable_block_bulling if self._device == "ethos-u55-256": self._micro_block = _Shape([1, 2, 2, 8]) self._input_micro_block = _Shape([1, 2, 2, 8]) @@ -508,6 +512,28 @@ def get_elementwise_block_config( if activation == "LUT" and not self._lut_reserved: banks_available -= 2 + # Handle user-forced block config + options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None) + if options and options.dev_force_block_config: + block_config = [int(v) for v in options.dev_force_block_config.split("x")] + assert len(block_config) == 3 + if output_layout == "NHWC": + block_shape = [output_shape[0], block_config[0], block_config[1], block_config[2]] + else: + block_shape = [ + output_shape[0], + block_config[0], + 1 + ((block_config[2] - 1) // 16), + block_config[1], + 16, + ] + output_cycles = self._get_output_cycles( + op_type, op_str, ifm_dtype, ofm_dtype, activation + ) + output_cycles *= reduce(lambda a, b: a * b, block_shape, 1) + output_cycles = int(math.ceil(output_cycles)) + return [BlockConfig(block_shape, block_shape, 0, output_cycles)] + # Split the block in half until it fits into SHRAM max_height, max_width, max_depth = self._max_block_shape.as_list()[1:] if output_layout == "NHCWB16": @@ -666,6 +692,21 @@ def get_valid_block_configs( max_depth = min(ofm_channels, self._max_block_shape.depth) min_depth = max(self._micro_block.depth, upscaling_factor) + heights = range(min_height, max_height + min_height, min_height) + widths = range(min_width, max_width + min_width, min_width) + depths 
= range(min_depth, max_depth + min_depth, min_depth) + + # Handle user-forced block config + options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None) + forced = False + if options and options.dev_force_block_config: + block_config = [int(v) for v in options.dev_force_block_config.split("x")] + assert len(block_config) == 3 + heights = [block_config[0]] + widths = [block_config[1]] + depths = [block_config[2]] + forced = True + input_bytewidth = 1 if ifm_dtype == "int8" else 2 acc_bytewidth = self._get_accumulator_width(op_type, ifm_dtype) banks_available = self._total_banks - self._reserved_banks @@ -681,8 +722,8 @@ def get_valid_block_configs( else: input_block_depth = min(ifm_channels, 32) - for depth in range(min_depth, max_depth + min_depth, min_depth): - if (depth < output_shape.depth) and (depth % self._split_depth != 0): + for depth in reversed(depths): + if (depth < output_shape.depth) and (depth % self._split_depth != 0) and not forced: # Block depth has to be less than full depth or a multiple of the split depth continue @@ -690,17 +731,15 @@ def get_valid_block_configs( op_attrs, ifm_propagator, input_layout, output_layout, depth ) - for width in range(min_width, max_width + min_width, min_width): - for height in range(min_height, max_height + min_height, min_height): + for width in reversed(widths): + for height in reversed(heights): if output_layout == "NHCWB16": output_block = ( 1, height, 1 + ((depth - 1) // 16), width, - _round_up( - min(16, max(ofm_channels, min_depth)), self._micro_block.depth - ), + min(16, _round_up(ofm_channels, self._micro_block.depth)), ) order = [1, 2, 4, 3, 0] else: @@ -740,7 +779,7 @@ def get_valid_block_configs( output_cycles = self._get_output_cycles( op_type, op_str, ifm_dtype, ofm_dtype, activation ) - output_cycles *= reduce(lambda a, b: a * b, output_block, 1) + output_cycles *= np.prod(output_block).tolist() output_cycles = int(math.ceil(output_cycles)) compute_cycles = 
self._estimate_compute_cycles_per_block( op_type, @@ -755,11 +794,27 @@ def get_valid_block_configs( block_config = BlockConfig( input_block_shape.as_list(), output_block, compute_cycles, output_cycles ) - valid_block_configs.append(block_config) - else: - # Block config does not fit into SHRAM - # Any Block config that is strictly larger than this one will also fail - break + + if self._disable_block_culling: + # Block culling disabled - add all block configs that fit + valid_block_configs.append(block_config) + else: + # Add block config only if it's not dominated by an existing block. + # A block config is dominated by an existing block when that block's + # output_shape is greater than or equal to its own in every dimension + # (an existing block with an identical output_shape also dominates). + dominated = False + for valid_block in valid_block_configs: + if block_config < valid_block: + dominated = True + break + + if not dominated: + valid_block_configs.append(block_config) + + # Every consecutive block in the innermost loop will be dominated by + # this one so break + break return valid_block_configs diff --git a/python/tvm/contrib/ethosu/cascader/logging.py b/python/tvm/contrib/ethosu/cascader/logging.py new file mode 100644 index 000000000000..0b163eb147e7 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/logging.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A class to hold logging information about the cascader""" +from typing import Tuple +import datetime +import json +import os +import math + + +class Logging: + """Cascader logging class""" + + def __init__(self): + self.min_memory_usage = 0 + self.max_memory_usage = 0 + self.min_cycles = 0 + self.max_cycles = 0 + + self.selected_proposal_idx = -1 + self.proposals = {} + self.cascader_runtime = 0 + + def add_proposal(self, idx: int, memory_usage: int, cycles: int): + self.proposals[idx] = {"memory_usage": memory_usage, "cycles": cycles} + + def get_extreme_points(self) -> Tuple[int, int, int, int]: + min_cycles, min_mem_usage = math.inf, math.inf + max_cycles, max_mem_usage = 0, 0 + for proposal in self.proposals.values(): + min_mem_usage = min(proposal["memory_usage"], min_mem_usage) + max_mem_usage = max(proposal["memory_usage"], max_mem_usage) + min_cycles = min(proposal["cycles"], min_cycles) + max_cycles = max(proposal["cycles"], max_cycles) + + return min_mem_usage, max_mem_usage, min_cycles, max_cycles + + def dump_json(self): + min_mem_usage, max_mem_usage, min_cycles, max_cycles = self.get_extreme_points() + with open(os.getcwd() + "/cascader_log.json", "w") as json_file: + print( + json.dumps( + { + "date": f"{datetime.datetime.now()}", + "cascader_runtime": self.cascader_runtime, + "min_cycles": min_cycles, + "max_cycles": max_cycles, + "min_memory_usage": min_mem_usage, + "max_memory_usage": max_mem_usage, + "selected_proposal": self.selected_proposal_idx, + "proposals": self.proposals, + }, + indent=2, + ), + file=json_file, + ) diff --git 
a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py index 3c4dcbc88a45..545778934c2c 100644 --- a/python/tvm/contrib/ethosu/cascader/pareto.py +++ b/python/tvm/contrib/ethosu/cascader/pareto.py @@ -35,5 +35,7 @@ def _thin_vector(vec: List[Object], max_size: int) -> List[Object]: return list(_ffi_api.ThinVector(vec, max_size)) -def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]: - return list(_ffi_api.ParetoCullPlans(plans, max_plans)) +def _pareto_cull_plans( + plans: List[Plan], max_plans: int, disable_pareto_metric: bool +) -> List[Plan]: + return list(_ffi_api.ParetoCullPlans(plans, max_plans, disable_pareto_metric)) diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py b/python/tvm/contrib/ethosu/cascader/plan_generator.py index 9235a285d8b6..155e01431c08 100644 --- a/python/tvm/contrib/ethosu/cascader/plan_generator.py +++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py @@ -27,9 +27,13 @@ def _generate_output_stripe_configs( - part: Part, stripe_factors: int, enable_striping: bool + part: Part, stripe_factors: int, enable_striping: bool, multi_dimensional: bool ) -> List[StripeConfig]: - return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors, enable_striping)) + return list( + _ffi_api.GenerateOutputStripeConfigs( + part, stripe_factors, enable_striping, multi_dimensional + ) + ) def _generate_single_plans( diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py index 63d48a19afe9..d33abaf2b7c3 100644 --- a/python/tvm/contrib/ethosu/cascader/scheduler.py +++ b/python/tvm/contrib/ethosu/cascader/scheduler.py @@ -18,8 +18,10 @@ """Scheduler for cascader which converts Proposals into Schedules.""" from typing import Tuple, List, Dict, DefaultDict from collections import defaultdict +import time import numpy as np +import tvm from tvm import te from tvm import tir from tvm import PoolInfo @@ -31,6 +33,7 @@ from 
.proposal_generator import generate_proposals from .graph import create_cascader_graph from .device_config import EthosuDeviceConfig +from .logging import Logging def tile_nd( @@ -188,13 +191,20 @@ def create_home_map( return home_map -def choose_proposal(proposals: List[Proposal], cascade_region: MemoryRegion): +def choose_proposal( + proposals: List[Proposal], cascade_region: MemoryRegion, select_proposal_idx: int +): """Choose the best performing Proposal that doesn't overflow the cascade region.""" - proposal_choice = proposals[0] - for proposal in reversed(proposals): - if proposal.memory_usage < cascade_region.size: - proposal_choice = proposal - break + if select_proposal_idx != -1: + # Manually select proposal based on index, take modulus the total number of proposals to + # ensure that some proposal is always selected. + proposal_choice = proposals[select_proposal_idx % len(proposals)] + else: + proposal_choice = proposals[0] + for proposal in reversed(proposals): + if proposal.memory_usage < cascade_region.size: + proposal_choice = proposal + break return proposal_choice @@ -271,6 +281,17 @@ def cascade( Target device configuration. 
""" + tvmc_options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None) + log = Logging() if tvmc_options and tvmc_options.dev_cascader_logging else None + select_proposal_idx = ( + int(tvmc_options.dev_select_proposal_idx) + if tvmc_options and tvmc_options.dev_select_proposal_idx + else -1 + ) + + if log: + start = time.time() + assert options.cascade_region in working_regions # First convert the Tensor Expression graph into a CascaderGraph casc_graph = create_cascader_graph(te_graph, const_dict, device_config) @@ -279,6 +300,16 @@ def cascade( # Generate Proposals for Pareto-optimal ways to cascade the CascaderGraph proposals = generate_proposals(casc_graph, home_map, options) # Select the best Proposal subject to the memory constraints - proposal_choice = choose_proposal(proposals, options.cascade_region) + proposal_choice = choose_proposal(proposals, options.cascade_region, select_proposal_idx) + + if log: + for idx, proposal in enumerate(proposals): + log.add_proposal(idx, proposal.memory_usage, proposal.cycles) + if proposal == proposal_choice: + log.selected_proposal_idx = idx + + log.cascader_runtime = time.time() - start + log.dump_json() + # Apply the selected Proposal to the Tensor Expression Schedule apply_proposal(proposal_choice, sch) diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py index 2552d891c9dc..423834daa876 100644 --- a/python/tvm/relay/backend/contrib/ethosu/codegen.py +++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py @@ -368,6 +368,8 @@ def _ethos_u55_cascader(sram, enable_striping) -> Callable: stripe_factors=5, max_plan_size=10, always_copy_size=1024, + max_open_plans=8, + max_closed_plans=32, enable_striping=enable_striping, ) return _create_cascader( diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py b/python/tvm/relay/backend/contrib/ethosu/vela_api.py index 6d01e8de57b5..f241652e738f 100644 --- 
a/python/tvm/relay/backend/contrib/ethosu/vela_api.py +++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py @@ -67,6 +67,10 @@ def get_optimal_block_config( ethosu.vela.api.NpuShape3D : The optimal block config for the operator """ + options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None) + if options and options.dev_force_block_config: + block_config = [int(v) for v in options.dev_force_block_config.split("x")] + return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2]) all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config) return _get_optimal_block_config(all_valid_block_configs) diff --git a/src/contrib/ethosu/cascader/cascader_options.cc b/src/contrib/ethosu/cascader/cascader_options.cc index be4bfee6d75c..0daf3fed2481 100644 --- a/src/contrib/ethosu/cascader/cascader_options.cc +++ b/src/contrib/ethosu/cascader/cascader_options.cc @@ -30,28 +30,48 @@ void CascaderOptionsNode::VisitAttrs(AttrVisitor* v) { v->Visit("max_proposals", &max_proposals); v->Visit("stripe_factors", &stripe_factors); v->Visit("max_plan_size", &max_plan_size); + v->Visit("max_open_plans", &max_open_plans); + v->Visit("max_closed_plans", &max_closed_plans); v->Visit("always_copy_size", &always_copy_size); + v->Visit("disable_pareto_plans", &disable_pareto_plans); + v->Visit("disable_pareto_proposals", &disable_pareto_proposals); + v->Visit("enable_multi_dimensional_striping", &enable_multi_dimensional_striping); + v->Visit("disable_block_culling", &disable_block_culling); v->Visit("enable_striping", &enable_striping); } CascaderOptions::CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, - int stripe_factors, int max_plan_size, int always_copy_size, + int stripe_factors, int max_plan_size, int max_open_plans, + int max_closed_plans, int always_copy_size, + bool disable_pareto_plans, bool disable_pareto_proposals, + bool enable_multi_dimensional_striping, bool 
disable_block_culling, bool enable_striping) { auto n = make_object(); n->cascade_region = std::move(cascade_region); n->max_proposals = max_proposals; n->stripe_factors = stripe_factors; n->max_plan_size = max_plan_size; + n->max_open_plans = max_open_plans; + n->max_closed_plans = max_closed_plans; n->always_copy_size = always_copy_size; + n->disable_pareto_plans = disable_pareto_plans; + n->disable_pareto_proposals = disable_pareto_proposals; + n->enable_multi_dimensional_striping = enable_multi_dimensional_striping; + n->disable_block_culling = disable_block_culling; n->enable_striping = enable_striping; data_ = std::move(n); } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.CascaderOptions") .set_body_typed([](MemoryRegion cascade_region, int max_proposals, int stripe_factors, - int max_plan_size, int always_copy_size, bool enable_striping) { - return CascaderOptions(cascade_region, max_proposals, stripe_factors, max_plan_size, - always_copy_size, enable_striping); + int max_plan_size, int max_open_plans, int max_closed_plans, + int always_copy_size, bool disable_pareto_plans, + bool disable_pareto_proposals, bool enable_multi_dimensional_striping, + bool disable_block_culling, bool enable_striping) { + return CascaderOptions( + cascade_region, max_proposals, stripe_factors, max_plan_size, max_open_plans, + max_closed_plans, always_copy_size, disable_pareto_plans, disable_pareto_proposals, + enable_multi_dimensional_striping, disable_block_culling, enable_striping); }); TVM_REGISTER_NODE_TYPE(CascaderOptionsNode); diff --git a/src/contrib/ethosu/cascader/cascader_options.h b/src/contrib/ethosu/cascader/cascader_options.h index ba00451766bc..3545e5cc3ae0 100644 --- a/src/contrib/ethosu/cascader/cascader_options.h +++ b/src/contrib/ethosu/cascader/cascader_options.h @@ -47,8 +47,20 @@ class CascaderOptionsNode : public Object { int stripe_factors; /*! \brief The maximum number of Parts in a Plan. */ int max_plan_size; + /*! 
\brief The maximum number of open Plans saved for a Part Group */ + int max_open_plans; + /*! \brief The maximum number of closed Plans saved for a Part Group */ + int max_closed_plans; /*! \brief The maximum size of Tensor that will always be copied into the cascade region. */ int always_copy_size; + /*! \brief Flag to disable pareto culling for plans to allow non pareto-optimal plans */ + bool disable_pareto_plans; + /*! \brief Flag to disable pareto culling for proposals to allow non pareto-optimal proposals */ + bool disable_pareto_proposals; + /*! \brief Whether to consider multi-dimensional striping */ + bool enable_multi_dimensional_striping; + /*! \brief Flag to disable culling for block configs to allow non-dominant blocks */ + bool disable_block_culling; /*! \brief A boolean option to enable striping. */ bool enable_striping; @@ -60,7 +72,10 @@ class CascaderOptionsNode : public Object { class CascaderOptions : public ObjectRef { public: CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, int stripe_factors, - int max_plan_size, int always_copy_size, bool enable_striping = true); + int max_plan_size, int max_open_plans, int max_closed_plans, int always_copy_size, + bool disable_pareto_plans, bool disable_pareto_proposals, + bool enable_multi_dimensional_striping, bool disable_block_culling, + bool multi_dimensional_striping); TVM_DEFINE_OBJECT_REF_METHODS(CascaderOptions, ObjectRef, CascaderOptionsNode); }; diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc index 52ea729bffa2..e40a6602fa2a 100644 --- a/src/contrib/ethosu/cascader/pareto.cc +++ b/src/contrib/ethosu/cascader/pareto.cc @@ -80,10 +80,16 @@ std::vector ThinVector(const std::vector& vec, size_t max_size) { return thin_vec; } -std::vector ParetoCullPlans(std::vector plans, size_t max_plans) { +std::vector ParetoCullPlans(std::vector plans, size_t max_plans, + bool disable_pareto_metric) { if (plans.size() <= max_plans) { return plans; } 
+ if (disable_pareto_metric) { + // Sample from all plans + return ThinVector(plans, max_plans); + } + std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool { return a->GetMemoryUsage() < b->GetMemoryUsage(); }); @@ -108,7 +114,13 @@ std::vector ParetoCullPlans(std::vector plans, size_t max_plans) { return ThinVector(optimal_plans, max_plans); } -std::vector ParetoCullProposals(std::vector proposals, size_t max_proposals) { +std::vector ParetoCullProposals(std::vector proposals, size_t max_proposals, + bool disable_pareto_metric) { + if (disable_pareto_metric) { + // Sample from all Proposals + return ThinVector(proposals, max_proposals); + } + std::sort(proposals.begin(), proposals.end(), [](const Proposal& a, const Proposal& b) -> bool { return a->GetMemoryUsage() < b->GetMemoryUsage(); }); @@ -156,9 +168,9 @@ TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ThinVector") }); TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ParetoCullPlans") - .set_body_typed([](Array plans, int max_size) { + .set_body_typed([](Array plans, int max_size, bool disable_pareto_metric) { std::vector vplans(plans.begin(), plans.end()); - return Array(ParetoCullPlans(vplans, max_size)); + return Array(ParetoCullPlans(vplans, max_size, disable_pareto_metric)); }); } // namespace cascader diff --git a/src/contrib/ethosu/cascader/pareto.h b/src/contrib/ethosu/cascader/pareto.h index 511da6c2712f..abb6ca516c23 100644 --- a/src/contrib/ethosu/cascader/pareto.h +++ b/src/contrib/ethosu/cascader/pareto.h @@ -61,13 +61,16 @@ std::vector ThinVector(const std::vector& vec, size_t max_size); * \brief Cull plans which are not Pareto optimal then thin them down. * \param plans The plans to apply the Pareto culling to. * \param max_plans The maximum number of plans after the culling. + * \param disable_pareto_metric If true, skip Pareto culling and just thin all the plans. * \return The culled plans. 
* \note Plan Pareto-optimality is determined based upon a Plan's memory_usage * and cycles. */ -std::vector ParetoCullPlans(std::vector plans, size_t max_plans); +std::vector ParetoCullPlans(std::vector plans, size_t max_plans, + bool disable_pareto_metric); -std::vector ParetoCullProposals(std::vector proposals, size_t max_proposals); +std::vector ParetoCullProposals(std::vector proposals, size_t max_proposals, + bool disable_pareto_metric); } // namespace cascader } // namespace ethosu diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc index 75e711ea0fa0..780f9adc2c13 100644 --- a/src/contrib/ethosu/cascader/plan_generator.cc +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -106,7 +106,8 @@ std::vector GetCascadableAxes(const Part& part) { } std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors, - bool enable_striping) { + bool enable_striping, + bool multi_dimensional) { // If stripe_factors is <= 0, then we won't produce any StripeConfigs if (stripe_factors <= 0) { return std::vector(); @@ -147,11 +148,29 @@ std::vector GenerateOutputStripeConfigs(const Part& part, int stri } splits.push_back(std::vector(axis_splits.begin(), axis_splits.end())); } - // Now calculate all the possible combinations of splits for each dimension - // to give us all the possible stripe shapes. For example, if we had two axes - // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be: - // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1) - auto stripe_shapes = EnumerateCombinations(splits); + + std::vector> stripe_shapes; + if (multi_dimensional) { + // Now calculate all the possible combinations of splits for each dimension + // to give us all the possible stripe shapes. For example, if we had two axes + // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be: + // (128, 128), (128, 64), (128, 32) ... 
(1, 64), (1, 32), (1, 1) + stripe_shapes = EnumerateCombinations(splits); + } else { + // Only consider splitting a single axis + int axis = 0; + for (const auto& split : splits) { + for (const auto& axis_split : split) { + std::vector stripe_shape = output_shape; + if (stripe_shape[axis] != axis_split) { + stripe_shape[axis] = axis_split; + stripe_shapes.push_back(stripe_shape); + } + } + axis++; + } + stripe_shapes.push_back(output_shape); + } auto offset = std::vector(output_dims); std::vector stripe_configs; // Calculate the possible axis orderings such that each axis has the opportunity @@ -437,7 +456,8 @@ std::unordered_map, std::vector> GenerateGraphPlans( // output of a Plan. The number generated is a function of stripe_factors and the number of // cascadable dimensions in the Part. std::vector stripe_configs = - GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping); + GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping, + options->enable_multi_dimensional_striping); // Check to see if the output Tensor is part of any existing open Plans if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) { // If there are other open Plans which have this Part's output Tensor as an input, then @@ -491,10 +511,12 @@ std::unordered_map, std::vector> GenerateGraphPlans( // and plans_by_config maps. 
for (const auto& part_group : new_part_groups) { if (closed_plans.find(part_group) != closed_plans.end()) { - closed_plans[part_group] = ParetoCullPlans(closed_plans.at(part_group), 32); + closed_plans[part_group] = ParetoCullPlans( + closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans); } for (const auto& it : open_plans[part_group]) { - auto pareto_plans = ParetoCullPlans(it.second, 8); + auto pareto_plans = + ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans); for (const auto& plan : pareto_plans) { for (const auto& open_config : plan->GetOpenConfigs()) { if (open_config != plan->GetOutputConfig()) { @@ -515,12 +537,13 @@ std::unordered_map, std::vector> GenerateGraphPlans( } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs") - .set_body_typed([](Part part, int stripe_factors, bool enable_striping) { + .set_body_typed([](Part part, int stripe_factors, bool enable_striping, + bool multi_dimensional) { if (stripe_factors < 0) { return Array(); } return Array( - GenerateOutputStripeConfigs(part, stripe_factors, enable_striping)); + GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional)); }); TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans") diff --git a/src/contrib/ethosu/cascader/plan_generator.h b/src/contrib/ethosu/cascader/plan_generator.h index 947728addfd1..71bdef82d2cb 100644 --- a/src/contrib/ethosu/cascader/plan_generator.h +++ b/src/contrib/ethosu/cascader/plan_generator.h @@ -51,9 +51,12 @@ using HomeMap = * \brief Generate possible output StripeConfigs that could be applied to a Part's output. * \param part The Part to generate StripeConfigs for. * \param stripe_factors How many striping factors to try per axis. + * \param enable_striping Whether striping is enabled + * \param multi_dimensional Whether to stripe in more than one dimension. * \return The generated StripeConfigs for the Part's output. 
*/ -std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors); +std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors, + bool enable_striping, bool multi_dimensional); /*! * \brief Generate single-Part Plans for a Part for a given list of output StripeConfigs. diff --git a/src/contrib/ethosu/cascader/proposal_generator.cc b/src/contrib/ethosu/cascader/proposal_generator.cc index ce709cbaa657..f886aad42408 100644 --- a/src/contrib/ethosu/cascader/proposal_generator.cc +++ b/src/contrib/ethosu/cascader/proposal_generator.cc @@ -177,7 +177,8 @@ std::vector GeneratePartialProposals( } } (*proposals_by_group)[partial_proposal_group] = - ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals); + ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals, + options->disable_pareto_proposals); } return proposals_by_group->at(partial_proposal_group); } diff --git a/src/relay/backend/contrib/ethosu/compiler_attrs.cc b/src/relay/backend/contrib/ethosu/compiler_attrs.cc index 5124e273d9bf..42add45b013c 100644 --- a/src/relay/backend/contrib/ethosu/compiler_attrs.cc +++ b/src/relay/backend/contrib/ethosu/compiler_attrs.cc @@ -41,6 +41,14 @@ struct EthosUCompilerConfigNode : public tvm::AttrsNode