From d2a7f93bebcaa08bc06c6e1e20681b31735383f4 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 9 May 2022 14:45:09 -0700 Subject: [PATCH] [ROOFLINE] Calculate roofline from existing TIR PrimFunc (#11238) Refactor roofline_analysis to use a pass instrument to save TIR code from compilation for feature extraction. This should support different compilation pipelines and avoids recompiling the module twice. --- python/tvm/utils/roofline.py | 192 +++++++++++++++++++++++++--------- src/auto_scheduler/feature.cc | 2 +- 2 files changed, 146 insertions(+), 48 deletions(-) diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py index 2d05503da75a..431becdd00d1 100644 --- a/python/tvm/utils/roofline.py +++ b/python/tvm/utils/roofline.py @@ -22,6 +22,8 @@ from ..target import Target from ..runtime import profiler_vm, profiling, Device, num_threads from ..script import tir as T +from ..ir.instrument import pass_instrument +from ..ir.expr import GlobalVar def _create_args(mod: IRModule, dev: Device, func_name: str = "main"): @@ -36,16 +38,6 @@ def _create_args(mod: IRModule, dev: Device, func_name: str = "main"): return args -def _estimated_features(mod: IRModule, params: Dict[str, nd.NDArray], target: Target): - comp = relay.vm.VMCompiler() - mod, params = comp.optimize(mod, params=params, target=target) - return { - prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim)) - for name, prim in mod.functions.items() - if isinstance(prim, tir.PrimFunc) - } - - def _detect_vec_width_registers( target: Target, vec_width: Optional[int], num_vector_registers: Optional[int] ): @@ -226,60 +218,98 @@ def estimate_peak_bandwidth(target: Target, dev: Device, vec_width: Optional[int return a.numpy().size * 4 / times.min # 4 bytes per float32 -def roofline_analysis( - mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device +@pass_instrument +class SaveLoweredTIR: + """Save TIR functions from right before 
final lowering. Right now this + means right before tir.MakePackedAPI.""" + + def __init__(self): + self.functions = {} + self.done = False + + def run_after_pass(self, mod, info): + if not self.done: + if info.name == "tir.MakePackedAPI": + self.done = True + else: + for v, func in mod.functions.items(): + self.functions[v] = func + + +def roofline_from_existing( + report: profiling.Report, + tir_functions: Dict[GlobalVar, tir.PrimFunc], + target: Target, + dev: Device, ) -> profiling.Report: - """ - Create a profiling report that contains roofline and other estimated - statistics from running a module on the VM. + """Add roofline and other estimated statistics to an existing profiling report. - These statistics are calculated by analyzing the lowered TIR of each - operator, so they are estimates of the true values. The statistics are: - - Bound: Is the operator memory or compute bound. This is computed by - assuming that the operator could perfectly cache all loads -- each byte - of memory is only loaded once. - - Percent of Theoretical Optimal: What percent of theoretical optimal for - the bound. i.e. percent of peak memory bandwidth if memory bound, - percent of peak FLOP/s if compute bound. - - Loaded Bytes: estimation of the number of bytes loaded from main memory. - - Estimated Flops: estimated number of floating point operations. - - Arithmetic Intensity: ratio of FLOPs per byte of data. - - FLOP/s: floating point operations per second. - - Bandwidth: Number of bytes loaded per second. + :py:func:`roofline_analysis` should always be used instead of this function + unless you need a custom compilation pipeline. - Parameters - ---------- - mod : IRModule - Uncompiled input module> + Calculating roofline statistics requires features extracted from the TIR + functions in addition to per-operator runtime information (`report`) of the + same TIR features.
The features and TIR functions are not included with the + compiled library used to generate the per-operator runtime. It is essential + that the per-operator information comes from the exact same compilation + pipeline as the TIR functions. - params : Dict[str, nd.NDArray] - target : Union[str, Target] - Target to run on. + Example + ------- + + .. code:: python + + import tvm + import tvm.relay + + mod, params = tvm.relay.testing.mlp.get_workload() + + # it is recommended to use SaveLoweredTIR to get out the tir primfuncs + save_tir = tvm.utils.roofline.SaveLoweredTIR() + with tvm.transform.PassContext(opt_level=3, instruments=[save_tir]): + lib = relay.vm.compile(mod, params=params, target=target) + + vmexec = profiler_vm.VirtualMachineProfiler(lib, dev) + report = vmexec.profile(*inputs) + + roofline_report = roofline_from_existing(report, save_tir.functions, target, dev) + + Parameters + ---------- + report : Report + Existing profiling report from :py:meth:`VirtualMachineProfiler.profile`. + tir_functions : Dict[GlobalVar, PrimFunc] + TIR primfuncs from the module run to generate `report`. It is necessary + that these functions come before the `tir.MakePackedAPI` pass and are + compatible with auto_scheduler featurization. + :py:class:`SaveLoweredTIR` is the recommended way to collect these + functions. + target : Target + TVM target that `report` was generated with. dev : Device - Device to run on. + Device that `report` was generated with. Returns ------- - - report : profiling.Report - Profiling report which includes the estimated statistics. + profiling.Report + New profiling report that includes all information from `report` + along with additional roofline metrics. See + :py:func:`roofline_analysis` for more information on which metrics + are included.
""" - if isinstance(target, str): - target = Target(target) peak_bandwidth = estimate_peak_bandwidth(target, dev) peak_flops = estimate_peak_fma_flops(target, dev) ridge_point = peak_flops / peak_bandwidth - all_features = _estimated_features(mod, params, target) - - lib = relay.vm.compile(mod, params=params, target=target) - vmexec = profiler_vm.VirtualMachineProfiler(lib, dev) + all_features = { + prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim)) + for name, prim in tir_functions.items() + if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys() + } - args = _create_args(mod, dev) - report = vmexec.profile(*args) new_calls = [] for call in report.calls: if "Hash" in call.keys(): @@ -313,3 +343,71 @@ def roofline_analysis( else: new_calls.append(call) return profiling.Report(new_calls, report.device_metrics) + + +def roofline_analysis( + mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device +) -> profiling.Report: + """ + Create a profiling report that contains roofline and other estimated + statistics from running a module on the VM. + + The roofline model measures how close an operator gets to the best possible + memory bandwidth or FLOP/s depending on whether it is memory or compute + bound. This computation uses the runtime of the operator along with two + numbers extracted from the TIR code: bytes of memory touched and number of + floating point operations. + + These statistics are calculated by analyzing the lowered TIR of each + operator, so they are estimates of the true values. The statistics are: + - Bound: Is the operator memory or compute bound. This is computed by + assuming that the operator could perfectly cache all loads -- each byte + of memory is only loaded once. + - Percent of Theoretical Optimal: What percent of theoretical optimal for + the bound. i.e. percent of peak memory bandwidth if memory bound, + percent of peak FLOP/s if compute bound.
+ - Loaded Bytes: estimation of the number of bytes loaded from main memory. + - Estimated Flops: estimated number of floating point operations. + - Arithmetic Intensity: ratio of FLOPs per byte of data. + - FLOP/s: floating point operations per second. + - Bandwidth: Number of bytes loaded per second. + + Parameters + ---------- + mod : IRModule + Uncompiled input module. + + params : Dict[str, nd.NDArray] + + target : Union[str, Target] + Target to run on. + + dev : Device + Device to run on. + + Returns + ------- + + report : profiling.Report + Profiling report which includes the estimated statistics. + """ + if isinstance(target, str): + target = Target(target) + + save_tir = SaveLoweredTIR() + # copy existing context but add our instrument + pass_ctx = transform.PassContext.current() + with transform.PassContext( + opt_level=pass_ctx.opt_level, + required_pass=pass_ctx.required_pass, + disabled_pass=pass_ctx.disabled_pass, + instruments=list(pass_ctx.instruments) + [save_tir], + config=pass_ctx.config, + ): + lib = relay.vm.compile(mod, params=params, target=target) + vmexec = profiler_vm.VirtualMachineProfiler(lib, dev) + + args = _create_args(mod, dev) + report = vmexec.profile(*args) + + return roofline_from_existing(report, save_tir.functions, target, dev) diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 5543b873ed33..bf6fce8978c9 100644 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -740,7 +740,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // TODO(tkonolige): add arithmetic counts from this statement to counts of inner stores.
ana_.Bind(node->var, node->value); ICHECK(variable_definition_stack_.size() > 0) - << "Variable definition out size of a for loop is not handled by feature extraction"; + << "Variable definition outside of a for loop is not handled by feature extraction"; variable_definition_stack_.back().push_back(std::make_tuple(node->var, node->value)); StmtExprVisitor::VisitStmt_(node); }