[ROOFLINE] Calculate roofline from existing TIR PrimFunc (apache#11238)
Refactor roofline_analysis to use a pass instrument to save TIR code
from compilation for feature extraction. This should support different
compilation pipelines and avoids recompiling the module twice.
Tristan Konolige authored and Sergey Shtin committed May 17, 2022
1 parent 8690699 commit f187136
Showing 2 changed files with 146 additions and 48 deletions.
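
In practice, the flow this commit enables looks roughly like the following sketch (the docstring example in the diff below shows the same idea; `mod`, `params`, `target`, `dev`, and `inputs` are assumed placeholders for a Relay module, its parameters, a TVM target, a device, and input tensors):

    import tvm
    from tvm import relay
    from tvm.runtime import profiler_vm
    from tvm.utils import roofline

    save_tir = roofline.SaveLoweredTIR()
    # compile once; the instrument saves the lowered TIR right before tir.MakePackedAPI
    with tvm.transform.PassContext(opt_level=3, instruments=[save_tir]):
        lib = relay.vm.compile(mod, params=params, target=target)
    # measure per-operator runtimes on the VM
    vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
    report = vmexec.profile(*inputs)
    # add roofline estimates to the existing report using the saved TIR functions
    roofline_report = roofline.roofline_from_existing(report, save_tir.functions, target, dev)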
192 changes: 145 additions & 47 deletions python/tvm/utils/roofline.py
@@ -22,6 +22,8 @@
from ..target import Target
from ..runtime import profiler_vm, profiling, Device, num_threads
from ..script import tir as T
from ..ir.instrument import pass_instrument
from ..ir.expr import GlobalVar


def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
@@ -36,16 +38,6 @@ def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
return args


def _estimated_features(mod: IRModule, params: Dict[str, nd.NDArray], target: Target):
comp = relay.vm.VMCompiler()
mod, params = comp.optimize(mod, params=params, target=target)
return {
prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
for name, prim in mod.functions.items()
if isinstance(prim, tir.PrimFunc)
}


def _detect_vec_width_registers(
target: Target, vec_width: Optional[int], num_vector_registers: Optional[int]
):
@@ -226,60 +218,98 @@ def estimate_peak_bandwidth(target: Target, dev: Device, vec_width: Optional[int
return a.numpy().size * 4 / times.min # 4 bytes per float32
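
As a rough sanity check of the context line above (illustrative, assumed numbers only): streaming a 100-million-element float32 buffer in 8 ms would give

    nbytes = 100_000_000 * 4   # 4 bytes per float32
    seconds = 8e-3             # hypothetical measured transfer time
    print(nbytes / seconds)    # 5.0e10 bytes/s, i.e. roughly 50 GB/s estimated peak bandwidth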


def roofline_analysis(
mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
@pass_instrument
class SaveLoweredTIR:
"""Save TIR functions from right before final lowering. Right now this
means right before tir.MakePackedAPI."""

def __init__(self):
self.functions = {}
self.done = False

def run_after_pass(self, mod, info):
if not self.done:
if info.name == "tir.MakePackedAPI":
self.done = True
else:
for v, func in mod.functions.items():
self.functions[v] = func


def roofline_from_existing(
report: profiling.Report,
tir_functions: Dict[GlobalVar, tir.PrimFunc],
target: Target,
dev: Device,
) -> profiling.Report:
"""
Create a profiling report that contains roofline and other estimated
statistics from running a module on the VM.
"""Add roofline and other estimated statistics to an existing profiling report.
These statistics are calculated by analyzing the lowered TIR of each
operator, so they are estimates of the true values. The statistics are:
- Bound: Is the operator memory or compute bound. This is computed by
assuming that the operator could perfectly cache all loads -- each byte
of memory is only loaded once.
- Percent of Theoretical Optimal: What percent of theoretical optimal for
the bound. i.e. percent of peak memory bandwidth if memory bound,
percent of peak FLOP/s if compute bound.
- Loaded Bytes: estimation of the number of bytes loaded from main memory.
- Estimated Flops: estimated number of floating point operations.
- Arithmetic Intensity: ratio of FLOPs per byte of data.
- FLOP/s: floating point operations per second.
- Bandwidth: Number of bytes loaded per second.
:py:func:`roofline_analysis` should always be used instead of this function
unless you need a custom compilation pipeline.
Parameters
----------
mod : IRModule
Uncompiled input module.
Calculating roofline statistics requires features extracted from the TIR
functions in addition to per-operator runtime information (`report`)
collected from running those same TIR functions. The features and TIR functions are not included with the
compiled library used to generate the per-operator runtime. It is essential
that the per-operator information comes from the exact same compilation
pipeline as the TIR functions.
params : Dict[str, nd.NDArray]
target : Union[str, Target]
Target to run on.
Example
-------
.. code-block:: python
import tvm
import tvm.relay
mod, params = tvm.relay.testing.mlp.get_workload()
# it is recommended to use SaveLoweredTIR to collect the lowered TIR PrimFuncs
save_tir = tvm.utils.roofline.SaveLoweredTIR()
with tvm.transform.PassContext(opt_level=3, instruments=[save_tir]):
    lib = tvm.relay.vm.compile(mod, params=params, target=target)
vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
report = vmexec.profile(*inputs)
roofline_report = roofline_from_existing(report, save_tir.functions, target, dev)
Parameters
----------
report : Report
Existing profiling report from :py:meth:`VirtualMachineProfiler.profile`.
tir_functions : Dict[GlobalVar, PrimFunc]
TIR primfuncs from the module run to generate `report`. It is necessary
that these functions come from before the `tir.MakePackedAPI` pass and are
compatible with auto_scheduler featurization.
:py:class:`SaveLoweredTIR` is the recommended way to collect these
functions.
target : Target
TVM target that `report` was generated with.
dev : Device
Device to run on.
Device that `report` was generated with.
Returns
-------
report : profiling.Report
Profiling report which includes the estimated statistics.
profiling.Report
New profiling report that includes all information from `report`
along with additional roofline metrics. See
:py:func:`roofline_analysis` for more information on which metrics
are included.
"""
if isinstance(target, str):
target = Target(target)
peak_bandwidth = estimate_peak_bandwidth(target, dev)
peak_flops = estimate_peak_fma_flops(target, dev)

ridge_point = peak_flops / peak_bandwidth

all_features = _estimated_features(mod, params, target)

lib = relay.vm.compile(mod, params=params, target=target)
vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
all_features = {
prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
for name, prim in tir_functions.items()
if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
}

args = _create_args(mod, dev)
report = vmexec.profile(*args)
new_calls = []
for call in report.calls:
if "Hash" in call.keys():
@@ -313,3 +343,71 @@ def roofline_analysis(
else:
new_calls.append(call)
return profiling.Report(new_calls, report.device_metrics)


def roofline_analysis(
mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
) -> profiling.Report:
"""
Create a profiling report that contains roofline and other estimated
statistics from running a module on the VM.
The roofline model measures how close an operator gets to the best possible
memory bandwidth or FLOP/s depending on whether it is memory or compute
bound. This computation uses the runtime of the operator along with two
numbers extracted from the TIR code: bytes of memory touched and number of
floating point operations.
These statistics are calculated by analyzing the lowered TIR of each
operator, so they are estimates of the true values. The statistics are:
- Bound: Is the operator memory or compute bound. This is computed by
assuming that the operator could perfectly cache all loads -- each byte
of memory is only loaded once.
- Percent of Theoretical Optimal: What percent of theoretical optimal for
the bound. i.e. percent of peak memory bandwidth if memory bound,
percent of peak FLOP/s if compute bound.
- Loaded Bytes: estimation of the number of bytes loaded from main memory.
- Estimated Flops: estimated number of floating point operations.
- Arithmetic Intensity: ratio of FLOPs per byte of data.
- FLOP/s: floating point operations per second.
- Bandwidth: Number of bytes loaded per second.
Parameters
----------
mod : IRModule
Uncompiled input module.
params : Dict[str, nd.NDArray]
target : Union[str, Target]
Target to run on.
dev : Device
Device to run on.
Returns
-------
report : profiling.Report
Profiling report which includes the estimated statistics.
"""
if isinstance(target, str):
target = Target(target)

save_tir = SaveLoweredTIR()
# copy existing context but add our instrument
pass_ctx = transform.PassContext.current()
with transform.PassContext(
opt_level=pass_ctx.opt_level,
required_pass=pass_ctx.required_pass,
disabled_pass=pass_ctx.disabled_pass,
instruments=list(pass_ctx.instruments) + [save_tir],
config=pass_ctx.config,
):
lib = relay.vm.compile(mod, params=params, target=target)
vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)

args = _create_args(mod, dev)
report = vmexec.profile(*args)

return roofline_from_existing(report, save_tir.functions, target, dev)
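
For intuition on how the estimated statistics named in the docstrings fit together, here is a self-contained sketch with made-up numbers (the real values come from estimate_peak_fma_flops, estimate_peak_bandwidth, the extracted TIR features, and the measured runtimes):

    # illustrative, assumed numbers only
    peak_flops = 5e11          # peak FLOP/s, as estimate_peak_fma_flops would measure
    peak_bandwidth = 5e10      # peak bytes/s, as estimate_peak_bandwidth would measure
    ridge_point = peak_flops / peak_bandwidth   # FLOPs per byte where the two limits cross

    est_flops = 4e9            # estimated FLOPs for one operator
    loaded_bytes = 2e8         # estimated bytes loaded from main memory
    runtime_s = 0.02           # measured runtime of that operator from the report

    arith_intensity = est_flops / loaded_bytes  # 20 FLOPs per byte
    if arith_intensity > ridge_point:           # ridge point is 10 here, so compute bound
        bound = "compute"
        pct_of_optimal = 100 * (est_flops / runtime_s) / peak_flops
    else:
        bound = "memory"
        pct_of_optimal = 100 * (loaded_bytes / runtime_s) / peak_bandwidth
    print(bound, pct_of_optimal)                # -> compute 40.0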
2 changes: 1 addition & 1 deletion src/auto_scheduler/feature.cc
@@ -740,7 +740,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
// TODO(tkonolige): add arithmetic counts from this statement to counts of inner stores.
ana_.Bind(node->var, node->value);
ICHECK(variable_definition_stack_.size() > 0)
<< "Variable definition out size of a for loop is not handled by feature extraction";
<< "Variable definition outside of a for loop is not handled by feature extraction";
variable_definition_stack_.back().push_back(std::make_tuple(node->var, node->value));
StmtExprVisitor::VisitStmt_(node);
}