Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROOFLINE] Calculate roofline from existing TIR PrimFunc #11238

Merged
merged 1 commit into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 145 additions & 47 deletions python/tvm/utils/roofline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from ..target import Target
from ..runtime import profiler_vm, profiling, Device, num_threads
from ..script import tir as T
from ..ir.instrument import pass_instrument
from ..ir.expr import GlobalVar


def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
Expand All @@ -36,16 +38,6 @@ def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
return args


def _estimated_features(mod: IRModule, params: Dict[str, nd.NDArray], target: Target):
    """Extract auto_scheduler features for every hashed PrimFunc in `mod`.

    The module is first run through ``relay.vm.VMCompiler().optimize`` and the
    resulting functions are then featurized.

    Parameters
    ----------
    mod : IRModule
        Module to optimize and featurize.
    params : Dict[str, nd.NDArray]
        Parameters forwarded to the VM compiler's ``optimize`` call.
    target : Target
        Target forwarded to the VM compiler's ``optimize`` call.

    Returns
    -------
    dict
        Maps each PrimFunc's ``"hash"`` attribute to a ``(name, features)``
        tuple, where ``name`` is the key of the function in ``mod.functions``
        and ``features`` is the result of
        ``auto_scheduler.feature.named_features_from_primfunc``.
    """
    comp = relay.vm.VMCompiler()
    mod, params = comp.optimize(mod, params=params, target=target)
    return {
        prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
        for name, prim in mod.functions.items()
        # Functions without a "hash" attribute cannot be keyed, so skip them
        # instead of failing on the lookup.
        if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
    }


def _detect_vec_width_registers(
target: Target, vec_width: Optional[int], num_vector_registers: Optional[int]
):
Expand Down Expand Up @@ -226,60 +218,98 @@ def estimate_peak_bandwidth(target: Target, dev: Device, vec_width: Optional[int
return a.numpy().size * 4 / times.min # 4 bytes per float32


def roofline_analysis(
mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
@pass_instrument
class SaveLoweredTIR:
    """Pass instrument that records TIR functions prior to final lowering.

    After every pass the functions of the current module are stored in
    ``self.functions``, overwriting earlier entries with the same key.
    Recording stops permanently once a pass named ``tir.MakePackedAPI`` has
    run, so the saved functions reflect the state right before that pass.
    """

    def __init__(self):
        # Latest recorded function for every key seen in `mod.functions`.
        self.functions = {}
        # Becomes True once tir.MakePackedAPI has been observed.
        self.done = False

    def run_after_pass(self, mod, info):
        """Record the functions of `mod` unless recording has finished.

        Parameters
        ----------
        mod
            Module produced by the pass that just ran; its ``functions``
            mapping is copied into ``self.functions``.
        info
            Description of the pass that just ran; only ``info.name`` is read.
        """
        if self.done:
            return
        if info.name == "tir.MakePackedAPI":
            # The output of this pass (and of all later ones) is not recorded.
            self.done = True
            return
        for gvar, prim in mod.functions.items():
            self.functions[gvar] = prim


def roofline_from_existing(
report: profiling.Report,
tir_functions: Dict[GlobalVar, tir.PrimFunc],
target: Target,
dev: Device,
) -> profiling.Report:
"""
Create a profiling report that contains roofline and other estimated
statistics from running a module on the VM.
"""Add roofline and other estimated statistics to an existing profiling report.

These statistics are calculated by analyzing the lowered TIR of each
operator, so they are estimates of the true values. The statistics are:
- Bound: Is the operator memory or compute bound. This is computed by
assuming that the operator could perfectly cache all loads -- each byte
of memory is only loaded once.
- Percent of Theoretical Optimal: What percent of theoretical optimal for
the bound. i.e. percent of peak memory bandwidth if memory bound,
percent of peak FLOP/s if compute bound.
- Loaded Bytes: estimation of the number of bytes loaded from main memory.
- Estimated Flops: estimated number of floating point operations.
- Arithmetic Intensity: ratio of FLOPs per byte of data.
- FLOP/s: floating point operations per second.
- Bandwidth: Number of bytes loaded per second.
:py:func:`roofline_analysis` should always be used instead of this function
unless you need a custom compilation pipeline.

Parameters
----------
mod : IRModule
Uncompiled input module.
Calculating roofline statistics requires features extracted from the TIR
functions in addition to per-operator runtime information (`report`) of the
same TIR features. The features and TIR functions are not included with the
compiled library used to generate the per-operator runtime. It is essential
that the per-operator information comes from the exact same compilation
pipeline as the TIR functions.

params : Dict[str, nd.NDArray]

target : Union[str, Target]
Target to run on.
Example
-------

.. code:: python

import tvm
import tvm.relay

mod, params = tvm.relay.testing.mlp.get_workload()

# it is recommended to use SaveLoweredTIR to get out the tir primfuncs
save_tir = tvm.utils.roofline.SaveLoweredTIR()
with tvm.transform.PassContext(opt_level=3, instruments=[save_tir]):
lib = relay.vm.compile(mod, params=params, target=target)

vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
report = vmexec.profile(*inputs)

roofline_report = roofline_from_existing(report, save_tir.functions, target, dev)


Parameters
----------
report : Report
Existing profiling report from :py:meth:`VirtualMachineProfiler.profile`.
tir_functions : Dict[GlobalVar, PrimFunc]
TIR primfuncs from the module run to generate `report`. It is necessary
that these functions come before the `tir.MakePackedAPI` pass and are
compatible with auto_scheduler featurization.
:py:class:`SaveLoweredTIR` is the recommended way to collect these
functions.
target : Target
TVM target that `report` was generated with.
dev : Device
Device to run on.
Device that `report` was generated with.

Returns
-------

report : profiling.Report
Profiling report which includes the estimated statistics.
profiling.Report
New profiling report that includes all information from `report`
along with additional roofline metrics. See
:py:func:`roofline_analysis` for more information on which metrics
are included.
"""
if isinstance(target, str):
target = Target(target)
peak_bandwidth = estimate_peak_bandwidth(target, dev)
peak_flops = estimate_peak_fma_flops(target, dev)

ridge_point = peak_flops / peak_bandwidth

all_features = _estimated_features(mod, params, target)

lib = relay.vm.compile(mod, params=params, target=target)
vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
all_features = {
prim.attrs["hash"]: (name, auto_scheduler.feature.named_features_from_primfunc(prim))
for name, prim in tir_functions.items()
if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
}

args = _create_args(mod, dev)
report = vmexec.profile(*args)
new_calls = []
for call in report.calls:
if "Hash" in call.keys():
Expand Down Expand Up @@ -313,3 +343,71 @@ def roofline_analysis(
else:
new_calls.append(call)
return profiling.Report(new_calls, report.device_metrics)


def roofline_analysis(
    mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target], dev: Device
) -> profiling.Report:
    """
    Compile and profile a module on the VM, then return a profiling report
    extended with roofline and other estimated statistics.

    The roofline model measures how close an operator gets to the best
    possible memory bandwidth or FLOP/s, depending on whether it is memory or
    compute bound. The computation combines the runtime of the operator with
    two numbers extracted from the TIR code: bytes of memory touched and
    number of floating point operations.

    All statistics are derived from the lowered TIR of each operator, so they
    are estimates of the true values. The statistics are:
      - Bound: Whether the operator is memory or compute bound. This assumes
        the operator could perfectly cache all loads -- each byte of memory
        is only loaded once.
      - Percent of Theoretical Optimal: Percent of the theoretical optimum
        for the bound, i.e. percent of peak memory bandwidth if memory bound,
        percent of peak FLOP/s if compute bound.
      - Loaded Bytes: Estimated number of bytes loaded from main memory.
      - Estimated Flops: Estimated number of floating point operations.
      - Arithmetic Intensity: Ratio of FLOPs per byte of data.
      - FLOP/s: Floating point operations per second.
      - Bandwidth: Number of bytes loaded per second.

    Parameters
    ----------
    mod : IRModule
        Uncompiled input module.

    params : Dict[str, nd.NDArray]
        Parameters passed to ``relay.vm.compile`` together with `mod`.

    target : Union[str, Target]
        Target to run on. A string is converted to a :py:class:`Target`.

    dev : Device
        Device to run on.

    Returns
    -------

    report : profiling.Report
        Profiling report which includes the estimated statistics.
    """
    target = Target(target) if isinstance(target, str) else target

    tir_recorder = SaveLoweredTIR()
    # Mirror the currently active pass context, appending our instrument so
    # the lowered TIR is captured during compilation.
    outer_ctx = transform.PassContext.current()
    instruments = [*outer_ctx.instruments, tir_recorder]
    with transform.PassContext(
        opt_level=outer_ctx.opt_level,
        required_pass=outer_ctx.required_pass,
        disabled_pass=outer_ctx.disabled_pass,
        instruments=instruments,
        config=outer_ctx.config,
    ):
        executable = relay.vm.compile(mod, params=params, target=target)
        profiler = profiler_vm.VirtualMachineProfiler(executable, dev)

    report = profiler.profile(*_create_args(mod, dev))

    return roofline_from_existing(report, tir_recorder.functions, target, dev)
2 changes: 1 addition & 1 deletion src/auto_scheduler/feature.cc
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
// TODO(tkonolige): add arithmetic counts from this statement to counts of inner stores.
ana_.Bind(node->var, node->value);
ICHECK(variable_definition_stack_.size() > 0)
<< "Variable definition out size of a for loop is not handled by feature extraction";
<< "Variable definition outside of a for loop is not handled by feature extraction";
variable_definition_stack_.back().push_back(std::make_tuple(node->var, node->value));
StmtExprVisitor::VisitStmt_(node);
}
Expand Down