Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Graph Debugger] Expose way to benchmark individual nodes. #11000

Merged
merged 15 commits into from
Apr 28, 2022
55 changes: 50 additions & 5 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@
# under the License.
"""Graph debug runtime executes TVM debug packed functions."""

import logging
import os
import tempfile
import shutil
import logging
import tvm._ffi
import tempfile

import tvm._ffi
from tvm._ffi.base import string_types
from tvm.contrib import graph_executor
from . import debug_result
from tvm.runtime.module import BenchmarkResult

from ...runtime.profiling import Report
from . import debug_result

_DUMP_ROOT_PREFIX = "tvmdbg_"
_DUMP_PATH_PREFIX = "_tvmdbg_"
Expand Down Expand Up @@ -111,6 +113,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._dump_root = dump_root
self._dump_path = None
self._run_individual = module["run_individual"]
self._run_individual_node = module["run_individual_node"]
self._debug_get_output = module["debug_get_output"]
self._execute_node = module["execute_node"]
self._get_node_output = module["get_node_output"]
Expand Down Expand Up @@ -223,7 +226,6 @@ def _run_debug(self):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.

"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
Expand Down Expand Up @@ -281,6 +283,49 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []

def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
    """Benchmark a single node in the serialized graph.

    This does not do any data transfers and uses arrays already on the device.

    Parameters
    ----------
    index : int
        The index of the node, see `self.debug_datum.get_graph_nodes`

    number: int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.

    Returns
    -------
    A module BenchmarkResult
    """
    # Results are returned as a comma-separated string of per-repeat times
    # (one float per `repeat`); deserialize, skipping empty fragments such as
    # the one produced by a trailing comma.
    ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
    results = [float(value) for value in ret.split(",") if value.strip()]
    return BenchmarkResult(results)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
Expand Down
113 changes: 72 additions & 41 deletions src/runtime/graph_executor/debug/graph_executor_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#include <tvm/runtime/registry.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <sstream>

#include "../../rpc/rpc_session.h"
#include "../graph_executor.h"

namespace tvm {
Expand Down Expand Up @@ -67,44 +69,14 @@ class GraphExecutorDebug : public GraphExecutor {
time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
}
} else {
for (int i = 0; i < repeat; ++i) {
std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
tbegin, tend;
double duration_ms = 0.0;
do {
std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
if (duration_ms > 0.0) {
number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}
tbegin = std::chrono::high_resolution_clock::now();
std::vector<std::vector<Timer>> op_timers;
for (size_t index = 0; index < op_execs_.size(); index++) {
op_timers.push_back({});
}
for (int k = 0; k < number; k++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
if (op_execs_[index]) {
op_timers[index].push_back(RunOpHost(index));
}
}
}
for (size_t index = 0; index < op_execs_.size(); ++index) {
for (auto t : op_timers[index]) {
time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
}
}
tend = std::chrono::high_resolution_clock::now();
duration_ms =
std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
1000;
} while (duration_ms < min_repeat_ms);

LOG(INFO) << "Iteration: " << i;
int op = 0;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
time_sec_per_op[index] = results[cur_repeat];

LOG(INFO) << "Iteration: " << cur_repeat;
int op = 0;
if (op_execs_[index]) {
time_sec_per_op[index] /= number;
LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
<< time_sec_per_op[index] * 1e6 << " us/iter";
}
Expand All @@ -114,15 +86,50 @@ class GraphExecutorDebug : public GraphExecutor {

std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
os << time_sec_per_op[index] << ",";
double time = time_sec_per_op[index];
// To have good behavior when calculating total time, etc.
if (std::isnan(time)) {
time = 0;
}
os << time << ",";
}
return os.str();
}

std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
  // Benchmark one node of the graph on the local device and return one
  // averaged time (in seconds) per `repeat`.
  std::string tkey = module_->type_key();

  // One entry per repeat; stays all-zero for nodes with no packed function.
  std::vector<double> results_in_seconds(repeat, 0);

  if (tkey == "rpc") {
    LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
  }

  if (!op_execs_[node_index]) {
    // Nodes such as inputs/parameters have nothing to execute; report zeros.
    return results_in_seconds;
  }

  // assume host runs things which is first device
  Device& d = devices_[0];
  PackedFunc time_evaluator = WrapTimeEvaluator(
      TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
      repeat, min_repeat_ms);
  // The evaluator returns its per-repeat averages packed as raw doubles
  // inside a string.
  std::string result = time_evaluator();
  // Use memcpy rather than reinterpret_cast-ing result.data(): the string
  // buffer is not guaranteed to be suitably aligned for double access.
  // Clamp the copy count so an oversized payload cannot overflow the vector.
  size_t num_results = std::min(results_in_seconds.size(), result.size() / sizeof(double));
  std::memcpy(results_in_seconds.data(), result.data(), num_results * sizeof(double));
  return results_in_seconds;
}

double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
// represent inputs/parameters to the graph. Other types may be supported in the future, but
// consideration would be needed as to how to do that over RPC before we support it here.
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
// which represent inputs/parameters to the graph. Other types may be supported in the
// future, but consideration would be needed as to how to do that over RPC before we support
// it here.
if (nodes_[index].op_type != "tvm_op") {
CHECK_EQ(nodes_[index].op_type, "null")
<< "Don't know how to run op type " << nodes_[index].op_type
Expand Down Expand Up @@ -362,6 +369,30 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
} else if (name == "run_individual_node") {
return TypedPackedFunc<std::string(int, int, int, int)>(
[sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) {
ICHECK_GE(node_index, 0);
ICHECK_LT(node_index, nodes_.size());
ICHECK_GT(number, 0);
ICHECK_GT(repeat, 0);
ICHECK_GE(min_repeat_ms, 0);
std::vector<double> results =
this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);

// Have problems returning FloatImm so serialize to string results as hack.
std::stringstream s;

// use maximum precision available and use fixed representation
s << std::fixed;
s.precision(std::numeric_limits<double>::max_digits10);

for (double cur : results) {
s << cur << ", ";
}

return s.str();
});
} else if (name == "profile") {
return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
[sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
Expand Down
114 changes: 92 additions & 22 deletions tests/python/unittest/test_runtime_graph_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,56 @@
import re
import sys
import time
from distutils.log import debug

import numpy as np
import pytest

import tvm
import tvm.testing
from tvm import te
import numpy as np
from tvm import rpc
from tvm import rpc, te
from tvm._ffi.base import TVMError
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor


@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
def test_graph_simple():
n = 4
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
s = te.create_schedule(B.op)
# Constants for creating simple graphs, fixtures to avoid free globals
@pytest.fixture
def n():
    """Length of the 1-D test tensors."""
    return 4


@pytest.fixture
def A(n):
    """Input placeholder of shape (n,)."""
    placeholder = te.placeholder((n,), name="A")
    return placeholder


@pytest.fixture
def B(A):
    """Compute op producing A + 1.0 elementwise."""
    add_one = lambda *idx: A(*idx) + 1.0
    return te.compute(A.shape, add_one, name="B")


@pytest.fixture
def s(B):
    """Default schedule for the B computation."""
    schedule = te.create_schedule(B.op)
    return schedule


@pytest.fixture
def mlib(s, A, B):
    """LLVM module built from the schedule, exposing the `myadd` kernel."""
    built = tvm.build(s, [A, B], "llvm", name="myadd")
    return built


@pytest.fixture
def myadd(mlib):
    """Callable wrapping the compiled `myadd` kernel; sleeps 0.25 s after
    each invocation."""

    def _myadd(*args):
        result = mlib["myadd"](*args)
        time.sleep(0.25)
        return result

    return _myadd


@pytest.fixture
def graph():
node0 = {"op": "null", "name": "x", "inputs": []}
node1 = {
"op": "tvm_op",
Expand All @@ -64,21 +94,19 @@ def test_graph_simple():
"attrs": attrs,
}
graph = json.dumps(graph)
return graph

def check_verify():
mlib = tvm.build(s, [A, B], "llvm", name="myadd")

def myadd(*args):
to_return = mlib["myadd"](*args)
time.sleep(0.25)
return to_return

@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
@pytest.mark.skipif(
tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_end_to_end_graph_simple(graph, n, A, B, s, myadd):
def check_verify():
mlib_proxy = tvm.support.FrontendTestModule()
mlib_proxy["myadd"] = myadd
try:
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
except ValueError:
return
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

a = np.random.uniform(size=(n,)).astype(A.dtype)
mod.set_input(x=a)
Expand Down Expand Up @@ -185,5 +213,47 @@ def check_remote(server):
check_remote(rpc.Server("127.0.0.1"))


@tvm.testing.requires_llvm
@pytest.mark.skipif(
    tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_run_single_node(graph, n, A, myadd):
    """Exercise `run_individual_node` timing semantics on a two-node graph
    (one input param, one `myadd` op)."""
    mlib_proxy = tvm.support.FrontendTestModule()
    mlib_proxy["myadd"] = myadd
    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    # Node 0 is the input parameter; node 1 is the actual op.
    assert len(mod.debug_datum.get_graph_nodes()) == 2
    assert mod.debug_datum.get_graph_nodes()[0]["op"] == "param"
    assert mod.debug_datum.get_graph_nodes()[1]["op"] == "myadd"

    # Running a node with no associated function should return instantly and have 0 runtime
    assert mod.run_individual_node(0, number=1).mean == 0

    # Meanwhile the actual function should take some time, more time if you run it more times
    repeat_1_result = mod.run_individual_node(1, repeat=1)
    assert repeat_1_result.mean > 0

    # Running multiple times (3) should take longer than 1 time
    repeat_3_results = mod.run_individual_node(1, repeat=3)
    assert sum(repeat_3_results.results) > sum(repeat_1_result.results)

    # Increasing the number of repeats should give you the number of results asked for
    assert len(mod.run_individual_node(1, repeat=10).results) == 10

    # Doing repeat_ms should have the run time greater than the asked amount
    start = time.time()
    mod.run_individual_node(1, min_repeat_ms=500)
    end = time.time()
    elapsed_time_in_seconds = end - start
    assert elapsed_time_in_seconds >= 0.5

    # Going out of bounds of node index throws a tvm error
    with pytest.raises(TVMError):
        mod.run_individual_node(2)


if __name__ == "__main__":
    # Run this file's tests, forwarding any extra CLI arguments to pytest
    # and propagating its exit status.
    pytest_args = [__file__, *sys.argv[1:]]
    sys.exit(pytest.main(pytest_args))