Skip to content

Commit

Permalink
[RUNTIME] Add min_repeat_ms to time_evaluator (apache#2200)
Browse files Browse the repository at this point in the history
  • Loading branch information
merrymercy authored and tqchen committed Jan 1, 2019
1 parent 52dec03 commit 6f70b82
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 116 deletions.
6 changes: 4 additions & 2 deletions python/tvm/autotvm/measure/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,10 @@ def measure_option(builder, runner):
Note
----
To make measurement results accurate, you should pick the correct value for the argument
`number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
so it is recommended. The typical value for NVIDIA GPU is 100 ms.
`number` and `repeat` in Runner(). Some devices need a certain minimum running time to
"warm up," such as GPUs that need time to reach a performance power state.
Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended.
The typical value for NVIDIA GPU is 150 ms.
"""
from .measure_methods import LocalBuilder, LocalRunner

Expand Down
109 changes: 47 additions & 62 deletions python/tvm/autotvm/measure/measure_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,20 +140,22 @@ class RPCRunner(Runner):
The host address of RPC Tracker
port: int
The port of RPC Tracker
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
min_repeat_ms : float, optional
Minimum duration of a timer measurement in milliseconds.
When the run time of a measurement trial falls below this time, the
`number` parameter will be automatically increased.
Set this to improve the accuracy of perf measurement, e.g., when timers
are not precise enough to capture short-running tasks. This parameter is
also critical when devices need a certain minimum running time to "warm
up," such as GPUs that need time to reach a performance power state.
where the first "1" is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`,
i.e., when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float, optional
The cool down interval between two measurements.
check_correctness: bool, optional
Expand All @@ -177,7 +179,6 @@ def __init__(self,
self.number = number
self.repeat = repeat
self.min_repeat_ms = min_repeat_ms
self.cur_number = number

self.ref_input = None
self.ref_output = None
Expand All @@ -188,7 +189,6 @@ def __init__(self,

def set_task(self, task):
self.task = task
self.cur_number = self.number

if check_remote(task.target, self.key, self.host, self.port):
logger.info("Get devices for measurement successfully!")
Expand Down Expand Up @@ -240,8 +240,9 @@ def run(self, measure_inputs, build_results):
ret = self.executor.submit(run_through_rpc,
measure_inp,
build_res,
self.cur_number,
self.number,
self.repeat,
self.min_repeat_ms,
self.cooldown_interval,
remote_args,
self.ref_input,
Expand All @@ -256,32 +257,6 @@ def run(self, measure_inputs, build_results):
else:
results.append(res)

# If some runs were too fast, do remeasure for them
# to meet the requirement of `min_repeat_ms`
remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
pre_number = next_number = self.cur_number
min_repeat_duration = self.min_repeat_ms / 1000.0
for i, res in enumerate(results):
if res.error_no == MeasureErrorNo.NO_ERROR:
if np.mean(res.costs) * pre_number <= min_repeat_duration:
next_number = max(next_number,
int(np.ceil(min_repeat_duration / np.mean(res.costs))))
remeasure[i] = True

if pre_number != next_number:
self.cur_number = next_number
msg = "increasing number to %d" % self.cur_number
logger.info(msg)

re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
re_res = self.run(re_measure_inputs, re_build_results)
ct = 0
for i, rerun in enumerate(remeasure):
if rerun:
results[i] = re_res[ct]
ct += 1

return results

class LocalRunner(RPCRunner):
Expand All @@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
----------
timeout: float
The timeout of a compilation
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
min_repeat_ms : float, optional
Minimum duration of a timer measurement in milliseconds.
When the run time of a measurement trial falls below this time, the
`number` parameter will be automatically increased.
Set this to improve the accuracy of perf measurement, e.g., when timers
are not precise enough to capture short-running tasks. This parameter is
also critical when devices need a certain minimum running time to "warm
up," such as GPUs that need time to reach a performance power state.
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`,
i.e., when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float, optional
The cool down interval between two measurements.
check_correctness: bool, optional
Expand Down Expand Up @@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):


def run_through_rpc(measure_input, build_result,
number, repeat, cooldown_interval,
number, repeat, min_repeat_ms, cooldown_interval,
remote_args, ref_input=None, ref_output=None):
"""Run a generated library through rpc
Expand All @@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
The raw measure input
build_result: BuildResult
The result returned from Builder. This contains the path to the generated library.
number : int, optional
Number of times to do measurement for tasking average
number: int
The number of times to run the generated code for taking average.
We call these runs one `repeat` of measurement.
repeat : int, optional
Number of times to repeat the measurement.
The number of times to repeat the measurement.
In total, the generated code will be run (1 + number x repeat) times,
where the first one is warm up. The returned result contains `repeat` costs,
each of which is the average of `number` test run.
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`,
i.e., when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
cooldown_interval: float
The cool down interval between two measurements
remote_args: Tuple
Expand All @@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
func = remote.load_module(os.path.split(build_result.filename)[1])
ctx = remote.context(str(measure_input.target), 0)
time_f = func.time_evaluator(
func.entry_name, ctx, number=number, repeat=repeat)
func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)

# set input
if ref_input:
args = [nd.array(x, ctx=ctx) for x in ref_input]
else:
# create empty arrays on the remote device and copy them once.
# This can avoid some memory issues that make the measurment results unreliable.
# This can avoid some memory issues that make the measurement results unreliable.
args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
args = [nd.array(x, ctx=ctx) for x in args]
ctx.sync()
Expand Down
28 changes: 20 additions & 8 deletions python/tvm/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def export_library(self,
kwargs.update({'options': ["-I" + path for path in find_include_path()]})
fcompile(file_name, files, **kwargs)

def time_evaluator(self, func_name, ctx, number, repeat=1):
def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
"""Get an evaluator that measures time cost of running function.
Parameters
Expand All @@ -139,26 +139,38 @@ def time_evaluator(self, func_name, ctx, number, repeat=1):
The context we should run this function on.
number: int
The number of steps used in measuring each time interval
The number of times to run this function for taking average.
We call these runs one `repeat` of measurement.
repeat: int, optional
Number of times to run the timer measurement
If repeat equals 3, then we will get 3 numbers in the ProfileResult.
The number of times to repeat the measurement.
In total, the function will be invoked (1 + number x repeat) times,
where the first one is warm up and will be discarded.
The returned result contains `repeat` costs,
each of which is an average of `number` costs.
min_repeat_ms: int, optional
The minimum duration of one `repeat` in milliseconds.
By default, one `repeat` contains `number` runs. If this parameter is set,
the parameter `number` will be dynamically adjusted to meet the
minimum duration requirement of one `repeat`,
i.e., when the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
Note
----
The function will be invoked repeat * number + 1 times,
The function will be invoked (1 + number x repeat) times,
with the first call discarded in case there is lazy initialization.
Returns
-------
ftimer : Function
The function that takes same argument as func
and return a float representing seconds per function call.
The function that takes the same arguments as func and returns a ProfileResult.
The ProfileResult reports `repeat` time costs in seconds.
"""
try:
feval = _RPCTimeEvaluator(
self, func_name, ctx.device_type, ctx.device_id, number, repeat)
self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)

def evaluator(*args):
"""Internal wrapped evaluator."""
Expand Down
9 changes: 5 additions & 4 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
PackedFunc GetTimeEvaluator(const std::string& name,
TVMContext ctx,
int number,
int repeat) {
int repeat,
int min_repeat_ms) {
RPCFuncHandle handle = GetFuncHandle(name);
if (handle == nullptr) return PackedFunc();
handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat);
handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
return WrapRemote(handle);
}

Expand Down Expand Up @@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
ctx.device_id = args[3];
if (tkey == "rpc") {
*rv = static_cast<RPCModuleNode*>(m.operator->())
->GetTimeEvaluator(args[1], ctx, args[4], args[5]);
->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
} else {
*rv = WrapTimeEvaluator(
m.GetFunction(args[1], false), ctx, args[4], args[5]);
m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
}
});

Expand Down
46 changes: 34 additions & 12 deletions src/runtime/rpc/rpc_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include <chrono>
#include <vector>
#include <utility>
#include <cmath>
#include <algorithm>
#include "rpc_session.h"
#include "../../common/ring_buffer.h"

Expand Down Expand Up @@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
}

RPCFuncHandle RPCSession::GetTimeEvaluator(
RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) {
RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
return this->CallRemote(
RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat);
RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
}

// Event handler functions
Expand Down Expand Up @@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {

void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
delete pf;
*rv = fhandle;
}
Expand Down Expand Up @@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
CHECK_EQ(state_, kRecvCode);
}

PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) {
PackedFunc WrapTimeEvaluator(PackedFunc pf,
TVMContext ctx,
int number,
int repeat,
int min_repeat_ms) {
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);

for (int i = 0; i < repeat; ++i) {
// start timing
auto tbegin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp);
}
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
auto tend = std::chrono::high_resolution_clock::now();
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
double duration_ms = 0.0;

do {
if (duration_ms > 0.0) {
number = static_cast<int>(
std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}

tbegin = std::chrono::high_resolution_clock::now();
// start timing
for (int i = 0; i < number; ++i) {
pf.CallPacked(args, &temp);
}
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
tend = std::chrono::high_resolution_clock::now();

duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
(tend - tbegin).count() * 1000;
} while (duration_ms < min_repeat_ms);

double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
tend - tbegin).count() / number;
os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
Expand Down
Loading

0 comments on commit 6f70b82

Please sign in to comment.