From 85fd26cc11cf3f06276869727fb358016b0056f0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 1 Jan 2019 10:59:56 -0800 Subject: [PATCH] [RUNTIME] Add min_repeat_ms to time_evaluator (#2200) --- python/tvm/autotvm/measure/measure.py | 6 +- python/tvm/autotvm/measure/measure_methods.py | 109 ++++++++---------- python/tvm/module.py | 28 +++-- src/runtime/rpc/rpc_module.cc | 9 +- src/runtime/rpc/rpc_session.cc | 46 ++++++-- src/runtime/rpc/rpc_session.h | 42 +++++-- tests/python/unittest/test_autotvm_measure.py | 22 +--- tests/python/unittest/test_runtime_measure.py | 48 ++++++++ 8 files changed, 194 insertions(+), 116 deletions(-) create mode 100644 tests/python/unittest/test_runtime_measure.py diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 8a8940817237..3afebfa5c9e3 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -187,8 +187,10 @@ def measure_option(builder, runner): Note ---- To make measurement results accurate, you should pick the correct value for the argument - `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`, - so it is recommended. The typical value for NVIDIA GPU is 100 ms. + `number` and `repeat` in Runner(). Some devices need a certain minimum running time to + "warm up," such as GPUs that need time to reach a performance power state. + Using `min_repeat_ms` can dynamically adjusts `number`, so it is recommended. + The typical value for NVIDIA GPU is 150 ms. """ from .measure_methods import LocalBuilder, LocalRunner diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index ff93704edb44..f1e884d8df14 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -140,20 +140,22 @@ class RPCRunner(Runner): The host address of RPC Tracker port: int The port of RPC Tracker - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call these runs as one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - min_repeat_ms : float, optional - Minimum duration of a timer measurement in milliseconds. - When the run time of a measurement trial falls below this time, the - `number` parameter will be automatically increased. - Set this to improve the accuracy of perf measurement, e.g., when timers - are not precise enough to capture short-running tasks. This parameter is - also critical when devices need a certain minimum running time to "warm - up," such as GPUs that need time to reach a performance power state. + where the first "1" is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. check_correctness: bool, optional @@ -177,7 +179,6 @@ def __init__(self, self.number = number self.repeat = repeat self.min_repeat_ms = min_repeat_ms - self.cur_number = number self.ref_input = None self.ref_output = None @@ -188,7 +189,6 @@ def __init__(self, def set_task(self, task): self.task = task - self.cur_number = self.number if check_remote(task.target, self.key, self.host, self.port): logger.info("Get devices for measurement successfully!") @@ -240,8 +240,9 @@ def run(self, measure_inputs, build_results): ret = self.executor.submit(run_through_rpc, measure_inp, build_res, - self.cur_number, + self.number, self.repeat, + self.min_repeat_ms, self.cooldown_interval, remote_args, self.ref_input, @@ -256,32 +257,6 @@ def run(self, measure_inputs, build_results): else: results.append(res) - # If some runs were too fast, do remeasure for them - # to meet the requirement of `min_repeat_ms` - remeasure = np.zeros((len(measure_inputs),), dtype=np.bool) - pre_number = next_number = self.cur_number - min_repeat_duration = self.min_repeat_ms / 1000.0 - for i, res in enumerate(results): - if res.error_no == MeasureErrorNo.NO_ERROR: - if np.mean(res.costs) * pre_number <= min_repeat_duration: - next_number = max(next_number, - int(np.ceil(min_repeat_duration / np.mean(res.costs)))) - remeasure[i] = True - - if pre_number != next_number: - self.cur_number = next_number - msg = "increasing number to %d" % self.cur_number - logger.info(msg) - - re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]] - re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]] - re_res = self.run(re_measure_inputs, re_build_results) - ct = 0 - for i, rerun in enumerate(remeasure): - if rerun: - results[i] = re_res[ct] - ct += 1 - return results class LocalRunner(RPCRunner): @@ -291,21 +266,22 @@ class LocalRunner(RPCRunner): ---------- timeout: float The timeout of a compilation - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call these runs as one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - each of which is the average of `number` test run. - min_repeat_ms : float, optional - Minimum duration of a timer measurement in milliseconds. - When the run time of a measurement trial falls below this time, the - `number` parameter will be automatically increased. - Set this to improve the accuracy of perf measurement, e.g., when timers - are not precise enough to capture short-running tasks. This parameter is - also critical when devices need a certain minimum running time to "warm - up," such as GPUs that need time to reach a performance power state. + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. check_correctness: bool, optional @@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs): def run_through_rpc(measure_input, build_result, - number, repeat, cooldown_interval, + number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): """Run a generated library through rpc @@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result, The raw measure input build_result: BuildResult The result returned from Builder. This contains the path to the generated library. - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call these runs as one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - each of which is the average of `number` test run. + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float The cool down interval between two measurements remote_args: Tuple @@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result, func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) time_f = func.time_evaluator( - func.entry_name, ctx, number=number, repeat=repeat) + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) # set input if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: # create empty arrays on the remote device and copy them once. - # This can avoid some memory issues that make the measurment results unreliable. + # This can avoid some memory issues that make the measurement results unreliable. args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() diff --git a/python/tvm/module.py b/python/tvm/module.py index ad2ac3d5f76c..d33a947dd7ed 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -145,7 +145,7 @@ def export_library(self, kwargs.update({'options': ["-I" + path for path in find_include_path()]}) fcompile(file_name, files, **kwargs) - def time_evaluator(self, func_name, ctx, number, repeat=1): + def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): """Get an evaluator that measures time cost of running function. Parameters @@ -157,26 +157,38 @@ def time_evaluator(self, func_name, ctx, number, repeat=1): The context we should run this function on. number: int - The number of steps used in measuring each time interval + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. repeat: int, optional - Number of times to run the timer measurement - If repeat equals 3, then we will get 3 numbers in the ProfileResult. + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. Note ---- - The function will be invoked repeat * number + 1 times, + The function will be invoked (1 + number x repeat) times, with the first call discarded in case there is lazy initialization. Returns ------- ftimer : Function - The function that takes same argument as func - and return a float representing seconds per function call. + The function that takes same argument as func and returns a ProfileResult. + The ProfileResult reports `repeat` time costs in seconds. """ try: feval = _RPCTimeEvaluator( - self, func_name, ctx.device_type, ctx.device_id, number, repeat) + self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms) def evaluator(*args): """Internal wrapped evaluator.""" diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 80a8cc93ce19..d5528617477f 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode { PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, - int repeat) { + int repeat, + int min_repeat_ms) { RPCFuncHandle handle = GetFuncHandle(name); if (handle == nullptr) return PackedFunc(); - handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat); + handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms); return WrapRemote(handle); } @@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator") ctx.device_id = args[3]; if (tkey == "rpc") { *rv = static_cast(m.operator->()) - ->GetTimeEvaluator(args[1], ctx, args[4], args[5]); + ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]); } else { *rv = WrapTimeEvaluator( - m.GetFunction(args[1], false), ctx, args[4], args[5]); + m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]); } }); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 208944a69dce..920f86128d11 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "rpc_session.h" #include "../../common/ring_buffer.h" @@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from, } RPCFuncHandle RPCSession::GetTimeEvaluator( - RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) { + RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) { return this->CallRemote( - RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat); + RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms); } // Event handler functions @@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) { void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { PackedFunc *pf = static_cast(args[0].operator void*()); - void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); + void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4])); delete pf; *rv = fhandle; } @@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() { CHECK_EQ(state_, kRecvCode); } -PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) { - auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) { +PackedFunc WrapTimeEvaluator(PackedFunc pf, + TVMContext ctx, + int number, + int repeat, + int min_repeat_ms) { + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + for (int i = 0; i < repeat; ++i) { - // start timing - auto tbegin = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < number; ++i) { - pf.CallPacked(args, &temp); - } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - auto tend = std::chrono::high_resolution_clock::now(); + std::chrono::time_point tbegin, tend; + double duration_ms = 0.0; + + do { + if (duration_ms > 0.0) { + number = static_cast( + std::max((min_repeat_ms / (duration_ms / number) + 1), + number * 1.618)); // 1.618 is chosen by random + } + + tbegin = std::chrono::high_resolution_clock::now(); + // start timing + for (int i = 0; i < number; ++i) { + pf.CallPacked(args, &temp); + } + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + tend = std::chrono::high_resolution_clock::now(); + + duration_ms = std::chrono::duration_cast > + (tend - tbegin).count() * 1000; + } while (duration_ms < min_repeat_ms); + double speed = std::chrono::duration_cast >( tend - tbegin).count() / number; os.write(reinterpret_cast(&speed), sizeof(speed)); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4b736de0e041..e2ee43992f7b 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -151,14 +151,26 @@ class RPCSession { * * \param fhandle The function handle. * \param ctx The ctx to run measurement on. - * \param number How many steps to run in each time evaluation - * \param repeat How many times to repeat the timer + * \param number The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. + * \param repeat The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, + the `number` parameter will be automatically increased. * \return A remote timer function */ RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle, TVMContext ctx, int number, - int repeat); + int repeat, + int min_repeat_ms); /*! * \brief Call a remote defined system function with arguments. * \param fcode The function code. @@ -221,13 +233,29 @@ class RPCSession { }; /*! - * \brief Wrap a timer function for a given packed function. + * \brief Wrap a timer function to measure the time cost of a given packed function. * \param f The function argument. * \param ctx The context. - * \param number Number of steps in the inner iteration - * \param repeat How many steps to repeat the time evaluation. + * \param number The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. + * \param repeat The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, + the `number` parameter will be automatically increased. + * \return f_timer A timer function. */ -PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat); +PackedFunc WrapTimeEvaluator(PackedFunc f, + TVMContext ctx, + int number, + int repeat, + int min_repeat_ms); /*! * \brief Create a Global RPC module that refers to the session. diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index e29cc2c51658..e540a11f241b 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -69,29 +69,9 @@ def _callback_wrong(tuner, measure_inputs, measure_results): callbacks=[_callback_wrong]) -def test_min_repeat_ms(): - task, target = get_sample_task() - - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.LocalRunner(number=1, min_repeat_ms=100) - ) - - def _callback(tuner, measure_inputs, measure_results): - for inp, res in zip(measure_inputs, measure_results): - if res.error_no != 0: - continue - - assert 1000 * np.mean(res.costs) * \ - measure_option['runner'].cur_number >= 100 - - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune(n_trial=5, measure_option=measure_option, - callbacks=[_callback]) - if __name__ == '__main__': logging.basicConfig(level=logging.INFO) test_task_tuner_without_measurement() test_check_correctness() - test_min_repeat_ms() + diff --git a/tests/python/unittest/test_runtime_measure.py b/tests/python/unittest/test_runtime_measure.py new file mode 100644 index 000000000000..4513a268ac20 --- /dev/null +++ b/tests/python/unittest/test_runtime_measure.py @@ -0,0 +1,48 @@ +import time +import ctypes + +import tvm +from tvm.contrib.util import tempdir + + +def test_min_repeat_ms(): + tmp = tempdir() + filename = tmp.relpath("log") + + @tvm.register_func + def my_debug(filename): + """one call lasts for 100 ms and writes one character to a file""" + time.sleep(0.1) + filename = ctypes.c_char_p(filename.value).value + with open(filename, "a") as fout: + fout.write("c") + + X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename)) + s = tvm.create_schedule(X.op) + func = tvm.build(s, [X]) + + x = tvm.nd.empty((), dtype="int32") + ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), + number=1, repeat=1) + ftimer(x) + + with open(filename, "r") as fin: + ct = len(fin.readline()) + + assert ct == 2 + + + ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), + number=1, repeat=1, min_repeat_ms=1000) + ftimer(x) + + # make sure we get more than 10 calls + with open(filename, "r") as fin: + ct = len(fin.readline()) + + assert ct > 10 + 2 + + +if __name__ == "__main__": + test_min_repeat_ms() +