diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 8a89408172378..3afebfa5c9e30 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -187,8 +187,10 @@ def measure_option(builder, runner): Note ---- To make measurement results accurate, you should pick the correct value for the argument - `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`, - so it is recommended. The typical value for NVIDIA GPU is 100 ms. + `number` and `repeat` in Runner(). Some devices need a certain minimum running time to + "warm up," such as GPUs that need time to reach a performance power state. + Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended. + The typical value for NVIDIA GPU is 150 ms. """ from .measure_methods import LocalBuilder, LocalRunner diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index ff93704edb442..39509f8202d49 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -140,20 +140,22 @@ class RPCRunner(Runner): The host address of RPC Tracker port: int The port of RPC Tracker - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call this one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - min_repeat_ms : float, optional - Minimum duration of a timer measurement in milliseconds. - When the run time of a measurement trial falls below this time, the - `number` parameter will be automatically increased. 
- Set this to improve the accuracy of perf measurement, e.g., when timers - are not precise enough to capture short-running tasks. This parameter is - also critical when devices need a certain minimum running time to "warm - up," such as GPUs that need time to reach a performance power state. + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., When the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. check_correctness: bool, optional @@ -177,7 +179,6 @@ def __init__(self, self.number = number self.repeat = repeat self.min_repeat_ms = min_repeat_ms - self.cur_number = number self.ref_input = None self.ref_output = None @@ -188,7 +189,6 @@ def __init__(self, def set_task(self, task): self.task = task - self.cur_number = self.number if check_remote(task.target, self.key, self.host, self.port): logger.info("Get devices for measurement successfully!") @@ -240,8 +240,9 @@ def run(self, measure_inputs, build_results): ret = self.executor.submit(run_through_rpc, measure_inp, build_res, - self.cur_number, + self.number, self.repeat, + self.min_repeat_ms, self.cooldown_interval, remote_args, self.ref_input, @@ -256,32 +257,6 @@ def run(self, measure_inputs, build_results): else: results.append(res) - # If some runs were too fast, do remeasure for them - # to meet the requirement of `min_repeat_ms` - remeasure = np.zeros((len(measure_inputs),), dtype=np.bool) - pre_number = next_number = self.cur_number - min_repeat_duration = self.min_repeat_ms / 1000.0 - 
for i, res in enumerate(results): - if res.error_no == MeasureErrorNo.NO_ERROR: - if np.mean(res.costs) * pre_number <= min_repeat_duration: - next_number = max(next_number, - int(np.ceil(min_repeat_duration / np.mean(res.costs)))) - remeasure[i] = True - - if pre_number != next_number: - self.cur_number = next_number - msg = "increasing number to %d" % self.cur_number - logger.info(msg) - - re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]] - re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]] - re_res = self.run(re_measure_inputs, re_build_results) - ct = 0 - for i, rerun in enumerate(remeasure): - if rerun: - results[i] = re_res[ct] - ct += 1 - return results class LocalRunner(RPCRunner): @@ -291,21 +266,22 @@ class LocalRunner(RPCRunner): ---------- timeout: float The timeout of a compilation - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call this as one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - each of which is the average of `number` test run. - min_repeat_ms : float, optional - Minimum duration of a timer measurement in milliseconds. - When the run time of a measurement trial falls below this time, the - `number` parameter will be automatically increased. - Set this to improve the accuracy of perf measurement, e.g., when timers - are not precise enough to capture short-running tasks. This parameter is - also critical when devices need a certain minimum running time to "warm - up," such as GPUs that need time to reach a performance power state. + where the first one is warm up and will be discarded. 
+ The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameter `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., when the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. check_correctness: bool, optional @@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs): def run_through_rpc(measure_input, build_result, - number, repeat, cooldown_interval, + number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): """Run a generated library through rpc @@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result, The raw measure input build_result: BuildResult The result returned from Builder. This contains the path to the generated library. - number : int, optional - Number of times to do measurement for tasking average + number: int + The number of times to run the generated code for taking average. + We call this one `repeat` of measurement. repeat : int, optional - Number of times to repeat the measurement. + The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, - where the first one is warm up. The returned result contains `repeat` costs, - each of which is the average of `number` test run. + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. 
If this parameter is set, + the parameter `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., when the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. cooldown_interval: float The cool down interval between two measurements remote_args: Tuple @@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result, func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) time_f = func.time_evaluator( - func.entry_name, ctx, number=number, repeat=repeat) + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) # set input if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: # create empty arrays on the remote device and copy them once. - # This can avoid some memory issues that make the measurment results unreliable. + # This can avoid some memory issues that make the measurement results unreliable. args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() diff --git a/python/tvm/module.py b/python/tvm/module.py index 79a1fab455708..18a990e4a77b2 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -127,7 +127,7 @@ def export_library(self, kwargs.update({'options': ["-I" + path for path in find_include_path()]}) fcompile(file_name, files, **kwargs) - def time_evaluator(self, func_name, ctx, number, repeat=1): + def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): """Get an evaluator that measures time cost of running function. Parameters @@ -139,26 +139,38 @@ def time_evaluator(self, func_name, ctx, number, repeat=1): The context we should run this function on. number: int - The number of steps used in measuring each time interval + The number of times to run this function for taking average. + We call this one `repeat` of measurement. 
repeat: int, optional - Number of times to run the timer measurement - If repeat equals 3, then we will get 3 numbers in the ProfileResult. + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + + min_repeat_ms: int, optional + The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameter `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., when the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. Note ---- - The function will be invoked repeat * number + 1 times, + The function will be invoked (1 + number x repeat) times, with the first call discarded in case there is lazy initialization. Returns ------- ftimer : Function - The function that takes same argument as func - and return a float representing seconds per function call. + The function that takes the same arguments as func and returns a ProfileResult. + The ProfileResult reports `repeat` time costs in seconds. 
""" try: feval = _RPCTimeEvaluator( - self, func_name, ctx.device_type, ctx.device_id, number, repeat) + self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms) def evaluator(*args): """Internal wrapped evaluator.""" diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 80a8cc93ce194..2bfded9727cbd 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -124,7 +124,8 @@ class RPCModuleNode final : public ModuleNode { PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, - int repeat) { + int repeat, + int min_repeat_ms) { RPCFuncHandle handle = GetFuncHandle(name); if (handle == nullptr) return PackedFunc(); handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat); @@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator") ctx.device_id = args[3]; if (tkey == "rpc") { *rv = static_cast(m.operator->()) - ->GetTimeEvaluator(args[1], ctx, args[4], args[5]); + ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]); } else { *rv = WrapTimeEvaluator( - m.GetFunction(args[1], false), ctx, args[4], args[5]); + m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]); } }); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 208944a69dceb..3f6b28e0c5371 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "rpc_session.h" #include "../../common/ring_buffer.h" @@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) { void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { PackedFunc *pf = static_cast(args[0].operator void*()); - void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); + void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4])); delete pf; *rv = fhandle; } @@ -1190,21 +1192,39 @@ void 
RPCSession::EventHandler::HandlePackedCall() { CHECK_EQ(state_, kRecvCode); } -PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) { - auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) { +PackedFunc WrapTimeEvaluator(PackedFunc pf, + TVMContext ctx, + int number, + int repeat, + int min_repeat_ms) { + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { /* by-value capture: the timer outlives the `number` parameter */ TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); for (int i = 0; i < repeat; ++i) { - // start timing - auto tbegin = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < number; ++i) { - pf.CallPacked(args, &temp); - } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - auto tend = std::chrono::high_resolution_clock::now(); + std::chrono::high_resolution_clock::time_point tbegin, tend; + double duration_ms; + /* Re-measure with a larger `number` until one repeat lasts at least min_repeat_ms. */ + do { + tbegin = std::chrono::high_resolution_clock::now(); + // start timing + for (int i = 0; i < number; ++i) { + pf.CallPacked(args, &temp); + } + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + tend = std::chrono::high_resolution_clock::now(); + /* duration<double> counts seconds; multiply by 1000 to get milliseconds. */ + duration_ms = std::chrono::duration_cast<std::chrono::duration<double> > + (tend - tbegin).count() * 1000; + /* Too short: estimate `number` from the measured per-run time, growing it by at least 1.618x. */ + if (duration_ms < min_repeat_ms) { + number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1), + number * 1.618)); + } + } while (duration_ms < min_repeat_ms); + double speed = std::chrono::duration_cast<std::chrono::duration<double> >( tend - tbegin).count() / number; os.write(reinterpret_cast<char*>(&speed), sizeof(speed)); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4b736de0e0418..63a9cbd66d746 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -221,13 +221,29 @@ class RPCSession { }; /*! - * \brief Wrap a timer function for a given packed function. 
+ * \brief Wrap a timer function to measure the time cost of a given packed function. * \param f The function argument. * \param ctx The context. - * \param number Number of steps in the inner iteration - * \param repeat How many steps to repeat the time evaluation. + * \param number The number of times to run this function for taking average. + We call this one `repeat` of measurement. + * \param repeat The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameter `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + i.e., when the run time of one `repeat` falls below this time, the `number` parameter + will be automatically increased. + * \return f_timer A timer function. */ -PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat); +PackedFunc WrapTimeEvaluator(PackedFunc f, + TVMContext ctx, + int number, + int repeat, + int min_repeat_ms); /*! * \brief Create a Global RPC module that refers to the session.