[RUNTIME] Add min_repeat_ms to time_evaluator (apache#2200)

FrozenGene · Jan 1, 2019 · 6f70b82 · 6f70b82
1 parent 52dec03
commit 6f70b82
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 116 deletions.
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
@@ -187,8 +187,10 @@ def measure_option(builder, runner):
     Note
     ----
     To make measurement results accurate, you should pick the correct value for the argument
-    `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
-    so it is recommended. The typical value for NVIDIA GPU is 100 ms.
+    `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
+    "warm up," such as GPUs that need time to reach a performance power state.
+    Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended.
+    The typical value for NVIDIA GPU is 150 ms.
     """
     from .measure_methods import LocalBuilder, LocalRunner
 

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
@@ -140,20 +140,22 @@ class RPCRunner(Runner):
         The host address of RPC Tracker
     port: int
         The port of RPC Tracker
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first "1" is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`,
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
     check_correctness: bool, optional
@@ -177,7 +179,6 @@ def __init__(self,
         self.number = number
         self.repeat = repeat
         self.min_repeat_ms = min_repeat_ms
-        self.cur_number = number
 
         self.ref_input = None
         self.ref_output = None
@@ -188,7 +189,6 @@ def __init__(self,
 
     def set_task(self, task):
         self.task = task
-        self.cur_number = self.number
 
         if check_remote(task.target, self.key, self.host, self.port):
             logger.info("Get devices for measurement successfully!")
@@ -240,8 +240,9 @@ def run(self, measure_inputs, build_results):
                 ret = self.executor.submit(run_through_rpc,
                                            measure_inp,
                                            build_res,
-                                           self.cur_number,
+                                           self.number,
                                            self.repeat,
+                                           self.min_repeat_ms,
                                            self.cooldown_interval,
                                            remote_args,
                                            self.ref_input,
@@ -256,32 +257,6 @@ def run(self, measure_inputs, build_results):
                 else:
                     results.append(res)
 
-        # If some runs were too fast, do remeasure for them
-        # to meet the requirement of `min_repeat_ms`
-        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
-        pre_number = next_number = self.cur_number
-        min_repeat_duration = self.min_repeat_ms / 1000.0
-        for i, res in enumerate(results):
-            if res.error_no == MeasureErrorNo.NO_ERROR:
-                if np.mean(res.costs) * pre_number <= min_repeat_duration:
-                    next_number = max(next_number,
-                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
-                    remeasure[i] = True
-
-        if pre_number != next_number:
-            self.cur_number = next_number
-            msg = "increasing number to %d" % self.cur_number
-            logger.info(msg)
-
-            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
-            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
-            re_res = self.run(re_measure_inputs, re_build_results)
-            ct = 0
-            for i, rerun in enumerate(remeasure):
-                if rerun:
-                    results[i] = re_res[ct]
-                    ct += 1
-
         return results
 
 class LocalRunner(RPCRunner):
@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
     ----------
     timeout: float
         The timeout of a compilation
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`,
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
     check_correctness: bool, optional
@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
 
 
 def run_through_rpc(measure_input, build_result,
-                    number, repeat, cooldown_interval,
+                    number, repeat, min_repeat_ms, cooldown_interval,
                     remote_args, ref_input=None, ref_output=None):
     """Run a generated library through rpc
 
@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
         The raw measure input
     build_result: BuildResult
         The result returned from Builder. This contains the path to the generated library.
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameter `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`,
+        i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float
         The cool down interval between two measurements
     remote_args: Tuple
@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
         func = remote.load_module(os.path.split(build_result.filename)[1])
         ctx = remote.context(str(measure_input.target), 0)
         time_f = func.time_evaluator(
-            func.entry_name, ctx, number=number, repeat=repeat)
+            func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)
 
         # set input
         if ref_input:
             args = [nd.array(x, ctx=ctx) for x in ref_input]
         else:
             # create empty arrays on the remote device and copy them once.
-            # This can avoid some memory issues that make the measurment results unreliable.
+            # This can avoid some memory issues that make the measurement results unreliable.
             args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
             args = [nd.array(x, ctx=ctx) for x in args]
             ctx.sync()

diff --git a/python/tvm/module.py b/python/tvm/module.py
@@ -127,7 +127,7 @@ def export_library(self,
             kwargs.update({'options': ["-I" + path for path in find_include_path()]})
         fcompile(file_name, files, **kwargs)
 
-    def time_evaluator(self, func_name, ctx, number, repeat=1):
+    def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
         """Get an evaluator that measures time cost of running function.
 
         Parameters
@@ -139,26 +139,38 @@ def time_evaluator(self, func_name, ctx, number, repeat=1):
             The context we should run this function on.
 
         number: int
-            The number of steps used in measuring each time interval
+            The number of times to run this function for taking average.
+            We call these runs one `repeat` of measurement.
 
         repeat: int, optional
-            Number of times to run the timer measurement
-            If repeat equals 3, then we will get 3 numbers in the ProfileResult.
+            The number of times to repeat the measurement.
+            In total, the function will be invoked (1 + number x repeat) times,
+            where the first one is warm up and will be discarded.
+            The returned result contains `repeat` costs,
+            each of which is an average of `number` costs.
+
+        min_repeat_ms: int, optional
+            The minimum duration of one `repeat` in milliseconds.
+            By default, one `repeat` contains `number` runs. If this parameter is set,
+            the parameter `number` will be dynamically adjusted to meet the
+            minimum duration requirement of one `repeat`,
+            i.e., when the run time of one `repeat` falls below this time, the `number` parameter
+            will be automatically increased.
 
         Note
         ----
-        The function will be invoked  repeat * number + 1 times,
+        The function will be invoked (1 + number x repeat) times,
         with the first call discarded in case there is lazy initialization.
 
         Returns
         -------
         ftimer : Function
-            The function that takes same argument as func
-            and return a float representing seconds per function call.
+            The function that takes same argument as func and returns a ProfileResult.
+            The ProfileResult reports `repeat` time costs in seconds.
         """
         try:
             feval = _RPCTimeEvaluator(
-                self, func_name, ctx.device_type, ctx.device_id, number, repeat)
+                self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)
 
             def evaluator(*args):
                 """Internal wrapped evaluator."""

diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
   PackedFunc GetTimeEvaluator(const std::string& name,
                               TVMContext ctx,
                               int number,
-                              int repeat) {
+                              int repeat,
+                              int min_repeat_ms) {
     RPCFuncHandle handle = GetFuncHandle(name);
     if (handle == nullptr) return PackedFunc();
-    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat);
+    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
     return WrapRemote(handle);
   }
 
@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
     ctx.device_id = args[3];
     if (tkey == "rpc") {
       *rv = static_cast<RPCModuleNode*>(m.operator->())
-          ->GetTimeEvaluator(args[1], ctx, args[4], args[5]);
+          ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
     } else {
       *rv = WrapTimeEvaluator(
-          m.GetFunction(args[1], false), ctx, args[4], args[5]);
+          m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
     }
   });
 

diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
@@ -13,6 +13,8 @@
 #include <chrono>
 #include <vector>
 #include <utility>
+#include <cmath>
+#include <algorithm>
 #include "rpc_session.h"
 #include "../../common/ring_buffer.h"
 
@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
 }
 
 RPCFuncHandle RPCSession::GetTimeEvaluator(
-    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) {
+    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
   return this->CallRemote(
-      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat);
+      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
 }
 
 // Event handler functions
@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
 
 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
   PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
-  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
+  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
   delete pf;
   *rv = fhandle;
 }
@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
   CHECK_EQ(state_, kRecvCode);
 }
 
-PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
-  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) {
+PackedFunc WrapTimeEvaluator(PackedFunc pf,
+                             TVMContext ctx,
+                             int number,
+                             int repeat,
+                             int min_repeat_ms) {
+  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
     // skip first time call, to activate lazy compilation components.
     pf.CallPacked(args, &temp);
     DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+
     for (int i = 0; i < repeat; ++i) {
-      // start timing
-      auto tbegin = std::chrono::high_resolution_clock::now();
-      for (int i = 0; i < number; ++i) {
-        pf.CallPacked(args, &temp);
-      }
-      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-      auto tend = std::chrono::high_resolution_clock::now();
+      std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
+      double duration_ms = 0.0;
+
+      do {
+        if (duration_ms > 0.0) {
+          number = static_cast<int>(
+              std::max((min_repeat_ms / (duration_ms / number) + 1),
+                       number * 1.618));   // 1.618 is chosen by random
+        }
+
+        tbegin = std::chrono::high_resolution_clock::now();
+        // start timing
+        for (int i = 0; i < number; ++i) {
+          pf.CallPacked(args, &temp);
+        }
+        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+        tend = std::chrono::high_resolution_clock::now();
+
+        duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
+            (tend - tbegin).count() * 1000;
+      } while (duration_ms < min_repeat_ms);
+
       double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
           tend - tbegin).count() / number;
       os.write(reinterpret_cast<char*>(&speed), sizeof(speed));