From 85fd26cc11cf3f06276869727fb358016b0056f0 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 1 Jan 2019 10:59:56 -0800
Subject: [PATCH] [RUNTIME] Add min_repeat_ms to time_evaluator (#2200)

---
 python/tvm/autotvm/measure/measure.py         |   6 +-
 python/tvm/autotvm/measure/measure_methods.py | 109 ++++++++----------
 python/tvm/module.py                          |  28 +++--
 src/runtime/rpc/rpc_module.cc                 |   9 +-
 src/runtime/rpc/rpc_session.cc                |  46 ++++++--
 src/runtime/rpc/rpc_session.h                 |  42 +++++--
 tests/python/unittest/test_autotvm_measure.py |  22 +---
 tests/python/unittest/test_runtime_measure.py |  48 ++++++++
 8 files changed, 194 insertions(+), 116 deletions(-)
 create mode 100644 tests/python/unittest/test_runtime_measure.py

diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 8a8940817237..3afebfa5c9e3 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -187,8 +187,10 @@ def measure_option(builder, runner):
     Note
     ----
     To make measurement results accurate, you should pick the correct value for the argument
-    `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
-    so it is recommended. The typical value for NVIDIA GPU is 100 ms.
+    `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
+    "warm up," such as GPUs that need time to reach a performance power state.
+    Using `min_repeat_ms` can dynamically adjusts `number`, so it is recommended.
+    The typical value for NVIDIA GPU is 150 ms.
     """
     from .measure_methods import LocalBuilder, LocalRunner
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index ff93704edb44..f1e884d8df14 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -140,20 +140,22 @@ class RPCRunner(Runner):
         The host address of RPC Tracker
     port: int
         The port of RPC Tracker
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs as one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first "1" is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameters `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
     check_correctness: bool, optional
@@ -177,7 +179,6 @@ def __init__(self,
         self.number = number
         self.repeat = repeat
         self.min_repeat_ms = min_repeat_ms
-        self.cur_number = number
 
         self.ref_input = None
         self.ref_output = None
@@ -188,7 +189,6 @@ def __init__(self,
 
     def set_task(self, task):
         self.task = task
-        self.cur_number = self.number
 
         if check_remote(task.target, self.key, self.host, self.port):
             logger.info("Get devices for measurement successfully!")
@@ -240,8 +240,9 @@ def run(self, measure_inputs, build_results):
                 ret = self.executor.submit(run_through_rpc,
                                            measure_inp,
                                            build_res,
-                                           self.cur_number,
+                                           self.number,
                                            self.repeat,
+                                           self.min_repeat_ms,
                                            self.cooldown_interval,
                                            remote_args,
                                            self.ref_input,
@@ -256,32 +257,6 @@ def run(self, measure_inputs, build_results):
                 else:
                     results.append(res)
 
-        # If some runs were too fast, do remeasure for them
-        # to meet the requirement of `min_repeat_ms`
-        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
-        pre_number = next_number = self.cur_number
-        min_repeat_duration = self.min_repeat_ms / 1000.0
-        for i, res in enumerate(results):
-            if res.error_no == MeasureErrorNo.NO_ERROR:
-                if np.mean(res.costs) * pre_number <= min_repeat_duration:
-                    next_number = max(next_number,
-                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
-                    remeasure[i] = True
-
-        if pre_number != next_number:
-            self.cur_number = next_number
-            msg = "increasing number to %d" % self.cur_number
-            logger.info(msg)
-
-            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
-            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
-            re_res = self.run(re_measure_inputs, re_build_results)
-            ct = 0
-            for i, rerun in enumerate(remeasure):
-                if rerun:
-                    results[i] = re_res[ct]
-                    ct += 1
-
         return results
 
 class LocalRunner(RPCRunner):
@@ -291,21 +266,22 @@ class LocalRunner(RPCRunner):
     ----------
     timeout: float
         The timeout of a compilation
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs as one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    min_repeat_ms : float, optional
-        Minimum duration of a timer measurement in milliseconds.
-        When the run time of a measurement trial falls below this time, the
-        `number` parameter will be automatically increased.
-        Set this to improve the accuracy of perf measurement, e.g., when timers
-        are not precise enough to capture short-running tasks. This parameter is
-        also critical when devices need a certain minimum running time to "warm
-        up," such as GPUs that need time to reach a performance power state.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameters `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
     check_correctness: bool, optional
@@ -416,7 +392,7 @@ def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
 
 
 def run_through_rpc(measure_input, build_result,
-                    number, repeat, cooldown_interval,
+                    number, repeat, min_repeat_ms, cooldown_interval,
                     remote_args, ref_input=None, ref_output=None):
     """Run a generated library through rpc
 
@@ -426,13 +402,22 @@ def run_through_rpc(measure_input, build_result,
         The raw measure input
     build_result: BuildResult
         The result returned from Builder. This contains the path to the generated library.
-    number : int, optional
-        Number of times to do measurement for tasking average
+    number: int
+        The number of times to run the generated code for taking average.
+        We call these runs as one `repeat` of measurement.
     repeat : int, optional
-        Number of times to repeat the measurement.
+        The number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
+        where the first one is warm up and will be discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
+    min_repeat_ms: int, optional
+        The minimum duration of one `repeat` in milliseconds.
+        By default, one `repeat` contains `number` runs. If this parameter is set,
+        the parameters `number` will be dynamically adjusted to meet the
+        minimum duration requirement of one `repeat`.
+        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
+        will be automatically increased.
     cooldown_interval: float
         The cool down interval between two measurements
     remote_args: Tuple
@@ -454,14 +439,14 @@ def run_through_rpc(measure_input, build_result,
         func = remote.load_module(os.path.split(build_result.filename)[1])
         ctx = remote.context(str(measure_input.target), 0)
         time_f = func.time_evaluator(
-            func.entry_name, ctx, number=number, repeat=repeat)
+            func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)
 
         # set input
         if ref_input:
             args = [nd.array(x, ctx=ctx) for x in ref_input]
         else:
             # create empty arrays on the remote device and copy them once.
-            # This can avoid some memory issues that make the measurment results unreliable.
+            # This can avoid some memory issues that make the measurement results unreliable.
             args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
             args = [nd.array(x, ctx=ctx) for x in args]
             ctx.sync()
diff --git a/python/tvm/module.py b/python/tvm/module.py
index ad2ac3d5f76c..d33a947dd7ed 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -145,7 +145,7 @@ def export_library(self,
             kwargs.update({'options': ["-I" + path for path in find_include_path()]})
         fcompile(file_name, files, **kwargs)
 
-    def time_evaluator(self, func_name, ctx, number, repeat=1):
+    def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
         """Get an evaluator that measures time cost of running function.
 
         Parameters
@@ -157,26 +157,38 @@ def time_evaluator(self, func_name, ctx, number, repeat=1):
             The context we should run this function on.
 
         number: int
-            The number of steps used in measuring each time interval
+            The number of times to run this function for taking average.
+            We call these runs as one `repeat` of measurement.
 
         repeat: int, optional
-            Number of times to run the timer measurement
-            If repeat equals 3, then we will get 3 numbers in the ProfileResult.
+            The number of times to repeat the measurement.
+            In total, the function will be invoked (1 + number x repeat) times,
+            where the first one is warm up and will be discarded.
+            The returned result contains `repeat` costs,
+            each of which is an average of `number` costs.
+
+        min_repeat_ms: int, optional
+            The minimum duration of one `repeat` in milliseconds.
+            By default, one `repeat` contains `number` runs. If this parameter is set,
+            the parameters `number` will be dynamically adjusted to meet the
+            minimum duration requirement of one `repeat`.
+            i.e., When the run time of one `repeat` falls below this time, the `number` parameter
+            will be automatically increased.
 
         Note
         ----
-        The function will be invoked  repeat * number + 1 times,
+        The function will be invoked  (1 + number x repeat) times,
         with the first call discarded in case there is lazy initialization.
 
         Returns
         -------
         ftimer : Function
-            The function that takes same argument as func
-            and return a float representing seconds per function call.
+            The function that takes same argument as func and returns a ProfileResult.
+            The ProfileResult reports `repeat` time costs in seconds.
         """
         try:
             feval = _RPCTimeEvaluator(
-                self, func_name, ctx.device_type, ctx.device_id, number, repeat)
+                self, func_name, ctx.device_type, ctx.device_id, number, repeat, min_repeat_ms)
 
             def evaluator(*args):
                 """Internal wrapped evaluator."""
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 80a8cc93ce19..d5528617477f 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -124,10 +124,11 @@ class RPCModuleNode final : public ModuleNode {
   PackedFunc GetTimeEvaluator(const std::string& name,
                               TVMContext ctx,
                               int number,
-                              int repeat) {
+                              int repeat,
+                              int min_repeat_ms) {
     RPCFuncHandle handle = GetFuncHandle(name);
     if (handle == nullptr) return PackedFunc();
-    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat);
+    handle = sess_->GetTimeEvaluator(handle, ctx, number, repeat, min_repeat_ms);
     return WrapRemote(handle);
   }
 
@@ -203,10 +204,10 @@ TVM_REGISTER_GLOBAL("module._RPCTimeEvaluator")
     ctx.device_id = args[3];
     if (tkey == "rpc") {
       *rv = static_cast<RPCModuleNode*>(m.operator->())
-          ->GetTimeEvaluator(args[1], ctx, args[4], args[5]);
+          ->GetTimeEvaluator(args[1], ctx, args[4], args[5], args[6]);
     } else {
       *rv = WrapTimeEvaluator(
-          m.GetFunction(args[1], false), ctx, args[4], args[5]);
+          m.GetFunction(args[1], false), ctx, args[4], args[5], args[6]);
     }
   });
 
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 208944a69dce..920f86128d11 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -13,6 +13,8 @@
 #include <chrono>
 #include <vector>
 #include <utility>
+#include <cmath>
+#include <algorithm>
 #include "rpc_session.h"
 #include "../../common/ring_buffer.h"
 
@@ -1002,9 +1004,9 @@ void RPCSession::CopyFromRemote(void* from,
 }
 
 RPCFuncHandle RPCSession::GetTimeEvaluator(
-    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat) {
+    RPCFuncHandle fhandle, TVMContext ctx, int number, int repeat, int min_repeat_ms) {
   return this->CallRemote(
-      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat);
+      RPCCode::kGetTimeEvaluator, fhandle, ctx, number, repeat, min_repeat_ms);
 }
 
 // Event handler functions
@@ -1138,7 +1140,7 @@ void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
 
 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
   PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
-  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
+  void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3], args[4]));
   delete pf;
   *rv = fhandle;
 }
@@ -1190,21 +1192,41 @@ void RPCSession::EventHandler::HandlePackedCall() {
   CHECK_EQ(state_, kRecvCode);
 }
 
-PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
-  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) {
+PackedFunc WrapTimeEvaluator(PackedFunc pf,
+                             TVMContext ctx,
+                             int number,
+                             int repeat,
+                             int min_repeat_ms) {
+  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
     // skip first time call, to activate lazy compilation components.
     pf.CallPacked(args, &temp);
     DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+
     for (int i = 0; i < repeat; ++i) {
-      // start timing
-      auto tbegin = std::chrono::high_resolution_clock::now();
-      for (int i = 0; i < number; ++i) {
-        pf.CallPacked(args, &temp);
-      }
-      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-      auto tend = std::chrono::high_resolution_clock::now();
+      std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tbegin, tend;
+      double duration_ms = 0.0;
+
+      do {
+        if (duration_ms > 0.0) {
+          number = static_cast<int>(
+              std::max((min_repeat_ms / (duration_ms / number) + 1),
+                       number * 1.618));   // 1.618 is chosen by random
+        }
+
+        tbegin = std::chrono::high_resolution_clock::now();
+        // start timing
+        for (int i = 0; i < number; ++i) {
+          pf.CallPacked(args, &temp);
+        }
+        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+        tend = std::chrono::high_resolution_clock::now();
+
+        duration_ms = std::chrono::duration_cast<std::chrono::duration<double> >
+            (tend - tbegin).count() * 1000;
+      } while (duration_ms < min_repeat_ms);
+
       double speed = std::chrono::duration_cast<std::chrono::duration<double> >(
           tend - tbegin).count() / number;
       os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 4b736de0e041..e2ee43992f7b 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -151,14 +151,26 @@ class RPCSession {
    *
    * \param fhandle The function handle.
    * \param ctx The ctx to run measurement on.
-   * \param number How many steps to run in each time evaluation
-   * \param repeat How many times to repeat the timer
+   * \param number The number of times to run this function for taking average.
+          We call these runs as one `repeat` of measurement.
+   * \param repeat The number of times to repeat the measurement.
+          In total, the function will be invoked (1 + number x repeat) times,
+          where the first one is warm up and will be discarded.
+          The returned result contains `repeat` costs,
+          each of which is an average of `number` costs.
+   * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+          By default, one `repeat` contains `number` runs. If this parameter is set,
+          the parameters `number` will be dynamically adjusted to meet the
+          minimum duration requirement of one `repeat`.
+          i.e., When the run time of one `repeat` falls below this time,
+          the `number` parameter will be automatically increased.
    * \return A remote timer function
    */
   RPCFuncHandle GetTimeEvaluator(RPCFuncHandle fhandle,
                                  TVMContext ctx,
                                  int number,
-                                 int repeat);
+                                 int repeat,
+                                 int min_repeat_ms);
   /*!
    * \brief Call a remote defined system function with arguments.
    * \param fcode The function code.
@@ -221,13 +233,29 @@ class RPCSession {
 };
 
 /*!
- * \brief Wrap a timer function for a given packed function.
+ * \brief Wrap a timer function to measure the time cost of a given packed function.
  * \param f The function argument.
  * \param ctx The context.
- * \param number Number of steps in the inner iteration
- * \param repeat How many steps to repeat the time evaluation.
+ * \param number The number of times to run this function for taking average.
+          We call these runs as one `repeat` of measurement.
+ * \param repeat The number of times to repeat the measurement.
+          In total, the function will be invoked (1 + number x repeat) times,
+          where the first one is warm up and will be discarded.
+          The returned result contains `repeat` costs,
+          each of which is an average of `number` costs.
+ * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+          By default, one `repeat` contains `number` runs. If this parameter is set,
+          the parameters `number` will be dynamically adjusted to meet the
+          minimum duration requirement of one `repeat`.
+          i.e., When the run time of one `repeat` falls below this time,
+          the `number` parameter will be automatically increased.
+ * \return f_timer A timer function.
  */
-PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat);
+PackedFunc WrapTimeEvaluator(PackedFunc f,
+                             TVMContext ctx,
+                             int number,
+                             int repeat,
+                             int min_repeat_ms);
 
 /*!
  * \brief Create a Global RPC module that refers to the session.
diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py
index e29cc2c51658..e540a11f241b 100644
--- a/tests/python/unittest/test_autotvm_measure.py
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -69,29 +69,9 @@ def _callback_wrong(tuner, measure_inputs, measure_results):
                callbacks=[_callback_wrong])
 
 
-def test_min_repeat_ms():
-    task, target = get_sample_task()
-
-    measure_option = autotvm.measure_option(
-        builder=autotvm.LocalBuilder(),
-        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
-    )
-
-    def _callback(tuner, measure_inputs, measure_results):
-        for inp, res in zip(measure_inputs, measure_results):
-            if res.error_no != 0:
-                continue
-
-            assert 1000 * np.mean(res.costs) * \
-                   measure_option['runner'].cur_number >= 100
-
-    tuner = autotvm.tuner.RandomTuner(task)
-    tuner.tune(n_trial=5, measure_option=measure_option,
-               callbacks=[_callback])
-
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
 
     test_task_tuner_without_measurement()
     test_check_correctness()
-    test_min_repeat_ms()
+
diff --git a/tests/python/unittest/test_runtime_measure.py b/tests/python/unittest/test_runtime_measure.py
new file mode 100644
index 000000000000..4513a268ac20
--- /dev/null
+++ b/tests/python/unittest/test_runtime_measure.py
@@ -0,0 +1,48 @@
+import time
+import ctypes
+
+import tvm
+from tvm.contrib.util import tempdir
+
+
+def test_min_repeat_ms():
+    tmp = tempdir()
+    filename = tmp.relpath("log")
+
+    @tvm.register_func
+    def my_debug(filename):
+        """one call lasts for 100 ms and writes one character to a file"""
+        time.sleep(0.1)
+        filename = ctypes.c_char_p(filename.value).value
+        with open(filename, "a") as fout:
+            fout.write("c")
+
+    X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename))
+    s = tvm.create_schedule(X.op)
+    func = tvm.build(s, [X])
+
+    x = tvm.nd.empty((), dtype="int32")
+    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
+                                 number=1, repeat=1)
+    ftimer(x)
+
+    with open(filename, "r") as fin:
+        ct = len(fin.readline())
+    
+    assert ct == 2
+
+
+    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
+                                 number=1, repeat=1, min_repeat_ms=1000)
+    ftimer(x)
+
+    # make sure we get more than 10 calls
+    with open(filename, "r") as fin:
+        ct = len(fin.readline())
+
+    assert ct > 10 + 2
+        
+
+if __name__ == "__main__":
+    test_min_repeat_ms()
+