From 4a30921a0f65a1cdcc6474d13a707218d837bae8 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Thu, 23 May 2019 18:54:02 -0700
Subject: [PATCH] autoTVM task extraction for VTA (nnvm for now)

---
 nnvm/python/nnvm/top/nn.py                  |  10 +-
 python/tvm/autotvm/task/nnvm_integration.py |  71 +++---
 python/tvm/autotvm/task/topi_integration.py |  69 ++++--
 vta/scripts/tune_resnet.py                  | 231 ++++++++++++++++++++
 4 files changed, 321 insertions(+), 60 deletions(-)
 create mode 100644 vta/scripts/tune_resnet.py

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 13964f4e25f69..128f985bd6d2f 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -114,25 +114,25 @@ def compute_conv2d(attrs, inputs, _):
     if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
         # pylint: disable=assignment-from-no-return
         out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
-                             dilation, layout, out_dtype=out_dtype)
+                             dilation, layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     elif groups == 1:
         out = topi.nn.conv2d(
-            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)
     elif layout == "NCHW" and \
         groups == get_const_int(inputs[0].shape[1]) and \
         groups == channels:
         out = topi.nn.depthwise_conv2d_nchw(
-            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype)
     elif layout in ["NCHW", "NCHW4c"]:
         out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
-                                        out_dtype=out_dtype)
+                                        out_dtype)
     elif layout == "NHWC" and \
         kernel_layout == "HWOI" and \
         groups == get_const_int(inputs[0].shape[3]) and \
         groups == channels:
         out = topi.nn.depthwise_conv2d_nhwc(
-            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support arbitrary group number for now")
 
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index dbcee0e516e11..e4d2b3fb80235 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -27,15 +27,16 @@
 from .task import create
 from .topi_integration import TaskExtractEnv
+from .dispatcher import ApplyHistoryBest
 
 logger = logging.getLogger('autotvm')
 
 
-def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
+def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host=None):
     """ Extract tuning tasks from a nnvm graph.
 
     This function collects tuning tasks by building the graph
-    with a "tracing" target and tracing all the calls to topi.
+    and tracing all the calls to topi.
 
     Parameters
     ----------
@@ -49,6 +50,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
         The compilation target
     symbols : Array of nnvm.symbol
         Array of nnvm symbols want to be tuned
+    params : dict of str to NDArray
+        The parameter dictionary.
     target_host: tvm.target.Target
         The host compilation target

@@ -78,32 +81,35 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
             topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
-
-    # run compiler to collect all TOPI calls during compilation
     env.reset(topi_funcs)
 
-    # disable logger temporarily
-    old_state = logger.disabled
-    logger.disabled = True
+    with env:
+        # disable logger temporarily
+        old_state = logger.disabled
+        logger.disabled = True
 
-    # use a "tracing" target to do a fake compile for collecting topi calls
-    tracing_target = _target.create("llvm -device=tracing")
-    nnvm.compiler.engine.clear_cache()
-    nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
+        # run compiler to collect all TOPI calls during compilation
+        nnvm.compiler.engine.clear_cache()
+        nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype,
+                            target_host=target_host, params=params)
 
-    logger.disabled = old_state
+        logger.disabled = old_state
 
     # create tasks for target
     tasks = []
     for task_name, args in env.get_tasks():
-        tasks.append(create(task_name, args,
-                            target=target, target_host=target_host,
-                            template_key='direct'))
+        try:
+            tsk = create(task_name, args,
+                         target=target, target_host=target_host,
+                         template_key='direct')
+            tasks.append(tsk)
+        except topi.InvalidShapeError:
+            warnings.warn("Invalid shape during AutoTVM task creation, task skipped")
 
     return tasks
 
 
-def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_host=None):
+def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, target_host=None):
     """ Extract tuning tasks from multiple nnvm graphs.
 
     This function is the multiple graph version of extract_from_graph
 
     Parameters
     ----------
@@ -120,6 +126,8 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_
         The compilation target
     symbols : Array of nnvm.symbol
         Array of nnvm symbols want to be tuned
+    params : dict of str to NDArray
+        The parameter dictionary.
     target_host: tvm.target.Target
         The host compilation target

@@ -149,28 +157,29 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_
             topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
-
-    # run compiler to collect all TOPI calls during compilation
     env.reset(topi_funcs)
 
-    # disable logger temporarily
-    old_state = logger.disabled
-    logger.disabled = True
+    with env:
+        # disable logger temporarily
+        old_state = logger.disabled
+        logger.disabled = True
 
-    # use a "tracing" target to do a fake compile for collecting topi calls
-    tracing_target = _target.create("llvm -device=tracing")
+        nnvm.compiler.engine.clear_cache()
+        for graph, shape, dtype in zip(graphs, shapes, dtypes):
+            nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype)
 
-    nnvm.compiler.engine.clear_cache()
-    for graph, shape, dtype in zip(graphs, shapes, dtypes):
-        nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
-
-    logger.disabled = old_state
+        logger.disabled = old_state
 
     # create tasks for target
     tasks = []
     for task_name, args in env.get_tasks():
-        tasks.append(create(task_name, args,
-                            target=target, target_host=target_host,
-                            template_key='direct'))
+        try:
+            tsk = create(task_name, args,
+                         target=target, target_host=target_host,
+                         template_key='direct')
+            tasks.append(tsk)
+        except topi.InvalidShapeError:
+            warnings.warn("Invalid shape during AutoTVM task creation, task skipped")
 
     return tasks
+
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 3c983768ab3ea..58e78cf53c021 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -27,6 +27,9 @@
 See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
 """
+import warnings
+import sys
+
 from ... import _api_internal, tensor, placeholder, create_schedule
 from .task import args_to_workload, dispatcher, register
 
@@ -73,6 +76,7 @@ def deserialize_args(args):
 class TaskExtractEnv:
     """Global environment for extracting tuning tasks from nnvm graph"""
     current = None
+    registered = None
 
     def __init__(self):
         import topi
@@ -106,47 +110,64 @@ def __init__(self):
             topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw],
         }
 
-        self._register_tracing()
+        # support reflection for tracing
+        self.func_to_reflection = {
+            topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x),
+            topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x),
+            topi.nn.depthwise_conv2d_nchw: lambda x: setattr(topi.nn, 'depthwise_conv2d_nchw', x),
+            topi.nn.group_conv2d_nchw: lambda x: setattr(topi.nn, 'group_conv2d_nchw', x),
+            topi.nn.conv2d_transpose_nchw: lambda x: setattr(topi.nn, 'conv2d_transpose_nchw', x),
+            topi.nn.dense: lambda x: setattr(topi.nn, 'dense', x),
+            topi.nn.bitserial_conv2d_nchw: lambda x: setattr(topi.nn, 'bitserial_conv2d_nchw', x),
+            topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x),
+            topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x),
+            topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x),
+        }
+
         self._register_topi_task()
         self.task_collection = []
         self.wanted_topi_funcs = list(self.topi_to_task.keys())
+        self.modified_funcs = []
+
+    def __enter__(self):
+        self.task_collection = []
+        self.modified_funcs = []
 
-    def _register_tracing(self):
-        """Register tracing function to track the topi function call"""
-        # register topi compute for "tracing" target
-        for topi_compute in self.topi_to_task:
+        for topi_compute in self.wanted_topi_funcs:
             def _local_scope(compute_func):
                 """start a scope to hold the local function in for loop"""
 
-                @compute_func.register("tracing", )
-                def _tracing_topi_compute(*args, **kwargs):
-                    assert not kwargs, "Do not support extracting tuning tasks when" \
-                        "kwargs is used in TOPI function call." \
+                def _tracing_wrapper(*args, **kwargs):
+                    assert not kwargs, "Do not support extracting tuning tasks when " \
+                        "kwargs is used in TOPI function call. " \
                         "Please modify it to use only positional args."
+                    key = (self.topi_to_task[compute_func], serialize_args(args))
+                    if key not in self.task_collection:
+                        self.task_collection.append(key)
+
+                    return compute_func(*args, **kwargs)
+
+                self.func_to_reflection[topi_compute](_tracing_wrapper)
+                self.modified_funcs.append(topi_compute)
-                    if compute_func in self.wanted_topi_funcs:  # record this call
-                        key = (self.topi_to_task[compute_func], serialize_args(args))
-                        if key not in self.task_collection:
-                            self.task_collection.append(key)
-                    return compute_func.fdefault(*args)
             _local_scope(topi_compute)
 
-        # register topi schedule for "tracing" target
-        for topi_compute in self.topi_to_task:
-            for topi_schedule in self.topi_to_schedule[topi_compute]:
-                def _local_scope_(schedule_func):
-                    """start a scope to hold the local function in for loop"""
+        return self
 
-                    @schedule_func.register("tracing", )
-                    def _tracing_topi_compute(outs):
-                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
-                        return create_schedule([x.op for x in outs])
-                _local_scope_(topi_schedule)
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # revert modification
+        for func in self.modified_funcs:
+            self.func_to_reflection[func](func)
 
     def _register_topi_task(self):
         """register tuning wrapper for topi function"""
         import topi
 
+        # Avoid double registration for certain targets
+        if TaskExtractEnv.registered:
+            return
+        TaskExtractEnv.registered = True
+
         # Tuning wrapper for topi functions
         @register("topi_nn_conv2d")
         def _topi_nn_conv2d(*args, **kwargs):
diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py
new file mode 100644
index 0000000000000..b22a63e09df83
--- /dev/null
+++ b/vta/scripts/tune_resnet.py
@@ -0,0 +1,231 @@
+import argparse
+import os
+import time
+import numpy as np
+
+import tvm
+from tvm import rpc, autotvm
+from tvm.autotvm.measure.measure_methods import request_remote
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib import graph_runtime, util
+from tvm.contrib.download import download
+
+import topi
+import nnvm.compiler
+import vta
+import vta.testing
+
+env = vta.get_env()
+
+def register_vta_tuning_tasks():
+    from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args
+
+    @tvm.tag_scope(tag=topi.tag.ELEMWISE)
+    def my_clip(x, a_min, a_max):
+        """Unlike topi's current clip, put min and max into two stages."""
+        const_min = tvm.const(a_min, x.dtype)
+        const_max = tvm.const(a_max, x.dtype)
+        x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
+        x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
+        return x
+
+    # init autotvm env to register VTA operator
+    TaskExtractEnv()
+
+    @autotvm.task.register("topi_nn_conv2d", override=True)
+    def _topi_nn_conv2d(*args, **kwargs):
+        assert not kwargs, "Do not support kwargs in template function call"
+        args = deserialize_args(args)
+        A, W = args[:2]
+
+        with tvm.target.vta():
+            res = topi.nn.conv2d(*args, **kwargs)
+            res = topi.right_shift(res, 8)
+            res = my_clip(res, 0, 127)
+            res = topi.cast(res, "int8")
+
+        if tvm.target.current_target().device_name == 'vta':
+            s = topi.generic.schedule_conv2d_nchw([res])
+        else:
+            s = tvm.create_schedule([res.op])
+        return s, [A, W, res]
+
+
+
+def generate_graph(sym, params, target, target_host):
+    # Populate the shape and data type dictionary
+    shape_dict = {"data": (1, 3, 224, 224)}
+    dtype_dict = {"data": 'float32'}
+    shape_dict.update({k: v.shape for k, v in params.items()})
+    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
+
+    # Apply NNVM graph optimization passes
+    sym = vta.graph.clean_cast(sym)
+    sym = vta.graph.clean_conv_fuse(sym)
+    assert env.BLOCK_IN == env.BLOCK_OUT
+    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)
+
+    # Compile NNVM graph
+    with nnvm.compiler.build_config(opt_level=3):
+        with vta.build_config():
+            graph, lib, params = nnvm.compiler.build(
+                sym, target, shape_dict, dtype_dict,
+                params=params, target_host=target_host)
+
+    return graph, lib, params
+
+
+def extract_tasks(sym, params, target, target_host):
+    # Populate the shape and data type dictionary
+    shape_dict = {"data": (1, 3, 224, 224)}
+    dtype_dict = {"data": 'float32'}
+    shape_dict.update({k: v.shape for k, v in params.items()})
+    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
+
+    # Apply NNVM graph optimization passes
+    sym = vta.graph.clean_cast(sym)
+    sym = vta.graph.clean_conv_fuse(sym)
+    assert env.BLOCK_IN == env.BLOCK_OUT
+    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)
+
+    with vta.build_config():
+        tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target,
+                                                params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host)
+    return tasks
+
+
+def download_model():
+    url = "https://github.com/uwsaml/web-data/raw/master/vta/models/"
+    categ_fn = 'synset.txt'
+    graph_fn = 'resnet18_qt8.json'
+    params_fn = 'resnet18_qt8.params'
+    data_dir = '_data'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+
+    for file in [categ_fn, graph_fn, params_fn]:
+        if not os.path.isfile(os.path.join(data_dir, file)):
+            download(os.path.join(url, file), os.path.join(data_dir, file))
+
+    sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read())
+    params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read())
+
+    return sym, params
+
+
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        n_trial_ = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial_,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial_, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+if __name__ == '__main__':
+
+    # Get tracker info from env
+    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
+    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
+    if not tracker_host or not tracker_port:
+        print("Set your AutoTVM tracker node host and port variables to run the autotuner")
+        exit()
+    tracker_port = int(tracker_port)
+
+    tuning_opt = {
+        'log_filename': 'resnet-18.log',
+
+        'tuner': 'random',
+        'n_trial': 1e9,
+        'early_stopping': None,
+
+        'measure_option': autotvm.measure_option(
+                builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func),
+                runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port,
+                                         number=4, repeat=3, timeout=60,
+                                         check_correctness=True))
+    }
+
+    # download model
+    sym, params = download_model()
+
+    # register VTA tuning tasks
+    register_vta_tuning_tasks()
+
+    # extract tasks
+    print("Extract tasks...")
+    target = tvm.target.vta()
+    target_host = env.target_host
+    tasks = extract_tasks(sym, params, target, target_host)
+
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]):
+        print("Compile...")
+        graph, lib, params = generate_graph(sym, params, target, target_host)
+        input_shape = (1, 3, 224, 224)
+        dtype = 'float32'
+
+        # export library
+        tmp = util.tempdir()
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+        # upload module to device
+        print("Upload...")
+        remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
+
+        # upload parameters to device
+        ctx = remote.context(str(target), 0)
+        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module = graph_runtime.create(graph, rlib, ctx)
+        module.set_input('data', data_tvm)
+        module.set_input(**rparams)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
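
Usage sketch: a minimal, illustrative call of the updated extract_from_graph entry point, which now takes the parameter dictionary and compiles with the real target instead of the removed "llvm -device=tracing" target. The nnvm.testing ResNet-18 workload and the plain llvm target below are assumptions for demonstration only and are not part of this patch; a VTA graph must additionally be packed first, as extract_tasks() in vta/scripts/tune_resnet.py does.

import nnvm
import nnvm.testing
import tvm
from tvm import autotvm

# Illustrative float32 workload; any NNVM graph plus its parameter dict works here.
net, params = nnvm.testing.resnet.get_workload(batch_size=1, image_shape=(3, 224, 224))

# params is the newly added argument; only conv2d symbols are requested for tuning.
tasks = autotvm.task.extract_from_graph(
    net, shape={"data": (1, 3, 224, 224)}, dtype="float32",
    target=tvm.target.create("llvm"), symbols=(nnvm.sym.conv2d,),
    params=params)
print("extracted %d tunable tasks" % len(tasks))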