From 72ec86ca5ceba5f4b595b16a3b1b3b6d8b78063c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 2 May 2019 18:53:32 -0700 Subject: [PATCH 001/126] autotvm support for conv2d operator --- python/tvm/autotvm/tophub.py | 2 +- python/tvm/target.py | 9 +- vta/python/vta/environment.py | 7 + vta/python/vta/testing/util.py | 2 +- vta/python/vta/top/__init__.py | 2 - vta/python/vta/top/arm_conv2d.py | 37 -- vta/python/vta/top/vta_conv2d.py | 403 +++------------- .../integration/test_benchmark_topi_conv2d.py | 429 ++++++++---------- 8 files changed, 277 insertions(+), 614 deletions(-) delete mode 100644 vta/python/vta/top/arm_conv2d.py diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 850f501cb1fc..37a95d6f774d 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -44,7 +44,7 @@ 'opencl': "v0.02", 'mali': "v0.05", - 'vta': "v0.04", + 'vta': "v0.05", } logger = logging.getLogger('autotvm') diff --git a/python/tvm/target.py b/python/tvm/target.py index 828fff8e228c..4548ffac4c88 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -344,7 +344,7 @@ def register(key, func=None, override=False): The function to be registered. override : bool - Whether override existing registeration. + Whether override existing registration. Returns ------- @@ -489,6 +489,13 @@ def rasp(options=None): return arm_cpu('rasp3b', options) +def vta(model='unknown', options=None): + opts = ["-device=vta", '-keys=cpu', '-model=%s' % model] + opts = _merge_opts(opts, options) + ret = _api_internal._TargetCreate("ext_dev", *opts) + return ret + + def create(target_str): """Get a target given target string. diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 4c2200d04727..093b0ec5c386 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -234,6 +234,10 @@ def gemm(self): """GEMM intrinsic""" return self.dev.gemm + @property + def target(self): + return tvm.target.vta(model=self.TARGET) + @property def target_host(self): """The target host""" @@ -243,6 +247,9 @@ def target_host(self): return "llvm" raise ValueError("Unknown target %s" % self.TARGET) + @property + def target_vta_cpu(self): + return tvm.target.arm_cpu(model=self.TARGET) def get_env(): """Get the current VTA Environment. diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index 06c700cd7119..f99541691082 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -42,7 +42,7 @@ def run(run_func): # the port it's listening to, e.g. 9090 local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: - remote = rpc.connect("localhost", local_rpc) + remote = rpc.connect("127.0.0.1", local_rpc) run_func(env, remote) else: # Make sure simulation library exists diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 614ed2347181..7346c35506a2 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,5 +1,3 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" -from .vta_conv2d import packed_conv2d, schedule_packed_conv2d from . import vta_conv2d -from . import arm_conv2d diff --git a/vta/python/vta/top/arm_conv2d.py b/vta/python/vta/top/arm_conv2d.py deleted file mode 100644 index 6e34917c0b71..000000000000 --- a/vta/python/vta/top/arm_conv2d.py +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Reuse conv2d schedule from ARM CPU""" - -import tvm - -from topi.nn import conv2d, conv2d_alter_layout -from topi import generic - -@conv2d.register(["vtacpu", "vta"]) -def compute(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return conv2d(*args, **kwargs) - -@generic.schedule_conv2d_nchw.register(["vtacpu", "vta"]) -def schedule(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return generic.schedule_conv2d_nchw(*args, **kwargs) - -@conv2d_alter_layout.register(["vtacpu", "vta"]) -def alter(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return conv2d_alter_layout(*args, **kwargs) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index ef4f2017381a..681418d6ecb1 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -15,181 +15,49 @@ # specific language governing permissions and limitations # under the License. """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" -from __future__ import absolute_import as _abs -from collections import namedtuple - -import logging import tvm +from tvm import autotvm import topi -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn -from ..environment import get_env - - -Workload = namedtuple("Conv2DWorkload", - ['batch', 'height', 'width', 'in_filter', 'out_filter', - 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) - -def find_schedules(layer, vt_only=False, best_only=False): - """ Returns a schedule for a given a layer. - - Parameters - ---------- - layer : Workload - Convolutional layer description. - vt_only : Boolean - Produce a schedule plan with virtual threading. - best_only : Boolean - Return the "best" schedule plan. - - Returns - ------- - fil_sched : list - List of valid schedules. 
- - """ - # pylint: disable=too-many-nested-blocks - env = get_env() - - # Helper function to get factors - def _find_factors(n): - factors = [] - for f in range(1, n + 1): - if n % f == 0: - factors.append(f) - return factors - - def _get_data_movement_byte(schedule, layer): - """ Estimate data movement in bytes for the schedule plan - """ - env = get_env() - b_f = schedule.b_factor - h_f = schedule.h_factor - w_f = schedule.w_factor - ci_f = schedule.ic_factor - co_f = schedule.oc_factor - # Derive data movement - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - input_tile_elems = b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f - output_tile_elems = b_f * h_f * w_f * co_f - # Derive tiling factors - b_factor = layer.batch // (b_f * env.BATCH) - h_factor = (layer.height // layer.hstride) // h_f - w_factor = (layer.width // layer.wstride) // w_f - ci_factor = layer.in_filter // (ci_f * env.BLOCK_IN) - co_factor = layer.out_filter // (co_f * env.BLOCK_OUT) - # Compute input transaction count - input_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - weight_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - output_xfers = b_factor * h_factor * w_factor * co_factor - # Compute total transfer sizes - input_xfer_byte = input_tile_elems * input_xfers * inp_elem_sizeb // 8 - weight_xfer_byte = weight_tile_elems * weight_xfers * wgt_elem_sizeb // 8 - output_xfer_byte = output_tile_elems * output_xfers * out_elem_sizeb // 8 - total_xfer_byte = input_xfer_byte + weight_xfer_byte + output_xfer_byte - return total_xfer_byte - - # Scheduling exploration - batch_factors = _find_factors(layer.batch // env.BATCH) - height_factors = _find_factors(layer.height // layer.hstride) - width_factors = _find_factors(layer.width // layer.wstride) - cin_factors = _find_factors(layer.in_filter // env.BLOCK_IN) - cout_factors = _find_factors(layer.out_filter // env.BLOCK_OUT) - ht_factors = [1, 2] - cot_factors = [1, 2] - - # Explore schedules - schedules = [] - for b_f in batch_factors: - for h_f in height_factors: - for w_f in width_factors: - for ci_f in cin_factors: - for co_f in cout_factors: - # FIXME: 2D load pattern matching imposes restrictions on schedule - valid = (w_f == layer.width // layer.wstride) or \ - (w_f != layer.width // layer.wstride and co_f == 1) and \ - ci_f == 1 - if valid: - schedules.append([b_f, h_f, w_f, ci_f, co_f]) +import numpy as np - # Filter the schedules that wouldn't work in the available BRAM sizes - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - inp_brams_sizeb = env.INP_BUFF_SIZE * 8 - wgt_brams_sizeb = env.WGT_BUFF_SIZE * 8 - out_brams_sizeb = env.OUT_BUFF_SIZE * 8 - fil_sched = [] - xfer_size = [] - for sched in schedules: - b_f, h_f, w_f, ci_f, co_f = sched - for h_t in ht_factors: - for co_t in cot_factors: - # Make sure to filter cases where we apply threading on two axes - # or cases where the threading factors for h and co are not - # factors of h and co - if (h_t == 2 and co_t == 2) or (h_f % h_t != 0) or (co_f % co_t != 0): - continue - # Adjust tile sizes if threading is applied - h_f //= h_t - co_f //= co_t - # Derive tile sizes - input_tile_elems = 
b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f * co_f - output_tile_elems = b_f * h_f * w_f * co_f - - # Derive valid schedule filter - valid = True - # If in vitrual-threaded mode, only allow for threaded plans - valid &= (vt_only and (h_t == 2 or co_t == 2)) or not vt_only - # Check that we don't exceed input/weight/output capacity - valid &= input_tile_elems * inp_elem_sizeb <= inp_brams_sizeb // (co_t * h_t) - valid &= weight_tile_elems * wgt_elem_sizeb <= wgt_brams_sizeb - valid &= output_tile_elems * out_elem_sizeb <= out_brams_sizeb // (co_t * h_t) - # Make sure that we don't write to the same acc location within 2 consecutive cycles - valid &= h_f > 2 and w_f > 2 - # TODO: check that we don't exceed instruction or micro-op count - - if valid: - schedule = Schedule(b_factor=b_f, oc_factor=co_f, ic_factor=ci_f, h_factor=h_f, - w_factor=w_f, oc_nthread=co_t, h_nthread=h_t) - fil_sched.append(schedule) - xfer_size.append(_get_data_movement_byte(schedule, layer)) +from ..environment import get_env - if best_only and xfer_size: - return [fil_sched[xfer_size.index(min(xfer_size))]] - return fil_sched +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False -def packed_conv2d(data, +@autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') +def packed_conv2d(cfg, + data, kernel, - padding, strides, - out_dtype="int32"): - """ Packed conv2d function. - """ + padding, + dilation, + layout, + out_dtype): + """ Packed conv2d function.""" + if not is_packed_layout(layout): + raise topi.InvalidShapeError() + assert dilation == (1, 1) + if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: pad_data = data assert len(data.shape) == 6 assert len(kernel.shape) == 6 - oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) + oheight = topi.util.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) + owidth = topi.util.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) ishape = topi.util.get_const_tuple(data.shape) kshape = topi.util.get_const_tuple(kernel.shape) - assert data.dtype == "int8", data.dtype - assert kernel.dtype == "int8", kernel.dtype d_i = tvm.reduce_axis((0, kshape[2]), name='d_i') d_j = tvm.reduce_axis((0, kshape[3]), name='d_j') k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') @@ -202,154 +70,28 @@ def packed_conv2d(data, kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), name="res", tag="packed_conv2d") - return res - -@tvm.register_func("nnvm.compiler.build_target", override=True) -def _build(funcs, target, target_host): - tvm_t = tvm.target.create(target) - if tvm_t.device_name == "vta": - return tvm.build(funcs, target="ext_dev", target_host=target_host) - if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": - return tvm.build(funcs, target=target_host) - return tvm.build(funcs, target=target) - - -@tvm.register_func("nnvm.compiler.lower", override=True) -def _lower(sch, inputs, func_name, graph): - import traceback - # pylint: disable=broad-except - try: - f = tvm.lower(sch, inputs, name=func_name) 
- if "quantized_conv2d" in func_name: - logging.info(graph.ir(join_entry_attrs=["shape"])) - except Exception: - msg = traceback.format_exc() - msg += "Error during compile graph\n" - msg += "--------------------------\n" - msg += graph.ir(join_entry_attrs=["shape"]) - raise RuntimeError(msg) - return f if isinstance( - f, (tvm.container.Array, tuple, list)) else [f] - - -@reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, _): - """ Clip operator. - """ - x = inputs[0] - a_min = attrs.get_float("a_min") - a_max = attrs.get_float("a_max") - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - with tvm.tag_scope(topi.tag.ELEMWISE): - x = tvm.compute( - x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute( - x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -# override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - -@reg.register_alter_op_layout("conv2d", level=15) -def alter_conv2d_layout(attrs, inputs, out): - layout = attrs['layout'] - if is_packed_layout(layout): - return None - return _nn.alter_conv2d_layout(attrs, inputs, out) - - -@reg.register_compute("conv2d", level=15) -def compute_conv2d(attrs, inputs, out): - """ 2D convolution algorithm. - """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] - assert dilation == (1, 1), "not support dilate now" - if is_packed_layout(layout): - assert groups == 1 - return packed_conv2d(inputs[0], inputs[1], - padding, strides, out_dtype=out_dtype) - return _nn.compute_conv2d(attrs, inputs, out) - - -@reg.register_schedule("conv2d", level=15) -def schedule_conv2d(attrs, outs, target): - """ 2D convolution schedule. - """ - layout = attrs["layout"] - - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": - return schedule_packed_conv2d(outs) - if str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("not support target %s" % target) - return _nn.schedule_conv2d(attrs, outs, target) - - -def _get_workload(data, pad_data, kernel, output): - """ Get the workload structure. 
- """ - o_shape = topi.util.get_const_tuple(output.shape) - d_shape = topi.util.get_const_tuple(data.shape) - k_shape = topi.util.get_const_tuple(kernel.shape) - o_b, o_c, o_h, o_w, ob_blk, o_blk = o_shape - i_b, i_c, i_h, i_w, ib_blk, i_blk = d_shape - k_o, k_i, k_h, k_w, ko_blk, ki_blk = k_shape - # For now we need to assume that input channel blocking is the same - # as the output channel blocking - assert o_blk == i_blk - assert ob_blk == ib_blk - # Make sure that dimensions match - assert o_b == i_b - assert o_blk == ko_blk - assert i_blk == ki_blk - assert k_o == o_c - assert k_i == i_c - # Scale the channel size - i_c *= i_blk - o_c *= o_blk - if pad_data is not None: - p_shape = topi.util.get_const_tuple(pad_data.shape) - h_pad = (p_shape[2] - d_shape[2]) // 2 - w_pad = (p_shape[3] - d_shape[3]) // 2 - else: - h_pad, w_pad = 0, 0 - h_str = (i_h + h_pad*2 - k_h) // (o_h - 1) - w_str = (i_w + w_pad*2 - k_w) // (o_w - 1) - return Workload(i_b, i_h, i_w, i_c, o_c, k_h, k_w, h_pad, w_pad, h_str, w_str) - -_WL2PLAN = {} + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * + kshape[2] * kshape[3] * ishape[1] * ishape[-1]) + return res -def schedule_packed_conv2d(outs): - """ Schedule the packed conv2d. - """ +@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') +def schedule_packed_conv2d(cfg, outs): assert len(outs) == 1 output = outs[0] + const_ops = [] ewise_inputs = [] ewise_ops = [] conv2d_res = [] - assert output.dtype == "int8" - assert output.op.input_tensors[0].dtype == "int32" + assert "int" in output.op.input_tensors[0].dtype def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - ewise_ops.append(op) + if len(op.axis) == 0: + const_ops.append(op) + else: + ewise_ops.append(op) for tensor in op.input_tensors: if isinstance(tensor.op, tvm.tensor.PlaceholderOp): ewise_inputs.append((op, tensor)) @@ -362,6 +104,19 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) + s = tvm.create_schedule(output.op) + + ##### space definition begin ##### + b, co, h, w, bi, ci = s[conv2d_stage].op.axis + ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis + cfg.define_split('tile_b', b, num_outputs=2) + cfg.define_split('tile_h', h, num_outputs=2) + cfg.define_split('tile_w', w, num_outputs=2) + cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_knob('oc_nthread', [1, 2]) + cfg.define_knob('h_nthread', [1, 2]) + ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: @@ -370,21 +125,8 @@ def _traverse(op): data = temp else: pad_data = None - wrkld = _get_workload(data, pad_data, kernel, output) - if wrkld in _WL2PLAN: - plan = _WL2PLAN[wrkld] - else: - plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] - logging.info("Trying to find plan for %s", wrkld) - env = get_env() - - load_inp = load_wgt = load_out = store_out = env.dma_copy - alu = env.alu - gemm = env.gemm - # schedule1 - oshape = topi.util.get_const_tuple(output.shape) - s = tvm.create_schedule(output.op) + env = get_env() # setup pad if pad_data is not None: @@ -394,27 +136,26 @@ def _traverse(op): cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) s[conv2d_stage].set_scope(env.acc_scope) + # cache read input cache_read_ewise = [] - for consumer, tensor in ewise_inputs: 
cache_read_ewise.append( s.cache_read(tensor, env.acc_scope, [consumer])) + # set ewise scope for op in ewise_ops: s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], alu) + s[op].pragma(s[op].op.axis[0], env.alu) - # tile - oc_factor = (plan.oc_factor if plan.oc_factor - else plan.out_filter // env.BLOCK_OUT) - h_factor = (plan.h_factor if plan.h_factor else oshape[2]) - w_factor = (plan.w_factor if plan.w_factor else oshape[3]) + for op in const_ops: + s[op].compute_inline() + # tile x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = s[output].split(x_co, factor=oc_factor) - x_i0, x_i1 = s[output].split(x_i, factor=h_factor) - x_j0, x_j1 = s[output].split(x_j, factor=w_factor) + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 @@ -425,17 +166,17 @@ def _traverse(op): for tensor in cache_read_ewise: s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], load_out) + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) # virtual threading along output channel axes - if plan.oc_nthread > 1: - _, v_t = s[output].split(x_co0, factor=plan.oc_nthread) + if cfg['oc_nthread'].val > 1: + _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) # virtual threading along spatial rows - if plan.h_nthread > 1: - _, v_t = s[output].split(x_i0, factor=plan.h_nthread) + if cfg['h_nthread'].val > 1: + _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) @@ -443,17 +184,17 @@ def _traverse(op): k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - if plan.ic_factor: - k_o, _ = s[conv2d_stage].split(k_o, factor=plan.ic_factor) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) + k_o, _ = cfg['tile_ci'].apply(s, conv2d_stage, k_o) + s[cdata].compute_at(s[conv2d_stage], k_o) + s[ckernel].compute_at(s[conv2d_stage], k_o) # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], load_inp) - s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt) - s[conv2d_stage].tensorize(x_bi, gemm) - s[output].pragma(x_co1, store_out) + s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) + s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[conv2d_stage].tensorize(x_bi, env.gemm) + s[output].pragma(x_co1, env.dma_copy) return s +<<<<<<< HEAD class Conv2DSchedule(object): """ 2D convolution schedule object. @@ -508,3 +249,5 @@ def __str__(self): else: logging.warning("No valid schedule was found for the workload on current vta configuration") break +======= +>>>>>>> autotvm support for conv2d operator diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 8a03cb020260..56b66bdb0101 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -16,6 +16,12 @@ # under the License. 
"""Testing if we can generate code in topi style""" +import os +import json +from collections import namedtuple + +import numpy as np + import tvm from tvm import autotvm from tvm.contrib import util @@ -24,10 +30,30 @@ import topi.testing import vta import vta.testing -import numpy as np - -Workload = vta.top.vta_conv2d.Workload - +from vta.testing import simulator + +Workload = namedtuple("Conv2DWorkload", + ['batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +# ResNet18 workloads +resnet_wkls = [ + # Workloads of resnet18 on imagenet + # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), + ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), + ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C7', Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C8', Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C9', Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C10', Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C11', Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C12', Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), +] + +# FIXME: we need a custom clip operator to circumvent a pattern detection limitation @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" @@ -37,249 +63,168 @@ def my_clip(x, a_min, a_max): x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return x -def test_cpu_conv2d(): - def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True): - data_shape = (batch_size, wl.in_filter, wl.height, wl.width) - kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - res_conv = topi.nn.conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), - strides=(wl.hstride, wl.wstride), - dilation=(1, 1), - out_dtype="int32") - res = topi.right_shift(res_conv, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - - a_shape = (batch_size, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype - assert wl.hpad == wl.wpad - padding = wl.hpad - - @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") - def get_ref_data(): - a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) - w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) - a_np = np.abs(a_np) - w_np = np.abs(w_np) - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - - def verify(s, check_correctness): - mod = tvm.build(s, [data, kernel, res], - 
target_host=env.target_host, - name="conv2d") - temp = util.tempdir() - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.cpu(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - res_shape = topi.util.get_const_tuple(res.shape) - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_orig, ctx) - kernel_arr = tvm.nd.array(kernel_orig, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=5) - cost = time_f(data_arr, kernel_arr, res_arr) - res_unpack = res_arr.asnumpy() - if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref = np.clip(res_ref, 0, 127).astype("int8") - tvm.testing.assert_allclose(res_unpack, res_ref) - return cost - - def conv_normal(print_ir): - print("----- CONV2D CPU End-to-End Test-------") - s = topi.generic.schedule_conv2d_nchw([res]) - if print_ir: - print(tvm.lower(s, [data, kernel, res], simple_mode=True)) - cost = verify(s, True) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - conv_normal(False) - - def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - batch_size = 1 - for i in range(1, len(resnet)): - wl = resnet[i] - key = "resnet-cfg[%d]" % i - print("key=%s" % key) - print(wl) - with tvm.target.create("llvm -device=vtacpu"): - run_cpu_conv2d(env, remote, key, batch_size, wl) - - # load pre-tuned operator parameters for ARM CPU - autotvm.tophub.check_backend('vta') - with autotvm.tophub.context('llvm -device=vtacpu'): - vta.testing.run(_run) - - -def test_vta_conv2d(): - def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): - data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN, - wl.height, wl.width, env.BATCH, env.BLOCK_IN) +def run_conv2d(env, remote, wl, target, + check_correctness=True, print_ir=False, + samples=4, profileOnly=False): + + # Workload assertions + assert wl.hpad == wl.wpad + + # Perform packing only if we are targeting the accelerator + if "arm_cpu" in target.keys: + data_pack = False + layout = "NCHW" + elif "vta" in target.keys: + data_pack = True + layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + + # Derive shapes depending upon packing + a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + b_shape = (wl.batch, wl.out_filter, 1, 1) + if data_pack: + data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, + wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) - bias_shape = (1, wl.out_filter//env.BLOCK_OUT, 1, 
1, env.BATCH, env.BLOCK_OUT) - - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) - - res_conv = vta.top.packed_conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride)) - res = topi.right_shift(res_conv, 8) + bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + else: + data_shape = a_shape + kernel_shape = w_shape + bias_shape = b_shape + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + + # Define base computation schedule + with target: + res = topi.nn.conv2d( + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + layout, env.acc_dtype) + res = topi.right_shift(res, 8) res = topi.add(res, bias) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - - a_shape = (batch_size, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype - assert wl.hpad == wl.wpad - padding = wl.hpad - - @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") - def get_ref_data(): - a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) - w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) - a_np = np.abs(a_np) - w_np = np.abs(w_np) - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - def verify(s, check_correctness): - mod = vta.build(s, [data, kernel, bias, res], "ext_dev", - env.target_host, name="conv2d") - temp = util.tempdir() - - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.ext_dev(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") - bias_orig = np.abs(bias_orig) - - data_packed = data_orig.reshape( - batch_size//env.BATCH, env.BATCH, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, - wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) - kernel_packed = kernel_orig.reshape( - wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, - wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) - bias_packed = bias_orig.reshape( - 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - res_shape = topi.util.get_const_tuple(res.shape) - - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_packed, ctx) - kernel_arr = tvm.nd.array(kernel_packed, ctx) - bias_arr = tvm.nd.array(bias_packed, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=5) + res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) + res = topi.cast(res, env.out_dtype) + # Derive base schedule + s = 
topi.generic.schedule_conv2d_nchw([res]) + if print_ir: + print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) + + # Derive number of ops + fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 + fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 + num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter + + # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") + def get_ref_data(): + # derive min max for act, wgt, and bias types (max non inclusive) + a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) + w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) + b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2) + a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) + w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) + b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) + r_np = topi.testing.conv2d_nchw_python( + a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype) + return a_np, w_np, b_np, r_np + + # Data in original format + data_np, kernel_np, bias_np, res_ref = get_ref_data() + if data_pack: + data_np = data_np.reshape( + wl.batch//env.BATCH, env.BATCH, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) + kernel_np = kernel_np.reshape( + wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) + bias_np = bias_np.reshape( + wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + + # Build + if "vta" in target.keys: + mod = vta.build(s, [data, kernel, bias, res], + target=target, + target_host=env.target_host, + name="conv2d") + else: + mod = tvm.build(s, [data, kernel, bias, res], + target=target, + target_host=env.target_host, + name="conv2d") + temp = util.tempdir() + mod.save(temp.relpath("conv2d.o")) + remote.upload(temp.relpath("conv2d.o")) + f = remote.load_module("conv2d.o") + ctx = remote.context(str(target)) + + res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + data_arr = tvm.nd.array(data_np, ctx) + kernel_arr = tvm.nd.array(kernel_np, ctx) + bias_arr = tvm.nd.array(bias_np, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=samples) + + # In vta sim mode, collect simulator runtime statistics + stats = {} + cost = None + if env.TARGET == "sim": + # Check if we're in local RPC mode (allows us to rebuild the + # runtime on the fly when varying the VTA designs) + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote.get_function("vta.simulator.profiler_clear")() + if profileOnly: + remote.get_function("vta.simulator.profiler_debug_mode")(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - res_unpack = res_arr.asnumpy().transpose( - (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) - if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref += bias_orig.reshape(wl.out_filter, 1, 1) - res_ref = np.clip(res_ref, 0, 127).astype("int8") - tvm.testing.assert_allclose(res_unpack, res_ref) - return cost - - def conv_normal(print_ir): - print("----- CONV2D End-to-End Test-------") - with 
vta.build_config(): - s = vta.top.schedule_packed_conv2d([res]) - if print_ir: - print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) - cost = verify(s, True) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - conv_normal(False) - + stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) + else: + simulator.clear_stats() + if profileOnly: + simulator.debug_mode(1) + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + stats = simulator.stats() + else: + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + + # Check correctness + correct = False + if check_correctness: + res_orig = res_arr.asnumpy() + if data_pack: + res_orig = res_orig.transpose( + (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) + res_ref = res_ref >> 8 + res_ref += bias_np.reshape(wl.out_filter, 1, 1) + res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) + res_ref = res_ref.astype(env.out_dtype) + correct = np.allclose(res_orig, res_ref) + + gops = (num_ops / cost.mean) / float(10 ** 9) + status = "PASSED" if correct else "FAILED" + if "arm_cpu" in target.keys: + device = "CPU" + elif "vta" in target.keys: + device = "VTA" + print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + + return correct, cost, stats + +def test_conv2d(device="vta"): def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - - batch_size = 1 - for i in range(0, len(resnet)): - wl = resnet[i] - key = "resnet-cfg[%d]" % i - print("key=%s" % key) - print(wl) - run_vta_conv2d(env, remote, key, batch_size, wl) - + if device == "vta": + target = env.target + elif device == "arm_cpu": + target = env.target_vta_cpu + with autotvm.tophub.context(target): # load pre-tuned schedule parameters + for _, wl in resnet_wkls: + print(wl) + run_conv2d(env, remote, wl, target) vta.testing.run(_run) - if __name__ == "__main__": - test_cpu_conv2d() - test_vta_conv2d() + test_conv2d(device="arm_cpu") + test_conv2d(device="vta") From e9c995bb98fae8c31fd680e92f7fb124822e7551 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 21:43:49 -0700 Subject: [PATCH 002/126] removing progileOnly option --- vta/tests/python/integration/test_benchmark_topi_conv2d.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 56b66bdb0101..9ae39bca63d2 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -65,7 +65,7 @@ def my_clip(x, a_min, a_max): def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, - samples=4, profileOnly=False): + samples=4): # Workload assertions 
assert wl.hpad == wl.wpad @@ -177,14 +177,10 @@ def get_ref_data(): local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: remote.get_function("vta.simulator.profiler_clear")() - if profileOnly: - remote.get_function("vta.simulator.profiler_debug_mode")(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) else: simulator.clear_stats() - if profileOnly: - simulator.debug_mode(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) stats = simulator.stats() else: From 25713863bc3da4a6acdd7bc9ed8226f93a59b7d8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 21:54:13 -0700 Subject: [PATCH 003/126] removing unsupported layer --- vta/tests/python/integration/test_benchmark_topi_conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 9ae39bca63d2..28c8af4283ce 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -41,7 +41,7 @@ # Workloads of resnet18 on imagenet # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), - ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), + # ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), # this layer does not appear in ResNet ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), From 77e9191fac344d70dbb77cf898bf3dc8b86d7c0f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 22:00:02 -0700 Subject: [PATCH 004/126] fixing bare metal test build --- vta/python/vta/pkg_config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 2c30414ace1a..3977d5aa2e43 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -77,8 +77,6 @@ def __init__(self, cfg, proj_root): if self.target == "pynq": self.ldflags = [ "-L/usr/lib", - "-L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/", - "-L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/", "-l:libcma.so"] else: self.ldflags = [] From f87417ac35aa9a6c0bb7ec52c27cfdf862d538ee Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 8 May 2019 23:26:49 -0700 Subject: [PATCH 005/126] refactoring resnet WIP --- vta/tutorials/resnet.py | 256 +++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 132 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index df3bb0607284..7930bfe750c6 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -37,7 +37,7 @@ import numpy as np import requests -from matplotlib import pyplot as plt +#from matplotlib import pyplot as plt from PIL import Image import tvm @@ -82,67 +82,6 @@ def classify(m, image): tcost = "t={0:.2f}s".format(tcost.mean) return tcost + " {}".format(synset[top]) -# Helper function to compile the NNVM graph -# Takes in a path to a graph file, params file, and device target -# Returns the NNVM graph object, a compiled library object, and the params dict -def generate_graph(graph_fn, params_fn, device="vta"): - # Measure build start time - build_start = time.time() - - # Derive the TVM target - target = 
tvm.target.create("llvm -device={}".format(device)) - - # Derive the LLVM compiler flags - # When targetting the Pynq, cross-compile to ARMv7 ISA - if env.TARGET == "sim": - target_host = "llvm" - elif env.TARGET == "pynq": - target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon" - - # Load the ResNet-18 graph and parameters - sym = nnvm.graph.load_json(open(graph_fn).read()) - params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) - - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): - if target.device_name != "vta": - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - else: - with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) - - # Send the inference library over to the remote RPC server - remote.upload(temp.relpath("graphlib.o")) - lib = remote.load_module("graphlib.o") - - # Measure build time - build_time = time.time() - build_start - print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) - - return graph, lib, params - ###################################################################### # Download ResNet Model @@ -169,7 +108,7 @@ def generate_graph(graph_fn, params_fn, device="vta"): synset = eval(open(os.path.join(data_dir, categ_fn)).read()) # Download pre-tuned op parameters of conv2d for ARM CPU used in VTA -autotvm.tophub.check_backend('vta') +# autotvm.tophub.check_backend('vta') ###################################################################### @@ -213,21 +152,74 @@ def generate_graph(graph_fn, params_fn, device="vta"): # ------------------------ # Build the ResNet graph runtime, and configure the parameters. -# Set ``device=vtacpu`` to run inference on the CPU +# Set ``device=arm_cpu`` to run inference on the CPU # or ``device=vta`` to run inference on the FPGA. 
device = "vta" -# Device context -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +# Derive the TVM target +if device == "vta": + target = env.target +elif device == "arm_cpu": + target = env.target_vta_cpu +ctx = remote.context(str(target)) + +# TVM module +m = None + +with autotvm.tophub.context(target): + + graph_fn = os.path.join(data_dir, graph_fn) + params_fn= os.path.join(data_dir, params_fn) + + # Measure build start time + build_start = time.time() + + # Load the ResNet-18 graph and parameters + sym = nnvm.graph.load_json(open(graph_fn).read()) + params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) + + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + if target.device_name != "vta": + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=env.target_host) + + # Save the compiled inference graph library + assert tvm.module.enabled("rpc") + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + + # Send the inference library over to the remote RPC server + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Measure build time + build_time = time.time() - build_start + print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) -# Build the graph runtime -graph, lib, params = generate_graph(os.path.join(data_dir, graph_fn), - os.path.join(data_dir, params_fn), - device) -m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, ctx) -# Set the parameters -m.set_input(**params) + # Set the parameters + m.set_input(**params) ###################################################################### # Run ResNet-18 inference on a sample image @@ -241,8 +233,8 @@ def generate_graph(graph_fn, params_fn, device="vta"): response = requests.get(image_url) image = Image.open(BytesIO(response.content)).resize((224, 224)) # Show Image -plt.imshow(image) -plt.show() +# plt.imshow(image) +# plt.show() # Set the input image = process_image(image) m.set_input('data', image) @@ -271,60 +263,60 @@ def generate_graph(graph_fn, params_fn, device="vta"): # Comment the `if False:` out to run the demo # Early exit - remove for Demo -if False: - - import cv2 - import pafy - from IPython.display import clear_output - - # Helper to crop an image to a square (224, 224) - # Takes in an Image object, returns an Image object - def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - - # 16:16 inches - plt.rcParams['figure.figsize'] = [16, 16] - - # Stream the video in - url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" - video = pafy.new(url) - best = video.getbest(preftype="mp4") - cap = cv2.VideoCapture(best.url) - - # Process one frame 
out of every 48 for variety - count = 0 - guess = "" - while(count<2400): - - # Capture frame-by-frame - ret, frame = cap.read() - - # Process one every 48 frames - if count % 48 == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = Image.fromarray(frame) - # Crop and resize - thumb = np.array(thumbnailify(frame)) - image = process_image(thumb) - guess = classify(m, image) - - # Insert guess in frame - frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) - cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - - plt.imshow(thumb) - plt.axis('off') - plt.show() - if cv2.waitKey(1) & 0xFF == ord('q'): - break - clear_output(wait=True) - - count += 1 - - # When everything done, release the capture - cap.release() - cv2.destroyAllWindows() +# if False: + +# import cv2 +# import pafy +# from IPython.display import clear_output + +# # Helper to crop an image to a square (224, 224) +# # Takes in an Image object, returns an Image object +# def thumbnailify(image, pad=15): +# w, h = image.size +# crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) +# image = image.crop(crop) +# image = image.resize((224, 224)) +# return image + +# # 16:16 inches +# plt.rcParams['figure.figsize'] = [16, 16] + +# # Stream the video in +# url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" +# video = pafy.new(url) +# best = video.getbest(preftype="mp4") +# cap = cv2.VideoCapture(best.url) + +# # Process one frame out of every 48 for variety +# count = 0 +# guess = "" +# while(count<2400): + +# # Capture frame-by-frame +# ret, frame = cap.read() + +# # Process one every 48 frames +# if count % 48 == 1: +# frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) +# frame = Image.fromarray(frame) +# # Crop and resize +# thumb = np.array(thumbnailify(frame)) +# image = process_image(thumb) +# guess = classify(m, image) + +# # Insert guess in frame +# frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) +# cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) + +# plt.imshow(thumb) +# plt.axis('off') +# plt.show() +# if cv2.waitKey(1) & 0xFF == ord('q'): +# break +# clear_output(wait=True) + +# count += 1 + +# # When everything done, release the capture +# cap.release() +# cv2.destroyAllWindows() From 72f7c40da852db08aca45396a2a7596fcb2c7d2f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 9 May 2019 18:47:11 -0700 Subject: [PATCH 006/126] VTA topi support fix for NNVM --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/op.py | 132 +++++++++++++++++++++++++++++++ vta/python/vta/top/vta_conv2d.py | 12 +-- 3 files changed, 135 insertions(+), 10 deletions(-) create mode 100644 vta/python/vta/top/op.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 7346c35506a2..d1f6ec4f3ec1 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,3 +1,4 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" +from . import op from . 
import vta_conv2d diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py new file mode 100644 index 000000000000..c288263e993a --- /dev/null +++ b/vta/python/vta/top/op.py @@ -0,0 +1,132 @@ +"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +from __future__ import absolute_import as _abs + +from collections import namedtuple +import logging + +import tvm +from tvm import autotvm +import topi + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn + +from ..environment import get_env + +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + +@tvm.register_func("nnvm.compiler.build_target", override=True) +def _build(funcs, target, target_host): + tvm_t = tvm.target.create(target) + if tvm_t.device_name == "vta": + return tvm.build(funcs, target="ext_dev", target_host=target_host) + if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": + return tvm.build(funcs, target=target_host) + return tvm.build(funcs, target=target) + +@tvm.register_func("nnvm.compiler.lower", override=True) +def _lower(sch, inputs, func_name, graph): + import traceback + # pylint: disable=broad-except + try: + f = tvm.lower(sch, inputs, name=func_name) + if "quantized_conv2d" in func_name: + logging.info(graph.ir(join_entry_attrs=["shape"])) + except Exception: + msg = traceback.format_exc() + msg += "Error during compile graph\n" + msg += "--------------------------\n" + msg += graph.ir(join_entry_attrs=["shape"]) + raise RuntimeError(msg) + return f if isinstance( + f, (tvm.container.Array, tuple, list)) else [f] + +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + +@reg.register_compute("clip", level=15) +def compute_clip(attrs, inputs, _): + """ Clip operator. """ + x = inputs[0] + a_min = attrs.get_float("a_min") + a_max = attrs.get_float("a_max") + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + with tvm.tag_scope(topi.tag.ELEMWISE): + x = tvm.compute( + x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute( + x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +@reg.register_compute("conv2d", level=15) +def compute_conv2d(attrs, inputs, out): + """ 2D convolution algorithm. 
+ """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + inputs = list(inputs) + w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) + assert inputs[1].dtype == "int8" + + # Apply bit packing if necessary + if w_pack_factor != 1: + kshape = list(topi.util.get_const_tuple(inputs[1].shape)) + kshape[-1] *= w_pack_factor + inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) + + return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + else: + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, out) + +@reg.register_schedule("conv2d", level=15) +def schedule_conv2d(attrs, outs, target): + """ 2D convolution schedule. + """ + layout = attrs["layout"] + groups = attrs.get_int('groups') + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + if groups == 1: + return topi.generic.schedule_conv2d_nchw(outs) + else: + return topi.generic.schedule_group_conv2d_nchw(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + +@reg.register_alter_op_layout("conv2d", level=15) +def alter_conv2d_layout(attrs, inputs, out): + layout = attrs['layout'] + if is_packed_layout(layout): + return None + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 681418d6ecb1..78db543d4774 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -16,22 +16,14 @@ # under the License. 
"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +import numpy as np import tvm from tvm import autotvm import topi -import numpy as np - +from .op import is_packed_layout from ..environment import get_env -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') def packed_conv2d(cfg, data, From bb8093d631cb72260ff9dbfd7108db9fe4fbbd95 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 10 May 2019 13:44:08 -0700 Subject: [PATCH 007/126] fixing resnet18 tutorial to work with TOPI --- vta/tutorials/resnet.py | 135 +++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 71 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 7930bfe750c6..13161586480e 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -37,7 +37,7 @@ import numpy as np import requests -#from matplotlib import pyplot as plt +from matplotlib import pyplot as plt from PIL import Image import tvm @@ -107,10 +107,6 @@ def classify(m, image): # Read in ImageNet Categories synset = eval(open(os.path.join(data_dir, categ_fn)).read()) -# Download pre-tuned op parameters of conv2d for ARM CPU used in VTA -# autotvm.tophub.check_backend('vta') - - ###################################################################### # Setup the Pynq Board's RPC Server # --------------------------------- @@ -152,16 +148,13 @@ def classify(m, image): # ------------------------ # Build the ResNet graph runtime, and configure the parameters. -# Set ``device=arm_cpu`` to run inference on the CPU +# Set ``device=vtacpu`` to run inference on the CPU # or ``device=vta`` to run inference on the FPGA. 
device = "vta" -# Derive the TVM target -if device == "vta": - target = env.target -elif device == "arm_cpu": - target = env.target_vta_cpu -ctx = remote.context(str(target)) +# TVM target and context +target = tvm.target.create("llvm -device={}".format(device)) +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) # TVM module m = None @@ -233,8 +226,8 @@ def classify(m, image): response = requests.get(image_url) image = Image.open(BytesIO(response.content)).resize((224, 224)) # Show Image -# plt.imshow(image) -# plt.show() +plt.imshow(image) +plt.show() # Set the input image = process_image(image) m.set_input('data', image) @@ -263,60 +256,60 @@ def classify(m, image): # Comment the `if False:` out to run the demo # Early exit - remove for Demo -# if False: - -# import cv2 -# import pafy -# from IPython.display import clear_output - -# # Helper to crop an image to a square (224, 224) -# # Takes in an Image object, returns an Image object -# def thumbnailify(image, pad=15): -# w, h = image.size -# crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) -# image = image.crop(crop) -# image = image.resize((224, 224)) -# return image - -# # 16:16 inches -# plt.rcParams['figure.figsize'] = [16, 16] - -# # Stream the video in -# url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" -# video = pafy.new(url) -# best = video.getbest(preftype="mp4") -# cap = cv2.VideoCapture(best.url) - -# # Process one frame out of every 48 for variety -# count = 0 -# guess = "" -# while(count<2400): - -# # Capture frame-by-frame -# ret, frame = cap.read() - -# # Process one every 48 frames -# if count % 48 == 1: -# frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) -# frame = Image.fromarray(frame) -# # Crop and resize -# thumb = np.array(thumbnailify(frame)) -# image = process_image(thumb) -# guess = classify(m, image) - -# # Insert guess in frame -# frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) -# cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - -# plt.imshow(thumb) -# plt.axis('off') -# plt.show() -# if cv2.waitKey(1) & 0xFF == ord('q'): -# break -# clear_output(wait=True) - -# count += 1 - -# # When everything done, release the capture -# cap.release() -# cv2.destroyAllWindows() +if False: + + import cv2 + import pafy + from IPython.display import clear_output + + # Helper to crop an image to a square (224, 224) + # Takes in an Image object, returns an Image object + def thumbnailify(image, pad=15): + w, h = image.size + crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) + image = image.crop(crop) + image = image.resize((224, 224)) + return image + + # 16:16 inches + plt.rcParams['figure.figsize'] = [16, 16] + + # Stream the video in + url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" + video = pafy.new(url) + best = video.getbest(preftype="mp4") + cap = cv2.VideoCapture(best.url) + + # Process one frame out of every 48 for variety + count = 0 + guess = "" + while(count<2400): + + # Capture frame-by-frame + ret, frame = cap.read() + + # Process one every 48 frames + if count % 48 == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = Image.fromarray(frame) + # Crop and resize + thumb = np.array(thumbnailify(frame)) + image = process_image(thumb) + guess = classify(m, image) + + # Insert guess in frame + frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) + cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) + + plt.imshow(thumb) + plt.axis('off') + plt.show() + if cv2.waitKey(1) & 0xFF == 
ord('q'): + break + clear_output(wait=True) + + count += 1 + + # When everything done, release the capture + cap.release() + cv2.destroyAllWindows() From 5f783556e0f60dcf54e9aff7b67e09db6be06d0e Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 10:51:34 -0700 Subject: [PATCH 008/126] adding bitpacking support by Marissa --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/bitpack.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 vta/python/vta/top/bitpack.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index d1f6ec4f3ec1..5d212cc313ba 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,4 +1,5 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" +from . import bitpack from . import op from . import vta_conv2d diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py new file mode 100644 index 000000000000..7b09ffbf43c0 --- /dev/null +++ b/vta/python/vta/top/bitpack.py @@ -0,0 +1,70 @@ +"""Bit packing operators""" +from __future__ import absolute_import as _abs + +import tvm +from topi import util + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn +from nnvm.top.tensor import _fschedule_broadcast + +def bitpack(data, bits, pack_type="int8", name="bitpack"): + """Packs lowest dimension into format needed by VTA + Parameters + ---------- + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data + Returns + ------- + packed : Tensor + The packed tensor. + """ + shape_vec = list(data.shape) + if pack_type == 'int8': + data_width = 8 + elif pack_type == 'int16': + data_width = 16 + elif pack_type == 'int32': + data_width = 32 + else: + raise RuntimeError("Unknown pack type %s" % pack_type) + assert data_width % bits == 0 + lanes = data_width // bits + + # Data must be in multiples of the data_width + assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" + shape_vec[-1] = shape_vec[-1] // lanes + oshape = tuple(shape_vec) + + def _bitpack(*indices): + ret = None + mask = tvm.const((1 << bits) - 1, pack_type) + for k in range(lanes): + idx = list(indices) + idx[-1] = idx[-1] * lanes + k + elem = data(*idx).astype(pack_type) + if k == 0: + ret = elem & mask + else: + val = (elem & mask) << tvm.const(k * bits, pack_type) + ret = ret | val + return ret + + return tvm.compute( + oshape, _bitpack, name=name, tag='bitpack') + + +@reg.register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, out): + lanes = attrs.get_int("lanes") + dtype = inputs[0].dtype + assert dtype == "int8" + width = 8 + assert width % lanes == 0 + bits = 8 // lanes + return bitpack(inputs[0], bits, dtype) + +reg.register_schedule("bitpack", _fschedule_broadcast) +reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file From 25c88978bdd1cf36db2e672f8653de6d436826c3 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:00:45 -0700 Subject: [PATCH 009/126] no support for bitpacking below 8bits for now --- vta/python/vta/top/op.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index c288263e993a..3fe9a5ed8e70 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -84,15 +84,7 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" assert 
env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) assert inputs[1].dtype == "int8" - - # Apply bit packing if necessary - if w_pack_factor != 1: - kshape = list(topi.util.get_const_tuple(inputs[1].shape)) - kshape[-1] *= w_pack_factor - inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) else: return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) From d15c97febe34f12ad35ccd1843b1c3f2ae1b16c4 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:16:48 -0700 Subject: [PATCH 010/126] bitpacking annotations --- src/relay/op/annotation/annotation.cc | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 428df2fb1115..e0c2b20dfa1f 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -83,5 +83,39 @@ TVM_ADD_FILELINE) return {topi::identity(inputs[0])}; }); +RELAY_REGISTER_OP("bitpack_start") +.describe(R"code( +Mark the start of bitpacking. +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + +RELAY_REGISTER_OP("bitpack_end") +.describe(R"code( +Mark the end of bitpacking. +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + } // namespace relay } // namespace tvm From 51463ffb72360ce7c6c93020ce321d66c50ee99b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:20:10 -0700 Subject: [PATCH 011/126] fix --- src/relay/op/annotation/annotation.cc | 4 ++-- src/relay/pass/fuse_ops.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index e0c2b20dfa1f..f09a3a22e3ab 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -58,7 +58,7 @@ RELAY_REGISTER_OP("on_device") ElemwiseArbitraryLayout); Expr StopFusion(Expr data) { - static const Op& op = Op::Get("annotation.stop_fusion"); + static const Op& op = Op::Get("stop_fusion"); return CallNode::make(op, {data}, Attrs{}, {}); } @@ -67,7 +67,7 @@ TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") return StopFusion(data); }); -RELAY_REGISTER_OP("annotation.stop_fusion") +RELAY_REGISTER_OP("stop_fusion") .describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" TVM_ADD_FILELINE) .set_num_inputs(1) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 9f940e54953b..9cd73171bfea 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -821,7 +821,7 @@ class FuseMutator : private ExprMutator { // Transform calls. 
Expr VisitExpr_(const CallNode* call) { - static const Op& stop_fusion = Op::Get("annotation.stop_fusion"); + static const Op& stop_fusion = Op::Get("stop_fusion"); if (call->op.as()) { // If it is a primitive op call // then we must have a group assignment for it already. From 8bea36836f1a8efc60324ec443e76a72e925b326 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:02:30 -0700 Subject: [PATCH 012/126] relay topi integtation for vta --- vta/python/vta/top/__init__.py | 2 + vta/python/vta/top/bitpack.py | 18 +++-- vta/python/vta/top/nnvm_bitpack.py | 70 ++++++++++++++++++ vta/python/vta/top/nnvm_op.py | 113 +++++++++++++++++++++++++++++ vta/python/vta/top/op.py | 95 +++++++----------------- vta/python/vta/top/vta_conv2d.py | 9 ++- 6 files changed, 228 insertions(+), 79 deletions(-) create mode 100644 vta/python/vta/top/nnvm_bitpack.py create mode 100644 vta/python/vta/top/nnvm_op.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 5d212cc313ba..f7b48c0bde2d 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -2,4 +2,6 @@ from . import bitpack from . import op +from . import nnvm_bitpack +from . import nnvm_op from . import vta_conv2d diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 7b09ffbf43c0..2265af4518b4 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -4,18 +4,20 @@ import tvm from topi import util -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn -from nnvm.top.tensor import _fschedule_broadcast +from tvm.relay.op.op import register_compute, register_schedule +from tvm.relay.op.op import register_pattern, OpPattern +from tvm.relay.op.op import schedule_injective def bitpack(data, bits, pack_type="int8", name="bitpack"): """Packs lowest dimension into format needed by VTA + Parameters ---------- pack_axis : int index of the axis to pack in data bit_axis : int index of axis to place bit axis in resulting packed data + Returns ------- packed : Tensor @@ -56,9 +58,9 @@ def _bitpack(*indices): oshape, _bitpack, name=name, tag='bitpack') -@reg.register_compute("bitpack", level=15) -def compute_bitpack(attrs, inputs, out): - lanes = attrs.get_int("lanes") +@register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, output_type, target): + lanes = attrs.lanes dtype = inputs[0].dtype assert dtype == "int8" width = 8 @@ -66,5 +68,5 @@ def compute_bitpack(attrs, inputs, out): bits = 8 // lanes return bitpack(inputs[0], bits, dtype) -reg.register_schedule("bitpack", _fschedule_broadcast) -reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file +register_schedule("bitpack", schedule_injective) +register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py new file mode 100644 index 000000000000..7b09ffbf43c0 --- /dev/null +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -0,0 +1,70 @@ +"""Bit packing operators""" +from __future__ import absolute_import as _abs + +import tvm +from topi import util + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn +from nnvm.top.tensor import _fschedule_broadcast + +def bitpack(data, bits, pack_type="int8", name="bitpack"): + """Packs lowest dimension into format needed by VTA + Parameters + ---------- + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data + Returns + 
------- + packed : Tensor + The packed tensor. + """ + shape_vec = list(data.shape) + if pack_type == 'int8': + data_width = 8 + elif pack_type == 'int16': + data_width = 16 + elif pack_type == 'int32': + data_width = 32 + else: + raise RuntimeError("Unknown pack type %s" % pack_type) + assert data_width % bits == 0 + lanes = data_width // bits + + # Data must be in multiples of the data_width + assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" + shape_vec[-1] = shape_vec[-1] // lanes + oshape = tuple(shape_vec) + + def _bitpack(*indices): + ret = None + mask = tvm.const((1 << bits) - 1, pack_type) + for k in range(lanes): + idx = list(indices) + idx[-1] = idx[-1] * lanes + k + elem = data(*idx).astype(pack_type) + if k == 0: + ret = elem & mask + else: + val = (elem & mask) << tvm.const(k * bits, pack_type) + ret = ret | val + return ret + + return tvm.compute( + oshape, _bitpack, name=name, tag='bitpack') + + +@reg.register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, out): + lanes = attrs.get_int("lanes") + dtype = inputs[0].dtype + assert dtype == "int8" + width = 8 + assert width % lanes == 0 + bits = 8 // lanes + return bitpack(inputs[0], bits, dtype) + +reg.register_schedule("bitpack", _fschedule_broadcast) +reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py new file mode 100644 index 000000000000..ce69b2b438d1 --- /dev/null +++ b/vta/python/vta/top/nnvm_op.py @@ -0,0 +1,113 @@ +"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +from __future__ import absolute_import as _abs + +import logging + +import tvm +import topi + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn + +from .vta_conv2d import is_packed_layout +from ..environment import get_env + +@tvm.register_func("nnvm.compiler.build_target", override=True) +def _build(funcs, target, target_host): + tvm_t = tvm.target.create(target) + if tvm_t.device_name == "vta": + return tvm.build(funcs, target="ext_dev", target_host=target_host) + if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": + return tvm.build(funcs, target=target_host) + return tvm.build(funcs, target=target) + +@tvm.register_func("nnvm.compiler.lower", override=True) +def _lower(sch, inputs, func_name, graph): + import traceback + # pylint: disable=broad-except + try: + f = tvm.lower(sch, inputs, name=func_name) + if "quantized_conv2d" in func_name: + logging.info(graph.ir(join_entry_attrs=["shape"])) + except Exception: + msg = traceback.format_exc() + msg += "Error during compile graph\n" + msg += "--------------------------\n" + msg += graph.ir(join_entry_attrs=["shape"]) + raise RuntimeError(msg) + return f if isinstance( + f, (tvm.container.Array, tuple, list)) else [f] + +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + +@reg.register_compute("clip", level=15) +def compute_clip(attrs, inputs, _): + """ Clip operator. 
""" + x = inputs[0] + a_min = attrs.get_float("a_min") + a_max = attrs.get_float("a_max") + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + with tvm.tag_scope(topi.tag.ELEMWISE): + x = tvm.compute( + x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute( + x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +@reg.register_compute("conv2d", level=15) +def compute_conv2d(attrs, inputs, out): + """ Compute definition of conv2d """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + inputs = list(inputs) + assert inputs[1].dtype == "int8" + return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + else: + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, out) + +@reg.register_schedule("conv2d", level=15) +def schedule_conv2d(attrs, outs, target): + """ Schedule definition of conv2d """ + layout = attrs["layout"] + groups = attrs.get_int('groups') + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + if groups == 1: + return topi.generic.schedule_conv2d_nchw(outs) + else: + return topi.generic.schedule_group_conv2d_nchw(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + +@reg.register_alter_op_layout("conv2d", level=15) +def alter_conv2d_layout(attrs, inputs, out): + layout = attrs['layout'] + if is_packed_layout(layout): + return None + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 3fe9a5ed8e70..7f3c58a46116 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -1,61 +1,27 @@ """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs -from collections import namedtuple import logging import tvm -from tvm import autotvm import topi -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn +from tvm.relay.op import op as reg +from tvm.relay.op.op import OpPattern +from tvm.relay.op.nn import _nn +from .vta_conv2d import is_packed_layout from ..environment import get_env -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - -@tvm.register_func("nnvm.compiler.build_target", override=True) -def _build(funcs, target, target_host): - tvm_t = tvm.target.create(target) - if tvm_t.device_name == "vta": - return tvm.build(funcs, target="ext_dev", target_host=target_host) - if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": - return 
tvm.build(funcs, target=target_host) - return tvm.build(funcs, target=target) - -@tvm.register_func("nnvm.compiler.lower", override=True) -def _lower(sch, inputs, func_name, graph): - import traceback - # pylint: disable=broad-except - try: - f = tvm.lower(sch, inputs, name=func_name) - if "quantized_conv2d" in func_name: - logging.info(graph.ir(join_entry_attrs=["shape"])) - except Exception: - msg = traceback.format_exc() - msg += "Error during compile graph\n" - msg += "--------------------------\n" - msg += graph.ir(join_entry_attrs=["shape"]) - raise RuntimeError(msg) - return f if isinstance( - f, (tvm.container.Array, tuple, list)) else [f] - # override to force partition at copy reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) @reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, _): +def compute_clip(attrs, inputs, output_type, target): """ Clip operator. """ x = inputs[0] - a_min = attrs.get_float("a_min") - a_max = attrs.get_float("a_max") + a_min = attrs.a_min + a_max = attrs.a_max const_min = tvm.const(a_min, x.dtype) const_max = tvm.const(a_max, x.dtype) with tvm.tag_scope(topi.tag.ELEMWISE): @@ -63,18 +29,17 @@ def compute_clip(attrs, inputs, _): x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") x = tvm.compute( x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -@reg.register_compute("conv2d", level=15) -def compute_conv2d(attrs, inputs, out): - """ 2D convolution algorithm. - """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] + return [x] + +@reg.register_compute("nn.conv2d", level=15) +def compute_conv2d(attrs, inputs, output_type, target): + """ Compute definition of conv2d """ + padding = topi.util.get_const_tuple(attrs.padding) + strides = topi.util.get_const_tuple(attrs.strides) + dilation = tuple([int(d) for d in attrs.dilation]) + groups = attrs.groups + layout = attrs.data_layout + out_dtype = attrs.out_dtype assert dilation == (1, 1), "not support dilate now" if is_packed_layout(layout): @@ -85,19 +50,18 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] else: - return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + return [topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype)] with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_conv2d(attrs, inputs, out) + return _nn.compute_conv2d(attrs, inputs, output_type, target) -@reg.register_schedule("conv2d", level=15) +@reg.register_schedule("nn.conv2d", level=15) def schedule_conv2d(attrs, outs, target): - """ 2D convolution schedule. 
- """ - layout = attrs["layout"] - groups = attrs.get_int('groups') + """ Schedule definition of conv2d """ + groups = attrs.groups + layout = attrs.data_layout if is_packed_layout(layout): target = tvm.target.create(target) @@ -113,12 +77,3 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) - -@reg.register_alter_op_layout("conv2d", level=15) -def alter_conv2d_layout(attrs, inputs, out): - layout = attrs['layout'] - if is_packed_layout(layout): - return None - - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 78db543d4774..1672af47ca0c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -21,9 +21,16 @@ from tvm import autotvm import topi -from .op import is_packed_layout from ..environment import get_env +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') def packed_conv2d(cfg, data, From 3f31c6c394bc5daf91ce60d54a8fe2b69a87a380 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:03:02 -0700 Subject: [PATCH 013/126] operator tagging for broadcast --- src/relay/backend/compile_engine.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index f11dd2875b80..7ae1befcfe89 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -170,7 +171,7 @@ class ScheduleGetter : LOG(FATAL) << "not handled"; return tvm::Expr(); } - }); + }, "compile_engine_const", topi::kBroadcast); scalars_.push_back(value->op); return {value}; } From e7f104991004e3201666e00afea8eb2f3e19d2d8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:03:23 -0700 Subject: [PATCH 014/126] invalid shape error --- topi/python/topi/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index d4e23be47e58..edf8ee11e884 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -23,6 +23,10 @@ from tvm.api import layout, bijective_layout from . import tag +class InvalidShapeError(ValueError): + """Invalid shape for a topi function. i.e. call winograd template for non-3x3 kernel)""" + pass + def traverse_inline(s, final_op, callback): """Traverse computation graph and do auto inline From 52c19f415d4cc82039f7b2d290a77eacb3b7d376 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:26:37 -0700 Subject: [PATCH 015/126] relay graph pack pass --- vta/python/vta/top/__init__.py | 4 +- vta/python/vta/top/graphpack.py | 277 ++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 vta/python/vta/top/graphpack.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index f7b48c0bde2d..5111035decd3 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,7 +1,9 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" from . import bitpack -from . import op +from .graphpack import graph_pack from . 
import nnvm_bitpack +from .nnvm_graphpack import nnvm_graph_pack from . import nnvm_op +from . import op from . import vta_conv2d diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py new file mode 100644 index 000000000000..3ce50d06dbda --- /dev/null +++ b/vta/python/vta/top/graphpack.py @@ -0,0 +1,277 @@ +"""A Relay implementation of graph packing.""" + +from tvm import relay +from tvm.relay import op +from tvm.relay import ExprMutator + +def _to_shape(shape): + return tuple(int(sh) for sh in shape) + +def _pack_batch_channel(data, dshape, bfactor, cfactor): + """Pack the data channel dimension. + """ + assert int(dshape[0]) % bfactor == 0 + assert int(dshape[1]) % cfactor == 0 + data = op.reshape(data, + newshape=(int(dshape[0]) // bfactor, bfactor, + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) + data = op.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _unpack_batch_channel(data, old_shape): + """Unpack the data channel dimension. + """ + data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) + data = op.reshape(data, newshape=old_shape) + return data + + +def _pack_weight(data, dshape, cfactor): + """Pack the weight into packed format. + """ + assert len(dshape) == 4 + assert int(dshape[0]) % cfactor == 0 + assert int(dshape[1]) % cfactor == 0 + data = op.reshape(data, + newshape=(int(dshape[0]) // cfactor, cfactor, + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) + data = op.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _pack_weight_conv2d_transpose(data, dshape, cfactor): + """Pack the weight into packed format. + """ + dshape = _to_shape(dshape) + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = op.reshape(data, + newshape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = op.transpose( + data, axes=(2, 0, 4, 5, 3, 1)) + return data + + +def _pack_bias(data, dshape, dtype, bfactor, cfactor): + """Pack the bias parameter. + """ + dshape = _to_shape(dshape) + assert len(dshape) == 3 + assert dshape[0] % cfactor == 0 + data = op.reshape(data, + newshape=(dshape[0] // cfactor, + cfactor, dshape[1], + dshape[2], 1)) + data = op.transpose( + data, axes=(0, 2, 3, 4, 1)) + + # broadcast batch dimension to bfactor + data = op.broadcast_to( + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + return data + + +def _get_shape(node): + """Get the shape of a node. + """ + return _to_shape(node.checked_type.shape) + +class ExprPack(ExprMutator): + def __init__(self, bfactor, cfactor, weight_bits): + self.bfactor = bfactor + self.cfactor = cfactor + self.weight_bits = weight_bits + self.start_pack = False + # Cache Operator the algorithm matches against. + self.bitpack_start = op.op.get('bitpack_start') + self.bitpack_end = op.op.get('bitpack_end') + self.conv2d = op.op.get("nn.conv2d") + self.conv2d_transpose = op.op.get("nn.conv2d_transpose") + self.add = op.op.get("add") + self.bias_add = op.op.get("nn.bias_add") + self.number_of_conv2d = 0 + super().__init__() + + def visit_call(self, call): + # First visit the children. + oshape = _get_shape(call) + odtype = call.checked_type.dtype + input_types = [arg.checked_type for arg in call.args] + args = [self.visit(arg) for arg in call.args] + + # Start and stop cases. 
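+        # Shape sketch (assuming bfactor=1, cfactor=16): at bitpack_start an
+        # NCHW tensor of shape (1, 64, 56, 56) is packed into
+        # (1, 4, 56, 56, 1, 16); bitpack_end later undoes the transpose and
+        # reshape to restore the original layout.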
+ if call.op == self.bitpack_start: + assert not self.start_pack + self.start_pack = True + return _pack_batch_channel(args[0], oshape, self.bfactor, self.cfactor) + elif call.op == self.bitpack_end: + if self.start_pack: + self.start_pack = False + data = args[0] + data_shape = _get_shape(call.args[0]) + return _unpack_batch_channel(data, data_shape) + else: + pass + if self.start_pack: + # Operator cases + if call.op == self.conv2d and odtype == 'int32': + self.number_of_conv2d += 1 + assert 8 % self.weight_bits == 0 + w_lanes = 8 // self.weight_bits + data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) + kernel_layout = "OIHW%do%di" % (self.cfactor, self.cfactor) + data, weight = args + data_shape = _to_shape(input_types[0].shape) + kernel_shape = _to_shape(input_types[1].shape) + kernel = _pack_weight(weight, kernel_shape, self.cfactor) + # insert bit packing when necessary + if w_lanes != 1: + assert 8 % w_lanes == 0 + kernel = op.bitpack(kernel, lanes=w_lanes) + conv2d = op.nn.conv2d( + data, + kernel, + strides=call.attrs.strides, + padding=call.attrs.padding, + dilation=call.attrs.dilation, + groups=call.attrs.groups, + channels=call.attrs.channels, + kernel_size=call.attrs.kernel_size, + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype=call.attrs.out_dtype) + return conv2d + elif call.op == self.conv2d_transpose and odtype == 'int32': + self.number_of_conv2d += 1 + assert 8 % self.weight_bits == 0 + w_lanes = 8 // self.weight_bits + if self.start_pack: + data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) + kernel_layout = "IOHW%di%do" % (self.cfactor, self.cfactor) + data, weight = args + data_shape = _to_shape(input_types[0].shape) + kernel_shape = _to_shape(input_types[1].shape) + kernel = _pack_weight_conv2d_transpose(weight, kernel_shape, self.cfactor) + conv2d = op.nn.conv2d_transpose( + data, + kernel, + strides=call.attrs.strides, + padding=call.attrs.padding, + dilation=call.attrs.dilation, + groups=call.attrs.groups, + channels=call.attrs.channels, + kernel_size=call.attrs.kernel_size, + data_layout=data_layout, + kernel_layout=kernel_layout, + output_padding=call.attrs.output_padding, + out_dtype=call.attrs.out_dtype) + return conv2d + elif call.op == self.add and tuple(input_types[0].shape) == tuple(input_types[1].shape): + pass + elif call.op == self.add and len(input_types[1].shape) == 3: + data, bias = args + bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + return relay.Call(self.add, [data, bias]) + elif self.start_pack and call.op == self.bias_add: + data, bias = args + bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + return relay.Call(self.add, [data, bias]) + elif self.start_pack and call.op == op.op.get('cast') and input_types[0].dtype == 'int32': + cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) + return relay.Call(op.op.get('copy'), [cast]) + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + +class BT(Exception): + pass +def get_subgraph(expr, start_name, stop_name): + "we assume stop_name only appear once for simplicity." + "this constraint will be lifted in the future." 
+ "bitpack_start and bitpack_end is both inclusive" + bitpack_start = op.op.get('bitpack_start') + bitpack_end = op.op.get('bitpack_end') + anf = relay.ir_pass.to_a_normal_form(expr) + def recursion(anf, start_found, stop_found): + if isinstance(anf, relay.expr.Function): + return relay.expr.Function(anf.params, recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) + elif isinstance(anf, relay.expr.Let): + value = anf.value + if isinstance(value, relay.expr.Call): + if isinstance(value.op, relay.op.Op): + if value.op.name == start_name and not start_found: + value = relay.expr.Call(bitpack_start, [value]) + start_found = True + elif value.op.name == stop_name: + raise BT() + try: + return relay.expr.Let(anf.var, value, recursion(anf.body, start_found, stop_found)) + except BT: + assert start_found + assert not stop_found + stop_found = True + value = relay.expr.Call(bitpack_end, [value]) + return relay.expr.Let(anf.var, value, anf.body) # todo: check anf.body has no more stop_name beside that one + else: + assert start_found + assert stop_found + return anf + annotated = recursion(anf, False, False) + return relay.ir_pass.infer_type(relay.ir_pass.to_graph_normal_form(annotated)) + +def graph_pack(expr, + bfactor, + cfactor, + weight_bits, + start_name="nn.max_pool2d", + stop_name="nn.global_avg_pool2d"): + """Pack the graph into batch&channel packed format. + + Parameters + ---------- + expr : relay.Expr + The input program. + + bfactor : int + The packing factor in batch + + cfactor : int + The packing factor in channel + + weight_bits: int + The bit-width of the weights. + + start_name: str, optional + Start packing from certain known node. + + stop_name: str, optional + Stop packing from certain known node. + + Returns + ------- + expr : Expr + The transformed expression. 
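+
+    Example
+    -------
+    A minimal sketch, assuming a quantized Relay function ``func`` and the
+    default VTA configuration (``env.BATCH == 1``, ``env.BLOCK_OUT == 16``,
+    ``env.WGT_WIDTH == 8``)::
+
+        func = graph_pack(func, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH)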
+ """ + assert isinstance(expr, relay.Function) + expr = get_subgraph(expr, start_name, stop_name) + print("Before", expr.astext(show_meta_data=False)) + expr = relay.ir_pass.infer_type(expr) + packer = ExprPack( + bfactor, cfactor, + weight_bits) + expr = packer.visit(expr) + print("After", expr.astext(show_meta_data=False)) + assert not packer.start_pack + return relay.ir_pass.infer_type(expr) + From ab01f07d197f413dde3342b863e5cafa9064a113 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:49:17 -0700 Subject: [PATCH 016/126] test script for relay to vta compilation --- vta/scripts/relay_to_vta.py | 284 ++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 vta/scripts/relay_to_vta.py diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py new file mode 100644 index 000000000000..11cddef8b976 --- /dev/null +++ b/vta/scripts/relay_to_vta.py @@ -0,0 +1,284 @@ +"""Perform inference on VTA using Relay.""" + +import argparse, json, requests, time +from io import BytesIO +from mxnet.gluon.model_zoo import vision +import numpy as np +from os.path import join, isfile +from PIL import Image + +import tvm +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +import vta +from vta.testing import simulator +from vta.top import graph_pack + +parser = argparse.ArgumentParser(description='Train a model for image classification.') +parser.add_argument('--model', type=str, required=True, + help='Input model name.') +parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') +parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') +parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') +parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') +parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + +opt = parser.parse_args() + +if 'mobilenet' in opt.model: + opt.start_name = 'nn.relu' +elif 'gan' in opt.model: + opt.start_name = 'reshape0' + opt.stop_name = 'copy2' +elif 'rnn' in opt.model: + opt.start_name = 'reshape0' + opt.stop_name = 'reshape1' + +# Helper function to read in image +# Takes in Image object, returns an ND array +def process_image(image): + # Convert to neural network input format + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + + return tvm.nd.array(image.astype("float32")) + +def mark_nop(graph, + conv_layer=-1, + skip_conv_layer=(), + reverse=False, + conv2d_only=False): + """Helper function to mark certain op as nop + + Useful to debug performance issues. 
+ """ + jgraph = json.loads(graph.json()) + counter = 0 + for _, node in enumerate(jgraph["nodes"]): + op_name = node["op"] + if op_name != "tvm_op": + continue + attrs = node["attrs"] + func_name = attrs["func_name"] + + if func_name.find("conv2d") != -1: + if conv_layer >= 0: + if counter != conv_layer: + attrs["func_name"] = "__nop" + if counter in skip_conv_layer: + attrs["func_name"] = "__nop" + counter += 1 + else: + if conv_layer >= 0: + attrs["func_name"] = "__nop" + attrs["func_name"] = "__nop" + + if reverse: + if attrs["func_name"] != "__nop": + attrs["func_name"] = "__nop" + else: + attrs["func_name"] = func_name + + if conv2d_only: + if attrs["func_name"].find("conv2d") == -1: + attrs["func_name"] = "__nop" + + graph = nnvm.graph.load_json(json.dumps(jgraph)) + return graph + + +def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): + # Read in ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + # Read in test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + # Read in test image + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)).resize((224, 224)) + # Set the input + image = process_image(image) + if "gan" in opt.model or "rnn" in opt.model: + # non-classification networks require custom input shapes and out shapes + m.set_input('data', tvm.nd.array( + 10 * np.random.uniform(size=shape_dict['data']).astype(dtype_dict['data']))) + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("Performed inference in %.2fms/samlple (std = %.2f)" % (mean, std)) + else: + image = np.repeat(image.asnumpy(), env.BATCH, axis=0) + m.set_input('data', image) + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + + if opt.debug_profile: + m.run() + + # Get classification results + tvm_output = m.get_output(0, + tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) + top_categories = np.argsort(tvm_output.asnumpy()[0]) + + # Report top-5 classification results + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("%s Prediction" % opt.model) + print(" #1:", synset[top_categories[-1]]) + print(" #2:", synset[top_categories[-2]]) + print(" #3:", synset[top_categories[-3]]) + print(" #4:", synset[top_categories[-4]]) + print(" #5:", synset[top_categories[-5]]) + print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + +###################################################################### +# Setup the Pynq Board's RPC Server +# --------------------------------- +# Build the RPC server's VTA runtime and program the Pynq FPGA. + +def run(device = "vta"): + env = vta.get_env() + # Measure build start time + reconfig_start = time.time() + + # We configure both the bitstream and the runtime system on the Pynq + # to match the VTA configuration specified by the vta_config.json file. 
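+    # (When env.TARGET == "sim" this step is skipped and a local RPC session
+    # is created further below instead.)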
+ if env.TARGET != "sim": + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, '10.77.1.109', 9190, timeout=10000) + + # Reconfigure the JIT runtime + vta.reconfig_runtime(remote) + + # Program the FPGA with a pre-compiled VTA bitstream. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. + elif env.TARGET == "sim": + remote = rpc.LocalSession() + + # TVM target and context + target = tvm.target.create("llvm -device={}".format(device)) + ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + + with autotvm.tophub.context(target): + + # Measure build start time + build_start = time.time() + + # Derive the LLVM compiler flags + # When targetting the Pynq/Ultra-96, cross-compile to ARM ISA + target_host = env.target_host + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + if "gan" in opt.model: + shape_dict = {"data": (env.BATCH, 100)} + elif 'rnn' in opt.model: + batch_size, seq_len, hidden_dim = 4, 1, 640 + begin_state_shape = (batch_size, hidden_dim, 1, 1) + shape_dict = {"data": (seq_len, batch_size), + "cell_l0_begin_state_0": begin_state_shape, + "cell_l1_begin_state_0": begin_state_shape} + dtype_dict = {"data": "int32", + "cell_l0_begin_state_0": 'float32', + "cell_l1_begin_state_0": 'float32'} + else: + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_graph, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_graph = relay.quantize.quantize(relay_graph, params=params) + + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_graph = graph_pack( + relay_graph, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + + relay_graph = relay.ir_pass.fold_constant(relay_graph) + + # Compile Relay program. 
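+    # For the VTA target the build below is additionally wrapped in
+    # vta.build_config() so the VTA-specific lowering is applied; the CPU
+    # path uses a plain relay.build().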
+ with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}): + if target.device_name != "vta": + # import pdb; pdb.set_trace() + graph, lib, params = relay.build( + relay_graph, target=target, + params=params, target_host=target_host) + else: + # import pdb; pdb.set_trace() + with vta.build_config(): + graph, lib, params = relay.build( + relay_graph, target=target, + params=params, target_host=target_host) + + + # Save the compiled inference graph library + assert tvm.module.enabled("rpc") + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + + # Send the inference library over to the remote RPC server + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Measure build time + build_time = time.time() - build_start + print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) + + cpu_skip_layer = (0,) if "gan" in opt.model else (3,) + # profile script, set this to False to run end to end + if opt.debug_fpga_only: + graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer) + elif opt.debug_cpu_only: + graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer, reverse=True) + elif opt.run_conv_layer: + conv_set = tuple(int(x) for x in opt.run_conv_layer.split(",")) + graph = mark_nop(graph, + skip_conv_layer=conv_set, + reverse=True, + conv2d_only=True) + + if opt.debug_profile: + m = debug_runtime.create(graph, lib, ctx) + else: + m = graph_runtime.create(graph, lib, ctx) + + # Set the parameters + m.set_input(**params) + demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict) + +run(opt.device) From 32773ad730d07011828b255fe0e36aa558e02b22 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 15 May 2019 14:37:13 -0700 Subject: [PATCH 017/126] adding nnvm graphpack: --- vta/python/vta/top/nnvm_graphpack.py | 206 +++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 vta/python/vta/top/nnvm_graphpack.py diff --git a/vta/python/vta/top/nnvm_graphpack.py b/vta/python/vta/top/nnvm_graphpack.py new file mode 100644 index 000000000000..1f713acd3e27 --- /dev/null +++ b/vta/python/vta/top/nnvm_graphpack.py @@ -0,0 +1,206 @@ +"""An NNVM implementation of graph packing.""" + +import nnvm +from nnvm.compiler import graph_attr, graph_util + +def _pack_batch_channel(data, dshape, bfactor, cfactor): + """Pack the data channel dimension. + """ + assert dshape[0] % bfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // bfactor, bfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _unpack_batch_channel(data, old_shape): + """Unpack the data channel dimension. + """ + data = nnvm.sym.transpose(data, axes=(0, 4, 1, 5, 2, 3)) + data = nnvm.sym.reshape(data, shape=old_shape) + return data + + +def _pack_weight(data, dshape, cfactor): + """Pack the weight into packed format. + """ + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _pack_weight_conv2d_transpose(data, dshape, cfactor): + """Pack the weight into packed format. 
+ """ + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(2, 0, 4, 5, 3, 1)) + return data + + +def _pack_bias(data, dshape, bfactor, cfactor): + """Pack the bias parameter. + """ + assert len(dshape) == 3 + assert dshape[0] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, + cfactor, dshape[1], + dshape[2], 1)) + data = nnvm.sym.transpose( + data, axes=(0, 2, 3, 4, 1)) + # broadcast batch dimension to bfactor + data = nnvm.sym.broadcast_to( + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + return data + + +def _get_shape(sym, shape_dict): + """Get the shape of a node. + """ + return graph_util.infer_shape( + nnvm.graph.create(sym), **shape_dict)[1][0] + + +def nnvm_graph_pack(graph, + shape_dict, + bfactor, + cfactor, + weight_bits, + start_name="max_pool2d0", + stop_name="global_avg_pool2d0"): + """Pack the graph into batch&channel packed format. + + Parameters + ---------- + graph : Graph + The input graph. + + shape_dict : dict of str to shape + The input shape. + + bfactor : int + The packing factor in batch + + cfactor : int + The packing factor in channel + + start_name: str, optional + Start packing from certain known node. + + start_name: str, optional + Stop packing from certain known node. + + Returns + ------- + graph : Graph + The transformed graph. + """ + graph = graph_attr.set_shape_inputs(graph, shape_dict) + graph = graph.apply("InferShape") + shape = graph.json_attr("shape") + gidx = graph.index + node_map = {} + dset = set() + start_pack = False + + for nid, node in enumerate(gidx.nodes): + children = [node_map[e[0]] for e in node["inputs"]] + ishape = [shape[gidx.entry_id(e)] for e in node["inputs"]] + oshape = shape[gidx.entry_id(nid, 0)] + attrs = node.get("attrs", {}) + node_name = node["name"] + op_name = node["op"] + get_clone = lambda c, o_n, n_n, a: getattr(nnvm.symbol, o_n)( + *c, name=n_n, **a) + if op_name == "null": + new_node = nnvm.symbol.Variable(node_name) + if start_name and node_name == start_name: + start_pack = True + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + if start_pack and "_begin_state_" in node_name: # RNN -> CNN, pack + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + elif node_name == start_name: + assert not start_pack + start_pack = True + new_node = get_clone(children, op_name, node_name, attrs) + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + elif node_name == stop_name: + if start_pack: + start_pack = False + children[0] = _unpack_batch_channel(children[0], ishape[0]) + new_node = getattr(nnvm.symbol, op_name)( + *children, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name == "conv2d" and attrs.get("out_dtype", None) == "int32": + assert 8 % weight_bits == 0 + w_lanes = 8 // weight_bits + if start_pack: + attrs["layout"] = "NCHW%dn%dc" % (bfactor, cfactor) + attrs["kernel_layout"] = "OIHW%do%di%dp" % (cfactor, cfactor, w_lanes) + data, weight = children + weight = _pack_weight(weight, ishape[1], cfactor) + # insert bit packing when necessary + if w_lanes != 1: + assert 8 % w_lanes == 0 + weight = nnvm.sym.bitpack(weight, lanes=w_lanes) + new_node = nnvm.sym.conv2d( + data, weight, name=node_name, **attrs) + else: + new_node = 
get_clone(children, op_name, node_name, attrs) + elif op_name == "conv2d_transpose" and attrs.get("out_dtype", None) == "int32": + assert 8 % weight_bits == 0 + w_lanes = 8 // weight_bits + if start_pack: + attrs["layout"] = "NCHW%dn%dc" % (bfactor, cfactor) + attrs["kernel_layout"] = "IOHW%di%do%dp" % (cfactor, cfactor, w_lanes) + data, weight = children + weight = _pack_weight_conv2d_transpose(weight, ishape[1], cfactor) + new_node = nnvm.sym.conv2d_transpose( + data, weight, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("broadcast_") and tuple(ishape[0]) == tuple(ishape[1]): + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("broadcast") and len(ishape[1]) == 3: + if start_pack: + children[1] = _pack_bias(children[1], ishape[1], bfactor, cfactor) + new_node = getattr(nnvm.symbol, op_name)( + *children, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("elementwise_add"): + new_node = get_clone(children, op_name, node_name, attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + dset.add(op_name) + node_map[nid] = new_node + + assert len(graph.index.output_entries) == 1 + ret = node_map[graph.index.output_entries[0][0]] + if start_pack: + oshape = shape[graph.index.output_entries[0][0]] + ret = _unpack_batch_channel(ret, oshape) + graph = nnvm.graph.create(ret) + graph = graph_attr.set_shape_inputs(graph, shape_dict) + graph = graph.apply("InferShape") + return graph From 73036e3ce25a0ec504ee3291f809cec1abfd9557 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 15 May 2019 17:45:12 -0700 Subject: [PATCH 018/126] clean up of script --- vta/scripts/relay_to_vta.py | 59 ------------------------------------- 1 file changed, 59 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 11cddef8b976..66af34f659e4 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -51,50 +51,6 @@ def process_image(image): return tvm.nd.array(image.astype("float32")) -def mark_nop(graph, - conv_layer=-1, - skip_conv_layer=(), - reverse=False, - conv2d_only=False): - """Helper function to mark certain op as nop - - Useful to debug performance issues. - """ - jgraph = json.loads(graph.json()) - counter = 0 - for _, node in enumerate(jgraph["nodes"]): - op_name = node["op"] - if op_name != "tvm_op": - continue - attrs = node["attrs"] - func_name = attrs["func_name"] - - if func_name.find("conv2d") != -1: - if conv_layer >= 0: - if counter != conv_layer: - attrs["func_name"] = "__nop" - if counter in skip_conv_layer: - attrs["func_name"] = "__nop" - counter += 1 - else: - if conv_layer >= 0: - attrs["func_name"] = "__nop" - attrs["func_name"] = "__nop" - - if reverse: - if attrs["func_name"] != "__nop": - attrs["func_name"] = "__nop" - else: - attrs["func_name"] = func_name - - if conv2d_only: - if attrs["func_name"].find("conv2d") == -1: - attrs["func_name"] = "__nop" - - graph = nnvm.graph.load_json(json.dumps(jgraph)) - return graph - - def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): # Read in ImageNet Categories url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" @@ -234,12 +190,10 @@ def run(device = "vta"): # Compile Relay program. 
with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}): if target.device_name != "vta": - # import pdb; pdb.set_trace() graph, lib, params = relay.build( relay_graph, target=target, params=params, target_host=target_host) else: - # import pdb; pdb.set_trace() with vta.build_config(): graph, lib, params = relay.build( relay_graph, target=target, @@ -258,19 +212,6 @@ def run(device = "vta"): # Measure build time build_time = time.time() - build_start print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - - cpu_skip_layer = (0,) if "gan" in opt.model else (3,) - # profile script, set this to False to run end to end - if opt.debug_fpga_only: - graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer) - elif opt.debug_cpu_only: - graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer, reverse=True) - elif opt.run_conv_layer: - conv_set = tuple(int(x) for x in opt.run_conv_layer.split(",")) - graph = mark_nop(graph, - skip_conv_layer=conv_set, - reverse=True, - conv2d_only=True) if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) From 44b3e5023d149354d9d08e60255376ff3e0163d2 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 17 May 2019 02:09:52 +0000 Subject: [PATCH 019/126] adding rpc server with fleet server registration --- apps/pynq_rpc/start_rpc_server_to_tracker.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 apps/pynq_rpc/start_rpc_server_to_tracker.sh diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/pynq_rpc/start_rpc_server_to_tracker.sh new file mode 100755 index 000000000000..0299ce55c89e --- /dev/null +++ b/apps/pynq_rpc/start_rpc_server_to_tracker.sh @@ -0,0 +1,5 @@ +#!/bin/bash +PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" + +export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python +python3.6 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq From 049118ceb2a2b13703dddbf3db8b5663b598a7bb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 17 May 2019 02:14:16 +0000 Subject: [PATCH 020/126] adding license --- apps/pynq_rpc/start_rpc_server_to_tracker.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/pynq_rpc/start_rpc_server_to_tracker.sh index 0299ce55c89e..f1b906327add 100755 --- a/apps/pynq_rpc/start_rpc_server_to_tracker.sh +++ b/apps/pynq_rpc/start_rpc_server_to_tracker.sh @@ -1,5 +1,23 @@ #!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
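+#
+# Run this script on the Pynq board to register its RPC server with the RPC
+# tracker (assumed reachable as "fleet" on port 9190) under device key "pynq".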
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" + export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python -python3.6 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq +export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq +python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq From d304f641465a55a58af1e6ce7057378b56cff7f8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 20 May 2019 16:42:44 -0700 Subject: [PATCH 021/126] increasing allocatable buffer size --- vta/include/vta/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h index eca9e4da9799..2d8e9c2c3d84 100644 --- a/vta/include/vta/driver.h +++ b/vta/include/vta/driver.h @@ -42,7 +42,7 @@ extern "C" { /*! \brief Physically contiguous buffer size limit */ #ifndef VTA_MAX_XFER -#define VTA_MAX_XFER (1<<22) +#define VTA_MAX_XFER (1<<25) #endif /*! PAGE SIZE */ From b22b96ce9c7855429275b87b963a7ef70bf935c1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 20 May 2019 17:55:05 -0700 Subject: [PATCH 022/126] adding bitstream programming in conv2d test; support for getting remote from tracker --- vta/python/vta/testing/util.py | 23 ++++++++++++------- .../integration/test_benchmark_topi_conv2d.py | 5 ++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index f99541691082..b009b7f27fd3 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -18,7 +18,7 @@ from __future__ import absolute_import as _abs import os -from tvm import rpc +from tvm import rpc, autotvm from ..environment import get_env from . import simulator @@ -54,12 +54,19 @@ def run(run_func): elif env.TARGET == "pynq": - # Run on PYNQ if env variable exists - host = os.environ.get("VTA_PYNQ_RPC_HOST", None) - port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) - if host and port: - remote = rpc.connect(host, port) + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None) + pynq_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) + # Run device from fleet node if env variables are defined + if tracket_host and tracket_port: + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) run_func(env, remote) else: - raise RuntimeError( - "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") + # Next, run on PYNQ if env variables are defined + if pynq_host and pynq_port: + remote = rpc.connect(pynq_host, pynq_port) + run_func(env, remote) + else: + raise RuntimeError( + "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 28c8af4283ce..dc7b5d710c29 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -29,6 +29,7 @@ import topi import topi.testing import vta +from vta import program_fpga, reconfig_runtime import vta.testing from vta.testing import simulator @@ -213,6 +214,10 @@ def test_conv2d(device="vta"): def _run(env, remote): if device == "vta": target = env.target + if env.TARGET != "sim": + assert tvm.module.enabled("rpc") + program_fpga(remote, bitstream=None) + reconfig_runtime(remote) elif device == 
"arm_cpu": target = env.target_vta_cpu with autotvm.tophub.context(target): # load pre-tuned schedule parameters From 2aed8e62ee3a9c1f5d23b97b9e8a7bec8de63ff2 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 10:35:52 -0700 Subject: [PATCH 023/126] removing printfs --- vta/src/runtime.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index 06b34743955f..f44e3cab8a82 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -908,12 +908,10 @@ class CommandQueue { insn_queue_.InitSpace(); device_ = VTADeviceAlloc(); CHECK(device_ != nullptr); - printf("Initialize VTACommandHandle...\n"); } ~CommandQueue() { VTADeviceFree(device_); - printf("Close VTACommandhandle...\n"); } uint32_t GetElemBytes(uint32_t memory_id) { From 897c08ec0eb4f1d9aeeef1212520669eac530cc1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 13:55:42 -0700 Subject: [PATCH 024/126] adding option to skip execution in simulator --- vta/python/vta/testing/simulator.py | 13 +++++++++ vta/src/sim/sim_driver.cc | 42 +++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index dbeba84f6d4a..2d6cfe305756 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -84,4 +84,17 @@ def tsim_cycles(): """ return tvm.get_global_func("tvm.vta.tsim.cycles")() +# debug flag to skip execution. +DEBUG_SKIP_EXEC = 1 + +def debug_mode(flag): + """Set debug mode + Paramaters + ---------- + flag : int + The debug flag, 0 means clear all flags. + """ + tvm.get_global_func("vta.simulator.profiler_debug_mode")(flag) + + LIBS = _load_lib() diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index 5f9f6b637599..0691195f140e 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -35,6 +35,11 @@ namespace vta { namespace sim { +/*! \brief debug flag for skipping computation */ +enum DebugFlagMask { + kSkipExec = 1 +}; + /*! * \brief Helper class to pack and unpack bits * Applies truncation when pack to low level bits. @@ -253,8 +258,12 @@ class SRAM { return &(data_[index]); } // Execute the load instruction on this SRAM - void Load(const VTAMemInsn* op, DRAM* dram, uint64_t* load_counter) { + void Load(const VTAMemInsn* op, + DRAM* dram, + uint64_t* load_counter, + bool skip_exec) { load_counter[0] += (op->x_size * op->y_size) * kElemBytes; + if (skip_exec) return; DType* sram_ptr = data_ + op->sram_base; uint8_t* dram_ptr = static_cast(dram->GetAddr( op->dram_base * kElemBytes)); @@ -325,6 +334,8 @@ class Profiler { uint64_t gemm_counter{0}; /*! \brief instr counter for ALU ops */ uint64_t alu_counter{0}; + /*! \brief set debug mode */ + int64_t debug_flag{0}; /*! \brief clear the profiler */ void Clear() { inp_load_nbytes = 0; @@ -335,6 +346,10 @@ class Profiler { gemm_counter = 0; alu_counter = 0; } + /*! \return Whether we should skip execution. 
*/ + bool SkipExec() const { + return (debug_flag & DebugFlagMask::kSkipExec) != 0; + } std::string AsJSON() { std::ostringstream os; @@ -398,13 +413,15 @@ class Device { void RunLoad(const VTAMemInsn* op) { if (op->x_size == 0) return; if (op->memory_type == VTA_MEM_ID_INP) { - inp_.Load(op, dram_, &(prof_->inp_load_nbytes)); + inp_.Load(op, dram_, &(prof_->inp_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_WGT) { - wgt_.Load(op, dram_, &(prof_->wgt_load_nbytes)); + wgt_.Load(op, dram_, &(prof_->wgt_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_ACC) { - acc_.Load(op, dram_, &(prof_->acc_load_nbytes)); + acc_.Load(op, dram_, &(prof_->acc_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_UOP) { - uop_.Load(op, dram_, &(prof_->uop_load_nbytes)); + // always load in uop, since uop is stateful + // subsequent non-debug mode exec can depend on it. + uop_.Load(op, dram_, &(prof_->uop_load_nbytes), false); } else { LOG(FATAL) << "Unknown memory_type=" << op->memory_type; } @@ -416,7 +433,9 @@ class Device { op->memory_type == VTA_MEM_ID_UOP) { prof_->out_store_nbytes += ( op->x_size * op->y_size * VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8); - acc_.TruncStore(op, dram_); + if (!prof_->SkipExec()) { + acc_.TruncStore(op, dram_); + } } else { LOG(FATAL) << "Store do not support memory_type=" << op->memory_type; @@ -425,7 +444,8 @@ class Device { void RunGEMM(const VTAGemInsn* op) { if (!op->reset_reg) { - prof_->gemm_counter += op->iter_out * op->iter_in; + prof_->gemm_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); + if (prof_->SkipExec()) return; for (uint32_t y = 0; y < op->iter_out; ++y) { for (uint32_t x = 0; x < op->iter_in; ++x) { for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) { @@ -459,6 +479,7 @@ class Device { } } } else { + if (prof_->SkipExec()) return; // reset for (uint32_t y = 0; y < op->iter_out; ++y) { for (uint32_t x = 0; x < op->iter_in; ++x) { @@ -477,7 +498,6 @@ class Device { } void RunALU(const VTAAluInsn* op) { - prof_->alu_counter += op->iter_out * op->iter_in; if (op->use_imm) { RunALU_(op); } else { @@ -520,6 +540,8 @@ class Device { template void RunALULoop(const VTAAluInsn* op, F func) { + prof_->alu_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); + if (prof_->SkipExec()) return; for (int y = 0; y < op->iter_out; ++y) { for (int x = 0; x < op->iter_in; ++x) { for (int k = op->uop_bgn; k < op->uop_end; ++k) { @@ -566,6 +588,10 @@ TVM_REGISTER_GLOBAL("vta.simulator.profiler_status") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = Profiler::ThreadLocal()->AsJSON(); }); +TVM_REGISTER_GLOBAL("vta.simulator.profiler_debug_mode") +.set_body([](TVMArgs args, TVMRetValue* rv) { + Profiler::ThreadLocal()->debug_flag = args[0]; + }); } // namespace sim } // namespace vta From f956f1580053872a848412e98f27cc3f863fa672 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 13:59:17 -0700 Subject: [PATCH 025/126] InvalidShapeError reporting --- topi/python/topi/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index a9984148d5d3..ac855d144aad 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -35,6 +35,8 @@ from . import image from . import sparse from . 
import hls +# error reporting +from .util import InvalidShapeError # not import testing by default # because testing can have extra deps that are not necessary # we can import them from test cases explicitly From 48ad24bf7ad0bc916f043ec6a40cb3f8e2db4d82 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:01:27 -0700 Subject: [PATCH 026/126] reset the xlnk driver before every FPGA program --- vta/python/vta/exec/rpc_server.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py index 8caa48a56104..0ac97a2ab07e 100644 --- a/vta/python/vta/exec/rpc_server.py +++ b/vta/python/vta/exec/rpc_server.py @@ -66,6 +66,9 @@ def ext_dev_callback(): @tvm.register_func("tvm.contrib.vta.init", override=True) def program_fpga(file_name): + from pynq import xlnk + # Reset xilinx driver + xlnk.Xlnk().xlnk_reset() path = tvm.get_global_func("tvm.rpc.server.workpath")(file_name) env = get_env() program_bitstream.bitstream_program(env.TARGET, path) From c6935227c35b1001a586d8e029a8724f5c83e4a9 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:17:51 -0700 Subject: [PATCH 027/126] key flag used when building VTA target --- src/codegen/build_module.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 0a488f38457b..04a2fd6d4db9 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -58,6 +58,7 @@ Target CreateTarget(const std::string& target_name, std::string libs_flag = "-libs="; std::string device_flag = "-device="; + std::string keys_flag = "-keys="; for (auto& item : options) { t->options_array.push_back(ir::StringImm::make(item)); @@ -69,12 +70,19 @@ Target CreateTarget(const std::string& target_name, } } else if (item.find(device_flag) == 0) { t->device_name = item.substr(device_flag.length()); + t->keys_array.push_back(ir::StringImm::make(t->device_name)); + } else if (item.find(keys_flag) == 0) { + std::stringstream ss(item.substr(keys_flag.length())); + std::string key_item; + while (std::getline(ss, key_item, ',')) { + t->keys_array.push_back(ir::StringImm::make(key_item)); + } } } - if (t->device_name.length() > 0) { - t->keys_array.push_back(ir::StringImm::make(t->device_name)); - } + // if (t->device_name.length() > 0) { + // t->keys_array.push_back(ir::StringImm::make(t->device_name)); + // } t->device_type = kDLCPU; t->thread_warp_size = 1; if (target_name == "c" || target_name == "llvm") { From 04d1788c5958768af40276c98a251e25734f8607 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:22:07 -0700 Subject: [PATCH 028/126] initial conv2d autotuning support --- python/tvm/autotvm/measure/measure_methods.py | 22 +++++- vta/python/vta/__init__.py | 2 +- vta/python/vta/build_module.py | 59 +++++++++++++++ vta/scripts/tune_conv2d.py | 74 +++++++++++++++++++ 4 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 vta/scripts/tune_conv2d.py diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 31d688483294..dcdd46728e3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -87,7 +87,9 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) + # FIXME: right now we're circumventing the wrap_build_func + # 
self.build_func = _wrap_build_func(build_func)
+        self.build_func = build_func
         self.executor = LocalExecutor(timeout=timeout)
         self.tmp_dir = tempfile.mkdtemp()

@@ -223,7 +225,18 @@ def set_task(self, task):
                      for x in arg_bufs]
             func = build(s, arg_bufs, "llvm")
             tvm_buf = [nd.array(x) for x in self.ref_input]
-            func(*tvm_buf)
+
+            def _run_func():
+                """Run tvm function in a thread.
+                Because there are some issues with python multiprocessing and the thread pool in tvm
+                """
+                func(*tvm_buf)
+
+            thread = threading.Thread(target=_run_func)
+            thread.start()
+            thread.join()
+            del thread
+
             self.ref_output = [x.asnumpy() for x in tvm_buf]
 
     def get_build_kwargs(self):
@@ -452,6 +465,11 @@ def run_through_rpc(measure_input, build_result,
     try:
         # upload built module
         remote = request_remote(*remote_args)
+        # Program the FPGA every single time when targeting VTA
+        if measure_input.target.device_name == 'vta':
+            from vta import program_fpga, reconfig_runtime
+            program_fpga(remote, None)
+            reconfig_runtime(remote)
         remote.upload(build_result.filename)
         func = remote.load_module(os.path.split(build_result.filename)[1])
         ctx = remote.context(str(measure_input.target), 0)
diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py
index 926d73649b31..75ecdbad4bc7 100644
--- a/vta/python/vta/__init__.py
+++ b/vta/python/vta/__init__.py
@@ -18,5 +18,5 @@
 # to maintain minimum dependency on the board
 if sys.argv[0] not in ("-c", "-m"):
     from . import top
-    from .build_module import build_config, lower, build
+    from .build_module import build_config, lower, build, vta_autotvm_build_func
     from . import graph
diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py
index 471dc90746de..f723c99dbe45 100644
--- a/vta/python/vta/build_module.py
+++ b/vta/python/vta/build_module.py
@@ -18,8 +18,10 @@
 from __future__ import absolute_import as _abs
 
 import tvm
+from tvm import rpc
 from . import ir_pass
 from .environment import get_env
+from .testing import simulator
 
 
 def lift_coproc_scope(x):
@@ -115,3 +117,60 @@ def build(*args, **kwargs):
         with build_config():
             return tvm.build(*args, **kwargs)
     return tvm.build(*args, **kwargs)
+
+
+def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs):
+    """Custom build func for VTA. Used for autotvm"""
+
+    import time
+    import os
+    from random import getrandbits
+    from tvm.autotvm.util import get_const_tuple
+    from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError
+
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64))
+        target, task, config = measure_input
+
+        with target:
+            s, args = task.instantiate(config)
+            if not config.valid():
+                raise InstantiationError(config.errors)
+
+            func = build(s, args, target_host=task.target_host)
+            sim = build(s, args)
+
+        arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args)
+        func.export_library(filename)
+
+        # When targeting VTA test the schedule on simulator first
+        # in order to catch runtime errors
+        if measure_input.target.device_name == 'vta':
+            from vta import reconfig_runtime
+            # Note: if you're not running the RPC locally, you cannot benefit
+            # from runtime recompilation... 
+ local_rpc_port = int(os.environ.get("VTA_LOCAL_SIM_RPC_PORT", "0")) + if local_rpc_port: + remote = rpc.connect("localhost", local_rpc_port) + reconfig_runtime(remote) + else: + remote = rpc.LocalSession() + sim_path = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) + sim.export_library(sim_path) + remote.upload(sim_path) + f = remote.load_module(os.path.split(sim_path)[1]) + ctx = remote.context(str(measure_input.target), 0) + args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] + simulator.clear_stats() + simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) + f(*args) + + # check by local simulator + ctx = tvm.context(str(target)) + args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] + sim(*args) + + except Exception as e: # pylint: disable=broad-except + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py new file mode 100644 index 000000000000..432008661c58 --- /dev/null +++ b/vta/scripts/tune_conv2d.py @@ -0,0 +1,74 @@ +"""Tuning a single conv2d operator""" +import logging +import os + +import tvm +from tvm import autotvm +from tvm.contrib.util import get_lower_ir +import topi +import vta +import vta.testing + +env = vta.get_env() + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype): + data_shape = (N//env.BATCH, CI//env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) + kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) + bias_shape = (N//env.BATCH, CO//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with tvm.target.vta(): + res = topi.nn.conv2d(data, kernel, padding=padding, strides=strides, dilation=dilation, + layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN), out_dtype='int32') + res = topi.add(res, bias) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [data, kernel, bias, res] + +if __name__ == '__main__': + N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype = \ + 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), (1, 1), 'int8', 'int32' + + task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + # Logging config (for printing tuning log to the screen) + logging.basicConfig() + logging.getLogger('autotvm').setLevel(logging.DEBUG) + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + + measure_option = autotvm.measure_option( + 
builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) + + tuner = autotvm.tuner.RandomTuner(task) + n_trial = len(task.config_space) + tuner.tune(n_trial=n_trial, + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + + print(tuner.best_config) From 29ebd8080cca85fbd1710bdfb4cdaa2cbed2d509 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:26:33 -0700 Subject: [PATCH 029/126] edits to tune_conv2d.py --- vta/scripts/tune_conv2d.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 432008661c58..2cd8c6ea5a2f 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -59,6 +59,9 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt # Get tracker info from env tracket_host = os.environ.get("TVM_TRACKER_HOST", None) tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), @@ -67,8 +70,9 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt tuner = autotvm.tuner.RandomTuner(task) n_trial = len(task.config_space) - tuner.tune(n_trial=n_trial, + tuner.tune(n_trial=30, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + print("\nBest tuner config:") print(tuner.best_config) From 7e633cd320b91ba1757a3c4dd1bae285b91607a1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 16:19:10 -0700 Subject: [PATCH 030/126] exhaustive search --- vta/scripts/tune_conv2d.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 2cd8c6ea5a2f..e896e917b921 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -69,8 +69,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt check_correctness=True)) tuner = autotvm.tuner.RandomTuner(task) - n_trial = len(task.config_space) - tuner.tune(n_trial=30, + tuner.tune(n_trial=len(task.config_space), measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('conv2d.log')]) From 67b49c787fa5480fa11df7d7e204f03458abcc7c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 17:04:14 -0700 Subject: [PATCH 031/126] logging simulator stats in autoTVM --- python/tvm/autotvm/measure/measure.py | 4 +++- python/tvm/autotvm/measure/measure_methods.py | 20 ++++++++++--------- python/tvm/autotvm/record.py | 4 +++- vta/python/vta/build_module.py | 7 +++++-- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 0836fb741bd2..c4dec35d593f 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -34,7 +34,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])): """ -class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])): +class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp", "sim_stats"])): """ Stores all the results of a measurement @@ -49,6 
+49,8 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost" All cost of this measure, including rpc, compilation, test runs timestamp: float The absolute time stamp when we finish measurement. + sim_stats: Dictionary + Dictionary of VTA simulator statistics (only used when target is VTA) """ diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index dcdd46728e3e..1b32eaced711 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -46,7 +46,7 @@ logger = logging.getLogger('autotvm') -class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): +class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost', 'sim_stats'))): """ Stores all the necessary inputs for a measurement. @@ -60,6 +60,8 @@ class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 't The error happens during compilation. time_cost : float The time cost of building + sim_stats : Dictionary + Dictionary of VTA simulator statistics (only used when target is VTA) """ class LocalBuilder(Builder): @@ -114,13 +116,13 @@ def build(self, measure_inputs): if isinstance(res, Exception): # timeout or fleet error, return MeasureResult directly results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT, - self.timeout, time.time())) + self.timeout, time.time(), {})) elif res.error is not None: # instantiation error if isinstance(res.error, InstantiationError): results.append(MeasureResult((res.error,), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: if "InstantiationError" in str(res.error): msg = str(res.error) @@ -130,11 +132,11 @@ def build(self, measure_inputs): pass results.append(MeasureResult((InstantiationError(msg),), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: # tvm error results.append(MeasureResult((res.error,), MeasureErrorNo.COMPILE_HOST, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: # return BuildResult results.append(res) @@ -282,7 +284,7 @@ def run(self, measure_inputs, build_results): res = future.get() if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, - self.timeout, time.time())) + self.timeout, time.time(), {})) else: results.append(res) @@ -416,8 +418,8 @@ def _wrapped(measure_input, tmp_dir, **kwargs): func, arg_info = _build_func_common(measure_input, **kwargs) func.export_library(filename, build_func) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) + return BuildResult(None, None, e, time.time() - tic, {}) + return BuildResult(filename, arg_info, None, time.time() - tic, {}) return _wrapped @@ -514,7 +516,7 @@ def run_through_rpc(measure_input, build_result, errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) - return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) + return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp, build_result.sim_stats) def request_remote(device_key, host=None, port=None, priority=1, timeout=60): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 14efb7bd9239..9d3747352f79 100644 --- 
a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -98,7 +98,9 @@ def encode(inp, result, protocol='json'): result.all_cost, result.timestamp), - "v": AUTOTVM_LOG_VERSION + "v": AUTOTVM_LOG_VERSION, + + "s": result.sim_stats } return json.dumps(json_dict) if protocol == 'pickle': diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index f723c99dbe45..7ab0834f0368 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -129,6 +129,8 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError tic = time.time() + # simulator stats + stats = {} try: filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) target, task, config = measure_input @@ -165,6 +167,7 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): simulator.clear_stats() simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) f(*args) + stats = simulator.stats() # check by local simulator ctx = tvm.context(str(target)) @@ -172,5 +175,5 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): sim(*args) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) + return BuildResult(None, None, e, time.time() - tic, stats) + return BuildResult(filename, arg_info, None, time.time() - tic, stats) From 378b3a51404515565522510a422d3e98e6331481 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 23:09:08 -0700 Subject: [PATCH 032/126] tuning over all resnet layers --- vta/scripts/tune_conv2d.py | 67 +++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index e896e917b921..0113060a77da 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -1,4 +1,5 @@ """Tuning a single conv2d operator""" +from collections import namedtuple import logging import os @@ -11,6 +12,26 @@ env = vta.get_env() +Workload = namedtuple("Conv2DWorkload", + ['batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +resnet_wkls = [ + # Workloads of resnet18 on imagenet + # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), + ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), + # ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), # this layer does not appear in ResNet + ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C7', Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C8', Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C9', Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C10', Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C11', Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C12', Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), +] + @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" @@ -45,12 +66,6 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt return s, [data, kernel, bias, res] if __name__ == '__main__': - N, CI, H, W, CO, KH, KW, strides, padding, dilation, 
in_dtype, out_dtype = \ - 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), (1, 1), 'int8', 'int32' - - task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), - target=tvm.target.vta(), target_host=env.target_host, template_key='direct') - print(task.config_space) # Logging config (for printing tuning log to the screen) logging.basicConfig() @@ -63,15 +78,35 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, - check_correctness=True)) + for wl_name, wl in resnet_wkls: + + # Workload parameters + N = wl.batch + CI = wl.in_filter + H = wl.height + W = wl.width + CO = wl.out_filter + KH = wl.hkernel + KW = wl.wkernel + strides = (wl.hstride, wl.wstride) + padding = (wl.hpad, wl.wpad) + dilation = (1, 1) + in_dtype = 'int8' + out_dtype = 'int32' + + task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + measure_option = autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune(n_trial=len(task.config_space), - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + tuner = autotvm.tuner.RandomTuner(task) + tuner.tune(n_trial=len(task.config_space), + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) - print("\nBest tuner config:") - print(tuner.best_config) + print("\nBest tuner config:") + print(tuner.best_config) From e3656ca7d4f3057d92a47d2f474a29c2d7db6727 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 23 May 2019 18:35:02 -0700 Subject: [PATCH 033/126] removing sim stats from log for now due to tophub issues --- python/tvm/autotvm/measure/measure.py | 4 +--- python/tvm/autotvm/measure/measure_methods.py | 20 +++++++++---------- python/tvm/autotvm/record.py | 4 +--- vta/python/vta/build_module.py | 4 ++-- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index c4dec35d593f..0836fb741bd2 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -34,7 +34,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])): """ -class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp", "sim_stats"])): +class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])): """ Stores all the results of a measurement @@ -49,8 +49,6 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost" All cost of this measure, including rpc, compilation, test runs timestamp: float The absolute time stamp when we finish measurement. 
- sim_stats: Dictionary - Dictionary of VTA simulator statistics (only used when target is VTA) """ diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 1b32eaced711..dcdd46728e3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -46,7 +46,7 @@ logger = logging.getLogger('autotvm') -class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost', 'sim_stats'))): +class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): """ Stores all the necessary inputs for a measurement. @@ -60,8 +60,6 @@ class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 't The error happens during compilation. time_cost : float The time cost of building - sim_stats : Dictionary - Dictionary of VTA simulator statistics (only used when target is VTA) """ class LocalBuilder(Builder): @@ -116,13 +114,13 @@ def build(self, measure_inputs): if isinstance(res, Exception): # timeout or fleet error, return MeasureResult directly results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT, - self.timeout, time.time(), {})) + self.timeout, time.time())) elif res.error is not None: # instantiation error if isinstance(res.error, InstantiationError): results.append(MeasureResult((res.error,), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: if "InstantiationError" in str(res.error): msg = str(res.error) @@ -132,11 +130,11 @@ def build(self, measure_inputs): pass results.append(MeasureResult((InstantiationError(msg),), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: # tvm error results.append(MeasureResult((res.error,), MeasureErrorNo.COMPILE_HOST, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: # return BuildResult results.append(res) @@ -284,7 +282,7 @@ def run(self, measure_inputs, build_results): res = future.get() if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, - self.timeout, time.time(), {})) + self.timeout, time.time())) else: results.append(res) @@ -418,8 +416,8 @@ def _wrapped(measure_input, tmp_dir, **kwargs): func, arg_info = _build_func_common(measure_input, **kwargs) func.export_library(filename, build_func) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic, {}) - return BuildResult(filename, arg_info, None, time.time() - tic, {}) + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) return _wrapped @@ -516,7 +514,7 @@ def run_through_rpc(measure_input, build_result, errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) - return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp, build_result.sim_stats) + return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) def request_remote(device_key, host=None, port=None, priority=1, timeout=60): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 9d3747352f79..14efb7bd9239 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -98,9 +98,7 @@ def encode(inp, result, protocol='json'): result.all_cost, result.timestamp), - "v": AUTOTVM_LOG_VERSION, - - "s": result.sim_stats + "v": 
AUTOTVM_LOG_VERSION } return json.dumps(json_dict) if protocol == 'pickle': diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 7ab0834f0368..91e3c4a7e0d8 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -175,5 +175,5 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): sim(*args) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic, stats) - return BuildResult(filename, arg_info, None, time.time() - tic, stats) + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) From 8186632733569e1990622f83031e317e931736b8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 23 May 2019 18:54:02 -0700 Subject: [PATCH 034/126] autoTVM task extraction for VTA (nnvm for now) --- nnvm/python/nnvm/top/nn.py | 10 +- python/tvm/autotvm/task/nnvm_integration.py | 71 +++--- python/tvm/autotvm/task/topi_integration.py | 70 ++++-- vta/scripts/tune_resnet.py | 231 ++++++++++++++++++++ 4 files changed, 322 insertions(+), 60 deletions(-) create mode 100644 vta/scripts/tune_resnet.py diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 13964f4e25f6..128f985bd6d2 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -114,25 +114,25 @@ def compute_conv2d(attrs, inputs, _): if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8': # pylint: disable=assignment-from-no-return out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype=out_dtype) + dilation, layout, out_dtype) # pylint: enable=assignment-from-no-return elif groups == 1: out = topi.nn.conv2d( - inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) elif layout == "NCHW" and \ groups == get_const_int(inputs[0].shape[1]) and \ groups == channels: out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) elif layout in ["NCHW", "NCHW4c"]: out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype=out_dtype) + out_dtype) elif layout == "NHWC" and \ kernel_layout == "HWOI" and \ groups == get_const_int(inputs[0].shape[3]) and \ groups == channels: out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) else: raise ValueError("not support arbitrary group number for now") diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index dbcee0e516e1..e4d2b3fb8023 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -27,15 +27,16 @@ from .task import create from .topi_integration import TaskExtractEnv +from .dispatcher import ApplyHistoryBest logger = logging.getLogger('autotvm') -def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): +def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host=None): """ Extract tuning tasks from a nnvm graph. This function collects tuning tasks by building the graph - with a "tracing" target and tracing all the calls to topi. + and trace all the calls to topi. 
Parameters ---------- @@ -49,6 +50,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): The compilation target symbols : Array of nnvm.symbol Array of nnvm symbols want to be tuned + params : dict of str to NDArray + The parameter dictionary. target_host: tvm.target.Target The host compilation target @@ -78,32 +81,35 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - - # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype) + # run compiler to collect all TOPI calls during compilation + nnvm.compiler.engine.clear_cache() + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, + target_host=target_host, params=params) - logger.disabled = old_state + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid Shape during AutoTVM Task Creation") return tasks -def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_host=None): +def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, target_host=None): """ Extract tuning tasks from multiple nnvm graphs. This function is the multiple graph version of extract_from_graph @@ -120,6 +126,8 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_ The compilation target symbols : Array of nnvm.symbol Array of nnvm symbols want to be tuned + params : dict of str to NDArray + The parameter dictionary. 
target_host: tvm.target.Target The host compilation target @@ -149,28 +157,29 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_ topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - - # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") + nnvm.compiler.engine.clear_cache() + for graph, shape, dtype in zip(graphs, shapes, dtypes): + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) - nnvm.compiler.engine.clear_cache() - for graph, shape, dtype in zip(graphs, shapes, dtypes): - nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype) - - logger.disabled = old_state + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid Shape during AutoTVM Task Creation") return tasks + diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index c48d4f58edce..ed85504e4c0a 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -27,6 +27,9 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ +import warnings +import sys + from ... 
import _api_internal, tensor, placeholder, create_schedule from .task import args_to_workload, dispatcher, register @@ -73,6 +76,7 @@ def deserialize_args(args): class TaskExtractEnv: """Global environment for extracting tuning tasks from nnvm graph""" current = None + registered = None def __init__(self, allow_duplicate=False): import topi @@ -106,47 +110,65 @@ def __init__(self, allow_duplicate=False): topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } + # support reflection for tracing + self.func_to_reflection = { + topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), + topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), + topi.nn.depthwise_conv2d_nchw: lambda x: setattr(topi.nn, 'depthwise_conv2d_nchw', x), + topi.nn.group_conv2d_nchw: lambda x: setattr(topi.nn, 'group_conv2d_nchw', x), + topi.nn.conv2d_transpose_nchw: lambda x: setattr(topi.nn, 'conv2d_transpose_nchw', x), + topi.nn.dense: lambda x: setattr(topi.nn, 'dense', x), + topi.nn.bitserial_conv2d_nchw: lambda x: setattr(topi.nn, 'bitserial_conv2d_nchw', x), + topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x), + topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x), + topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x), + } + self.allow_duplicate = allow_duplicate - self._register_tracing() self._register_topi_task() self.task_collection = [] self.wanted_topi_funcs = list(self.topi_to_task.keys()) + self.modified_funcs = [] + + def __enter__(self): + self.task_collection = [] + self.modified_funcs = [] - def _register_tracing(self): - """Register tracing function to track the topi function call""" - # register topi compute for "tracing" target - for topi_compute in self.topi_to_task: + for topi_compute in self.wanted_topi_funcs: def _local_scope(compute_func): """start a scope to hold the local function in for loop""" - @compute_func.register("tracing", ) - def _tracing_topi_compute(*args, **kwargs): - assert not kwargs, "Do not support extracting tuning tasks when" \ - "kwargs is used in TOPI function call." \ + def _tracing_wrapper(*args, **kwargs): + assert not kwargs, "Do not support extracting tuning tasks when " \ + "kwargs is used in TOPI function call. " \ "Please modify it to use only positional args." 
- if compute_func in self.wanted_topi_funcs: # record this call - key = (self.topi_to_task[compute_func], serialize_args(args)) - if self.allow_duplicate or key not in self.task_collection: - self.task_collection.append(key) - return compute_func.fdefault(*args) + key = (self.topi_to_task[compute_func], serialize_args(args)) + if self.allow_duplicate or key not in self.task_collection: + self.task_collection.append(key) + + return compute_func(*args, **kwargs) + + self.func_to_reflection[topi_compute](_tracing_wrapper) + self.modified_funcs.append(topi_compute) + _local_scope(topi_compute) - # register topi schedule for "tracing" target - for topi_compute in self.topi_to_task: - for topi_schedule in self.topi_to_schedule[topi_compute]: - def _local_scope_(schedule_func): - """start a scope to hold the local function in for loop""" + return self - @schedule_func.register("tracing", ) - def _tracing_topi_compute(outs): - outs = [outs] if isinstance(outs, tensor.Tensor) else outs - return create_schedule([x.op for x in outs]) - _local_scope_(topi_schedule) + def __exit__(self, exc_type, exc_val, exc_tb): + # revert modification + for func in self.modified_funcs: + self.func_to_reflection[func](func) def _register_topi_task(self): """register tuning wrapper for topi function""" import topi + # Avoid double registration for certain targets + if TaskExtractEnv.registered: + return + TaskExtractEnv.registered = True + # Tuning wrapper for topi functions @register("topi_nn_conv2d") def _topi_nn_conv2d(*args, **kwargs): diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py new file mode 100644 index 000000000000..b22a63e09df8 --- /dev/null +++ b/vta/scripts/tune_resnet.py @@ -0,0 +1,231 @@ +import argparse +import os +import time +import numpy as np + +import tvm +from tvm import rpc, autotvm +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util +from tvm.contrib.download import download + +import topi +import nnvm.compiler +import vta +import vta.testing + +env = vta.get_env() + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + + +def generate_graph(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: 
str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=target_host) + + return graph, lib, params + + +def extract_tasks(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + with vta.build_config(): + tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, + params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) + return tasks + + +def download_model(): + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = 'synset.txt' + graph_fn = 'resnet18_qt8.json' + params_fn = 'resnet18_qt8.params' + data_dir = '_data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + for file in [categ_fn, graph_fn, params_fn]: + if not os.path.isfile(file): + download(os.path.join(url, file), os.path.join(data_dir, file)) + + sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) + params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) + + return sym, params + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True, + try_winograd=True): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + n_trial_ = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial_, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial_, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +if __name__ == '__main__': + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + tuning_opt = { + 'log_filename': 'resnet-18.log', + + 'tuner': 
'random', + 'n_trial': 1e9, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + number=4, repeat=3, timeout=60, + check_correctness=True)) + } + + # download model + sym, params = download_model() + + # register VTA tuning tasks + register_vta_tuning_tasks() + + # extract tasks + print("Extract tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): + print("Compile...") + graph, lib, params = generate_graph(sym, params, target, target_host) + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + From 51773ecd45853ca7cad807e4fbb68f3b7f6ca33c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 29 May 2019 18:55:36 -0500 Subject: [PATCH 035/126] merge fix --- vta/python/vta/top/vta_conv2d.py | 57 -------------------------------- 1 file changed, 57 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 1672af47ca0c..eef047965a56 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -193,60 +193,3 @@ def _traverse(op): s[conv2d_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) return s -<<<<<<< HEAD - -class Conv2DSchedule(object): - """ 2D convolution schedule object. 
- """ - def __init__(self, - b_factor=1, - oc_factor=1, - ic_factor=1, - h_factor=1, - w_factor=0, - oc_nthread=0, - h_nthread=0, - debug_sync=False): - self.b_factor = b_factor - self.oc_factor = oc_factor - self.ic_factor = ic_factor - self.h_factor = h_factor - self.w_factor = w_factor - self.oc_nthread = oc_nthread - self.h_nthread = h_nthread - self.debug_sync = debug_sync - - def __str__(self): - return "{}.{}.{}.{}.{}.{}.{}".format( - self.b_factor, self.oc_factor, self.ic_factor, - self.h_factor, self.w_factor, - self.oc_nthread, self.h_nthread) - -Schedule = Conv2DSchedule - -# Layer description of the ResNet18 -RESNET = { - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), -} - -for idx in RESNET: - f_schedules = find_schedules(RESNET[idx], vt_only=True, best_only=True) - if f_schedules: - scheds = f_schedules[0] - _WL2PLAN[RESNET[idx]] = scheds - else: - logging.warning("No valid schedule was found for the workload on current vta configuration") - break -======= ->>>>>>> autotvm support for conv2d operator From 3ad29a4bdf3e73e74edf97fa38e4c40f3566a4b6 Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Wed, 29 May 2019 14:39:49 -0700 Subject: [PATCH 036/126] Insert stop_fusion for vta. 
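
The new VtaStoreInjector pass walks the graph and wraps the output of a
conv2d -> add -> relu group (and the conv2d -> add branch feeding a second,
residual add) in stop_fusion, so the fused VTA kernel's narrowed result is
stored back to memory instead of being fused into the next operator group.
The snippet below is only a rough Relay-level illustration of the pattern
this mutator targets; the shapes are made up, and it assumes
relay.annotation.stop_fusion is how the stop_fusion op is exposed in this
build (otherwise read it as pseudocode for the annotation).

    import tvm
    from tvm import relay

    data = relay.var("data", shape=(1, 16, 14, 14), dtype="int8")
    weight = relay.var("weight", shape=(16, 16, 3, 3), dtype="int8")
    bias = relay.var("bias", shape=(1, 16, 1, 1), dtype="int32")

    # The conv2d -> add -> relu group matched by VtaStoreInjector.
    conv = relay.nn.conv2d(data, weight, padding=(1, 1), out_dtype="int32")
    body = relay.nn.relu(relay.add(conv, bias))

    # After the rewrite the group boundary carries a stop_fusion annotation,
    # forcing a store before whatever consumes the result.
    annotated = relay.annotation.stop_fusion(body)
    print(relay.Function([data, weight, bias], annotated))
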
---
 src/relay/pass/quantize.cc | 62 +++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc
index 07233a83ca23..f55b26881f8b 100644
--- a/src/relay/pass/quantize.cc
+++ b/src/relay/pass/quantize.cc
@@ -504,10 +504,10 @@ RELAY_REGISTER_OP("nn.relu")
 RELAY_REGISTER_OP("strided_slice")
 .set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
-
-Expr MaxPoolRealize(const Call& ref_call,
-                    const Array<Expr>& new_args,
-                    const NodeRef& ctx) {
+/* \brief for unary operators which requantize its input to dtype_nbit */
+Expr CastDtypeInputRealize(const Call& ref_call,
+                           const Array<Expr>& new_args,
+                           const NodeRef& ctx) {
   const QConfig& cfg = QConfig::Current();
   CHECK_EQ(new_args.size(), 1);
   if (const auto* n = new_args[0].as<QRealizeIntExprNode>()) {
@@ -520,7 +520,10 @@ Expr MaxPoolRealize(const Call& ref_call,
 }
 
 RELAY_REGISTER_OP("nn.max_pool2d")
-.set_attr<FForwardRewrite>("FQRealizeRewrite", MaxPoolRealize);
+.set_attr<FForwardRewrite>("FQRealizeRewrite", CastDtypeInputRealize);
+
+RELAY_REGISTER_OP("stop_fusion")
+.set_attr<FForwardRewrite>("FQRealizeRewrite", CastDtypeInputRealize);
 
 
 Expr AvgPoolRealize(const Call& ref_call,
@@ -646,6 +649,55 @@ Pass QuantizeRealizePass() {
 TVM_REGISTER_API("relay._quantize.QuantizeRealize")
 .set_body_typed(QuantizeRealizePass);
 
+class VtaStoreInjector : public ExprMutator {
+ private:
+  const CallNode* GetPreviousNode(const CallNode* n) {
+    if (n == nullptr || n->args.size() == 0) {
+      return nullptr;
+    }
+    return n->args[0].as<CallNode>();
+  }
+
+ public:
+  Expr VisitExpr_(const CallNode* n) final {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    static const Op& add = Op::Get("add");
+    static const Op& relu = Op::Get("nn.relu");
+    auto new_e = ExprMutator::VisitExpr_(n);
+    const CallNode* n0 = new_e.as<CallNode>();
+    // conv->add->relu->[here]
+    if (n0 && n0->op.same_as(relu)) {
+      const CallNode* n1 = n0->args[0].as<CallNode>();
+      if (n1 && n1->op.same_as(add)) {
+        const CallNode* n2 = n1->args[0].as<CallNode>();
+        if (n2 && n2->op.same_as(conv2d)) {
+          return StopFusion(new_e);
+        }
+      }
+    }
+    // conv->add->[here]->add
+    if (n0 && n0->op.same_as(add)) {
+      const CallNode* n1 = n0->args[1].as<CallNode>();
+      if (n1 && n1->op.same_as(add)) {
+        const CallNode* n2 = n1->args[0].as<CallNode>();
+        if (n2 && n2->op.same_as(conv2d)) {
+          Expr child = StopFusion(n0->args[1]);
+          return CallNode::make(add, {n0->args[0], child}, Attrs{}, {});
+        }
+      }
+    }
+    return new_e;
+  }
+};
+
+Expr VtaStoreHint(const Expr& e) {
+  return VtaStoreInjector().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._quantize.vta_store_hint")
+.set_body_typed(VtaStoreHint);
+
+
 }  // namespace quantize
 }  // namespace relay
 }  // namespace tvm

From df679584822f5db56a365723a034e688ab7c2e9e Mon Sep 17 00:00:00 2001
From: ZihengJiang
Date: Fri, 31 May 2019 15:44:35 -0700
Subject: [PATCH 037/126] Update.
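
This change adds a VTA-specific rewrite stage in front of the usual
quantization flow: a FQVtaRewrite pass (rewrite_for_vta) marks
conv2d/relu/max_pool2d/add groups with QVtaExpr, whose realization emits
force_cast + stop_fusion at the group boundary, and quantize_vta() in
python/tvm/relay/quantize/quantize.py strings the stages together
(optimize -> rewrite_for_vta -> annotate -> calibrate -> realize ->
fold_constant). A rough usage sketch, assuming quantize_vta is exported
from tvm.relay.quantize the same way quantize() is, and reusing the
ResNet-18 front end the scripts in this series use:

    from mxnet.gluon.model_zoo import vision
    from tvm import relay

    shape_dict = {"data": (1, 3, 224, 224)}
    gluon_model = vision.get_model("resnet18_v1", pretrained=True)
    func, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

    # Same qconfig values the relay_to_vta.py script in this series uses.
    with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1):
        qfunc = relay.quantize.quantize_vta(func, params=params)
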
--- python/tvm/relay/quantize/_annotate.py | 120 +++++++++++++++++-- python/tvm/relay/quantize/quantize.py | 42 +++++++ src/relay/op/annotation/annotation.cc | 26 ++++- src/relay/pass/pattern_util.h | 6 +- src/relay/pass/quantize.cc | 154 ++++++++++++++++--------- src/relay/pass/quantize.h | 24 ++++ 6 files changed, 306 insertions(+), 66 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 61e895ac7efb..799b553a702c 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -171,14 +171,16 @@ def conv2d_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT: + # print('conv2d lhs kind: {0}'.format(lhs_kind)) + # print('conv2d lhs: \n{0}'.format(lhs_expr)) + # print('\n\n\n') + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) assert rhs_kind is None rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -241,25 +243,43 @@ def add_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + # print('add lhs kind: {0}'.format(lhs_kind)) + # print('add rhs kind: {0}'.format(rhs_kind)) if lhs_kind is None and rhs_kind is None: return None + if lhs_kind is None and rhs_kind is not None: # quantize lhs to INPUT field if it is normal expression + assert rhs_kind == QAnnotateKind.INPUT lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + # print('execute add with INPUT') + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + if lhs_kind is not None and rhs_kind is None: if isinstance(rhs_expr, _expr.Constant): # quantize rhs to WEIGHT field if it is Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + assert lhs_kind == QAnnotateKind.ACTIVATION + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) else: # quantize rhs to INPUT field if it is not Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: - # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION - rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) + raise ValueError + + if lhs_kind is not None and rhs_kind is not None: + if lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.INPUT: + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + # print('execute add with INPUT') + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: + # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @register_annotate_function("stop_fusion") @@ -294,6 +314,7 @@ def identity_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", 
identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) +register_annotate_function("stop_fusion", identity_rewrite) def pool2d_rewrite(ref_call, new_args, ctx): @@ -313,6 +334,20 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) +@register_annotate_function("force_cast") +def force_cast_rewrite(ref_call, new_args, ctx): + if _conv_counter() <= current_qconfig().skip_k_conv: + return None + expr, x_kind = _get_expr_kind(new_args[0]) + + if x_kind is None: + return new_args[0] + if x_kind == QAnnotateKind.ACTIVATION: + expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT) + + expr = _forward_op(ref_call, [expr]) + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + @register_annotate_function("concatenate") def concatenate_rewrite(ref_call, new_args, ctx): @@ -333,3 +368,74 @@ def concatenate_rewrite(ref_call, new_args, ctx): expr_list[i] = attach_simulated_quantize(expr_list[i], QAnnotateKind.ACTIVATION) expr = _forward_op(ref_call, [_expr.Tuple(expr_list)]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + +# register for vta stop fusion +def register_vta_rewrite(op_name, frewrite=None, level=10): + def _register(func): + return _op.op._Register(op_name, "FQVtaRewrite", func, level) + return _register(frewrite) if frewrite is not None else _register + +@register_relay_node +class QVtaExpr(_expr.TempExpr): + def __init__(self, expr): + self.__init_handle_by_constructor__( + _quantize.make_vta_expr, expr) + + def realize(self): + return _quantize.temp_expr_realize(self) + + +def vta_expr_check(expr): + if isinstance(expr, QVtaExpr): + return True, expr.expr + return False, expr + +# def _stop_fusion(expr): +# return _quantize.make_stop_fusion(expr) + +@register_vta_rewrite("nn.conv2d") +def conv2d_vta_rewrite(ref_call, new_args, ctx): + cnt = _conv_counter() + if cnt < current_qconfig().skip_k_conv: + _set_conv_counter(cnt + 1) + return None + _set_conv_counter(cnt + 1) + + + data_cond, data = vta_expr_check(new_args[0]) + kernel_cond, kernel = vta_expr_check(new_args[1]) + + assert not kernel_cond + if data_cond: + data = new_args[0].realize() + ret = _forward_op(ref_call, [data, kernel]) + return QVtaExpr(ret) + +def identity_vta_rewrite(ref_call, new_args, ctx): + cond, expr = vta_expr_check(new_args[0]) + if cond: + return QVtaExpr(_forward_op(ref_call, [expr])) + else: + return None + +register_vta_rewrite("nn.relu", identity_vta_rewrite) +register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) + + +# @register_vta_rewrite("nn.max_pool2d") +# def pool_vta_rewrite(ref_call, new_args, ctx): +# pass + +@register_vta_rewrite("add") +def add_vta_rewrite(ref_call, new_args, ctx): + lhs_cond, lhs = vta_expr_check(new_args[0]) + rhs_cond, rhs = vta_expr_check(new_args[1]) + if lhs_cond and rhs_cond: + lhs = new_args[0].realize() + rhs = new_args[1].realize() + return _forward_op(ref_call, [lhs, rhs]) + elif lhs_cond and not rhs_cond: + return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) + else: + return None diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index a7749d4892fb..da881db26fb2 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -349,3 +349,45 @@ def quantize(graph, params=None, dataset=None): mod = optimize(mod) mod = quantize_seq(mod) return mod[mod.entry_func.name_hint] + +def quantize_vta(graph, params=None, dataset=None): + + """ The quantization procedure for VTA specifically. 
+ + Parameters + --------- + graph: Function + The original graph. + + params : dict of str to NDArray + Input parameters to the graph that do not change + during inference time. Used for constant folding. + + dataset: list of dict of Var -> NDArray + The calibration dataset. + + Returns + ------- + ret: Function + The graph after quantization + """ + + # TODO(zhiics) Move this to the pass manager. + graph = optimize(graph, params) + + print('original graph') + print(graph) + graph = _quantize.rewrite_for_vta(graph) + print('after rewrite for vta') + print(graph) + + graph = annotate(graph) + graph = calibrate(graph, dataset) + print('after calibrate') + print(graph) + graph = realize(graph) + graph = _ir_pass.fold_constant(graph) + + print('after realize') + print(graph) + return graph diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index f09a3a22e3ab..789c85e39074 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -83,6 +83,28 @@ TVM_ADD_FILELINE) return {topi::identity(inputs[0])}; }); +Expr ForceCast(Expr data) { + static const Op& op = Op::Get("force_cast"); + return CallNode::make(op, {data}, Attrs{}, {}); +} + +RELAY_REGISTER_OP("force_cast") +.describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" +TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input data.") +.add_type_rel("Identity", IdentityRel) +.set_support_level(10) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + + RELAY_REGISTER_OP("bitpack_start") .describe(R"code( Mark the start of bitpacking. diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index b709f2846b34..5c303905968e 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -379,6 +379,8 @@ Expr MakeStridedSlice(Expr data, Array begin, Array end, Array Expr StopFusion(Expr data); +Expr ForceCast(Expr data); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_PASS_PATTERN_UTIL_H_ diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index f55b26881f8b..8fbe290ad60b 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -90,7 +90,7 @@ RELAY_REGISTER_OP("relay.op.annotation.simulated_quantize") .add_argument("clip_min", "Tensor", "lower bound. It should be a scalar") .add_argument("clip_max", "Tensor", "upper bound. 
It should be a scalar") .set_attrs_type_key("relay.attrs.SimulatedQuantizeAttrs") -.set_support_level(10) +.set_support_level(11) .add_type_rel("SimulatedQuantize", SimulatedQuantizeRel); TVM_REGISTER_API("relay._quantize.simulated_quantize") @@ -111,13 +111,14 @@ TVM_REGISTER_API("relay._quantize.simulated_quantize") Expr QAnnotateExprNode::Realize() const { const auto& cfg = QConfig::Current(); - if (cfg->store_lowbit_output) { - // store low bit output back for VTA - const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); - return (*f)(this->expr, static_cast(kQInput)); - } else { - return expr; - } + return expr; + // if (cfg->store_lowbit_output) { + // // store low bit output back for VTA + // const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + // return (*f)(this->expr, static_cast(kQInput)); + // } else { + // return expr; + // } } QAnnotateExpr QAnnotateExprNode::make(Expr expr, QAnnotateKind kind) { @@ -133,6 +134,23 @@ TVM_REGISTER_API("relay._quantize.make_annotate_expr") static_cast(args[1].operator int())); }); + +TVM_REGISTER_API("relay._quantize.annotate") +.set_body_typed([] (const Expr& expr) { + std::function fmulti_ref = [](const Expr& e) { + if (e->derived_from()) { + const auto* n = e.as(); + CHECK(n); + const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + Expr ret = (*f)(n->expr, static_cast(kQInput)); + return static_cast(QAnnotateExprNode::make(ret, kQInput)); + } + return e; + }; + return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, nullptr); +}); + + // ============= // realize pass @@ -385,7 +403,17 @@ Array UnifyDTypeScale(const Array& ref_args, // unify the data type CHECK_EQ(ref_args.size(), args.size()); - DataType dtype = cfg->dtype_activation; + DataType dtype; + if (nptrs[0]->dtype == cfg->dtype_activation) { + DataType dtype = cfg->dtype_activation; + ret.Set(1, Cast(ret[1], dtype)); + } else if (nptrs[1]->dtype == cfg->dtype_input) { + DataType dtype = cfg->dtype_input; + ret.Set(0, Cast(ret[0], dtype)); + } else { + LOG(FATAL) << "should not touch here."; + } + for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { @@ -504,6 +532,9 @@ RELAY_REGISTER_OP("nn.relu") RELAY_REGISTER_OP("strided_slice") .set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("stop_fusion") +.set_attr("FQRealizeRewrite", IdentityRealize); + /* \brief for unary operators which requantize its input to dtype_nbit */ Expr CastDtypeInputRealize(const Call& ref_call, const Array& new_args, @@ -522,9 +553,6 @@ Expr CastDtypeInputRealize(const Call& ref_call, RELAY_REGISTER_OP("nn.max_pool2d") .set_attr("FQRealizeRewrite", CastDtypeInputRealize); -RELAY_REGISTER_OP("stop_fusion") -.set_attr("FQRealizeRewrite", CastDtypeInputRealize); - Expr AvgPoolRealize(const Call& ref_call, const Array& new_args, @@ -546,6 +574,29 @@ Expr AvgPoolRealize(const Call& ref_call, RELAY_REGISTER_OP("nn.avg_pool2d") .set_attr("FQRealizeRewrite", AvgPoolRealize); +Expr ForceCastRealize(const Call& ref_call, + const Array& new_args, + const NodeRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 1); + if (const auto* n = new_args[0].as()) { + Expr ret = Cast(n->data, cfg->dtype_input); + return QRealizeIntExprNode::make(ret, n->dom_scale, cfg->dtype_input); + } + CHECK(!new_args[0]->derived_from()); + return Expr(nullptr); +} + +RELAY_REGISTER_OP("force_cast") +.set_attr("FQRealizeRewrite", 
ForceCastRealize); + +TVM_REGISTER_API("relay._quantize.realize") +.set_body_typed([](const Expr& e) { + Expr ret = ForwardRewrite(e, "FQRealizeRewrite", nullptr, nullptr); + return ret; +}); + + // ============= // qconfig @@ -649,53 +700,46 @@ Pass QuantizeRealizePass() { TVM_REGISTER_API("relay._quantize.QuantizeRealize") .set_body_typed(QuantizeRealizePass); -class VtaStoreInjector : public ExprMutator { - private: - const CallNode* GetPreviousNode(const CallNode* n) { - if (n == nullptr || n->args.size() == 0) { - return nullptr; - } - return n->args[0].as(); - } +// ============= +// Insert stop_fusion for vta. - public: - Expr VisitExpr_(const CallNode* n) final { - static const Op& conv2d = Op::Get("nn.conv2d"); - static const Op& add = Op::Get("add"); - static const Op& relu = Op::Get("nn.relu"); - auto new_e = ExprMutator::VisitExpr_(n); - const CallNode* n0 = new_e.as(); - // conv->add->relu->[here] - if (n0 && n0->op.same_as(relu)) { - const CallNode* n1 = n0->args[0].as(); - if (n1 && n1->op.same_as(add)) { - const CallNode* n2 = n1->args[0].as(); - if (n2 && n2->op.same_as(conv2d)) { - return StopFusion(new_e); - } - } - } - // conv->add->[here]->add - if (n0 && n0->op.same_as(add)) { - const CallNode* n1 = n0->args[1].as(); - if (n1 && n1->op.same_as(add)) { - const CallNode* n2 = n1->args[0].as(); - if (n2 && n2->op.same_as(conv2d)) { - Expr child = StopFusion(n0->args[1]); - return CallNode::make(add, {n0->args[0], child}, Attrs{}, {}); - } - } - } - return new_e; - } -}; -Expr VtaStoreHint(const Expr& e) { - return VtaStoreInjector().Mutate(e); +Expr QVtaExprNode::Realize() const { + Expr ret = ForceCast(this->expr); + return StopFusion(ret); +} + +QVtaExpr QVtaExprNode::make(Expr expr) { + auto rnode = make_node(); + rnode->expr = expr; + return QVtaExpr(rnode); } -TVM_REGISTER_API("relay._quantize.vta_store_hint") -.set_body_typed(VtaStoreHint); +TVM_REGISTER_API("relay._quantize.rewrite_for_vta") +.set_body_typed([] (const Expr& expr) { + return ForwardRewrite(expr, "FQVtaRewrite", nullptr, nullptr); +}); + + +TVM_REGISTER_API("relay._quantize.make_vta_expr") +.set_body([](TVMArgs args, TVMRetValue *ret) { + *ret = QVtaExprNode::make(args[0]); + }); + + +TVM_REGISTER_API("relay._quantize.make_stop_fusion") +.set_body_typed([] (const Expr& expr) { + return StopFusion(expr); +}); + +TVM_REGISTER_API("relay._quantize.temp_expr_realize") +.set_body_typed([] (const Expr& expr) { + const QVtaExprNode* n = expr.as(); + CHECK(n); + return n->Realize(); +}); + + } // namespace quantize diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index da95a6c2134a..fce98e54459c 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,6 +72,30 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); +class QVtaExpr; +/*! + * \brief TempExprNode used during annotate forward rewrite. + */ +class QVtaExprNode : public TempExprNode { + public: + /*! \brief The original expression */ + Expr expr; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("expr", &expr); + } + + TVM_DLL static QVtaExpr make(Expr expr); + + Expr Realize() const final; + + static constexpr const char* _type_key = "relay.QVtaExpr"; + TVM_DECLARE_NODE_TYPE_INFO(QVtaExprNode, TempExprNode); +}; + +RELAY_DEFINE_NODE_REF(QVtaExpr, QVtaExprNode, TempExpr); + + /*! \brief TempExpr used during realize forward rewrite. */ class QRealizeExpr; /*! \brief TempExpr representing integer. 
 */

From 7b2d3067f11b75d22c056292686e6a2081b310d3 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Mon, 3 Jun 2019 17:55:56 -0700
Subject: [PATCH 038/126] fix bug from relay build config change

---
 vta/scripts/relay_to_vta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py
index 66af34f659e4..22c76bdda259 100644
--- a/vta/scripts/relay_to_vta.py
+++ b/vta/scripts/relay_to_vta.py
@@ -188,7 +188,7 @@ def run(device = "vta"):
         relay_graph = relay.ir_pass.fold_constant(relay_graph)
 
         # Compile Relay program.
-        with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}):
+        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
             if target.device_name != "vta":
                 graph, lib, params = relay.build(
                     relay_graph, target=target,

From d539d15745599237d938435681c90e842cb473fd Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 15:28:44 -0700
Subject: [PATCH 039/126] typo fix

---
 src/pass/make_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index dfb764f535c9..13f46ecb6f7a 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -184,7 +184,7 @@ LoweredFunc MakeAPI(Stmt body,
     for (Var v : undefined) {
       os << " \'" << v->name_hint << "\' ";
     }
-    os << " does not appeared in api_args";
+    os << " does not appear in api_args";
     LOG(FATAL) << "Not all Vars are passed in api_args: " << os.str();
   }
   return f;

From b1077181931cc82efafcc7f246add95a36d5c8ee Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 15:47:36 -0700
Subject: [PATCH 040/126] typo fix

---
 src/schedule/schedule_lang.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index e1cb4c5f9bdc..7532f4bcd31c 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -47,7 +47,7 @@ size_t FindLeafVar(ArrayNode* all_vars, ArrayNode* leaf_vars, const IterVar& v)
 
   if (FindNodeRef(all_vars, v) < all_vars->data.size()) {
     LOG(FATAL) << "Operate on iter var " << v
-               << "that has already been splitted";
+               << "that has already been split";
   } else {
     LOG(FATAL) << "Operate on iter var " << v
                << "that is not part of the schedule";

From c5936ba65ef4e5498f2025a77624fe5dde73b96b Mon Sep 17 00:00:00 2001
From: ZihengJiang
Date: Tue, 4 Jun 2019 16:47:25 -0700
Subject: [PATCH 041/126] Fix for tvm::build

---
 src/codegen/build_module.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 04a2fd6d4db9..488baa9bce46 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -580,6 +580,9 @@ runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
   Map<Target, Array<LoweredFunc>> updated_input;
   for (const auto& it : inputs) {
     auto target = Target::Create(it.first);
+    if (target->device_name == "vta") {
+      target = Target::Create("ext_dev");
+    }
     updated_input.Set(target, it.second);
   }
   return build(updated_input, target_host, config);

From 96b7529d4d61b946eeae084f5ad29f72839d421a Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 18:06:37 -0700
Subject: [PATCH 042/126] relay task extraction for VTA (wip)

---
 python/tvm/autotvm/task/nnvm_integration.py  | 19 +++--
 python/tvm/autotvm/task/relay_integration.py | 87 +++++++++++---------
 2 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 
e4d2b3fb8023..251a310cf7aa 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -66,8 +66,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host env = TaskExtractEnv.get() - #NOTE: To add more symbols, you only need to change the following lists - #nnvm symbol -> topi compute + # NOTE: To add more symbols, you only need to change the following lists + # nnvm symbol -> topi compute SYMBOL2TOPI = { nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, topi.nn.group_conv2d_nchw], @@ -81,14 +81,14 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - env.reset(topi_funcs) + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) with env: # disable logger temporarily old_state = logger.disabled logger.disabled = True - # run compiler to collect all TOPI calls during compilation nnvm.compiler.engine.clear_cache() nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, target_host=target_host, params=params) @@ -99,12 +99,14 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host tasks = [] for task_name, args in env.get_tasks(): try: + print(task_name) + print(args) tsk = create(task_name, args, target=target, target_host=target_host, template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid Shape during AutoTVM Task Creation") + print("[Warning] Invalid shape during AutoTVM task creation") return tasks @@ -157,15 +159,16 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - env.reset(topi_funcs) + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) with env: # disable logger temporarily old_state = logger.disabled logger.disabled = True - nnvm.compiler.engine.clear_cache() for graph, shape, dtype in zip(graphs, shapes, dtypes): + nnvm.compiler.engine.clear_cache() nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) logger.disabled = old_state @@ -179,7 +182,7 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid Shape during AutoTVM Task Creation") + print("[Warning] Invalid shape during AutoTVM task creation") return tasks diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 14caa70c0b84..c22496369637 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -57,11 +57,12 @@ def extract_from_program(func, params, ops, target, target_host=None): task: Array of autotvm.task.Task collected tasks """ - env = TaskExtractEnv.get() import tvm.relay.op from tvm import relay import topi + env = TaskExtractEnv.get() + # NOTE: To add more ops, you only need to change the following lists # relay op -> topi compute OP2TOPI = { @@ -81,30 +82,32 @@ def extract_from_program(func, params, ops, target, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # disable logger temporarily - old_state = 
logger.disabled - logger.disabled = True - - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - relay.backend.compile_engine.get().clear() - # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - tracing_target, - target_host, - params)) - build_thread.start() - build_thread.join() - logger.disabled = old_state + relay.backend.compile_engine.get().clear() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=relay.build, args=(func, + target, + target_host, + params)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) - + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") return tasks @@ -155,30 +158,32 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True - - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - - for func, param in zip(funcs, params): - # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - tracing_target, - target_host, - params)) - build_thread.start() - build_thread.join() - - logger.disabled = old_state + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + for func, param in zip(funcs, params): + relay.backend.compile_engine.get().clear() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=relay.build, args=(func, + target, + target_host, + params)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") return tasks From fadd29dd14816679b5bdb2e3a95f2e6fb3e75657 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 11:34:27 -0700 Subject: [PATCH 043/126] refactor relay to vta compilation script --- vta/scripts/relay_to_vta.py | 162 +++++++++++++++--------------------- 1 file changed, 68 insertions(+), 94 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 22c76bdda259..59bc8095608f 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -16,7 +16,7 @@ from vta.top import graph_pack parser = argparse.ArgumentParser(description='Train a model for image classification.') -parser.add_argument('--model', type=str, required=True, +parser.add_argument('--model', type=str, required=False, default='resnet18_v1', help='Input model name.') 
parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -31,15 +31,6 @@ opt = parser.parse_args() -if 'mobilenet' in opt.model: - opt.start_name = 'nn.relu' -elif 'gan' in opt.model: - opt.start_name = 'reshape0' - opt.stop_name = 'copy2' -elif 'rnn' in opt.model: - opt.start_name = 'reshape0' - opt.stop_name = 'reshape1' - # Helper function to read in image # Takes in Image object, returns an ND array def process_image(image): @@ -51,63 +42,11 @@ def process_image(image): return tvm.nd.array(image.astype("float32")) -def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): - # Read in ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - # Read in test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - # Read in test image - response = requests.get(image_url) - image = Image.open(BytesIO(response.content)).resize((224, 224)) - # Set the input - image = process_image(image) - if "gan" in opt.model or "rnn" in opt.model: - # non-classification networks require custom input shapes and out shapes - m.set_input('data', tvm.nd.array( - 10 * np.random.uniform(size=shape_dict['data']).astype(dtype_dict['data']))) - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("Performed inference in %.2fms/samlple (std = %.2f)" % (mean, std)) - else: - image = np.repeat(image.asnumpy(), env.BATCH, axis=0) - m.set_input('data', image) - # Perform inference - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - - if opt.debug_profile: - m.run() - - # Get classification results - tvm_output = m.get_output(0, - tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) - top_categories = np.argsort(tvm_output.asnumpy()[0]) - - # Report top-5 classification results - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("%s Prediction" % opt.model) - print(" #1:", synset[top_categories[-1]]) - print(" #2:", synset[top_categories[-2]]) - print(" #3:", synset[top_categories[-3]]) - print(" #4:", synset[top_categories[-4]]) - print(" #5:", synset[top_categories[-5]]) - print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) - -###################################################################### -# Setup the Pynq Board's RPC Server -# --------------------------------- -# Build the RPC server's VTA runtime and program the Pynq FPGA. 
+if __name__ == '__main__': -def run(device = "vta"): + # Read in VTA environment env = vta.get_env() + # Measure build start time reconfig_start = time.time() @@ -119,7 +58,12 @@ def run(device = "vta"): assert tvm.module.enabled("rpc") # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, '10.77.1.109', 9190, timeout=10000) + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) # Reconfigure the JIT runtime vta.reconfig_runtime(remote) @@ -138,9 +82,10 @@ def run(device = "vta"): remote = rpc.LocalSession() # TVM target and context - target = tvm.target.create("llvm -device={}".format(device)) - ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + target = tvm.target.create("llvm -device={}".format(opt.device)) + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) + # Get tophub schedules with autotvm.tophub.context(target): # Measure build start time @@ -152,54 +97,44 @@ def run(device = "vta"): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} - if "gan" in opt.model: - shape_dict = {"data": (env.BATCH, 100)} - elif 'rnn' in opt.model: - batch_size, seq_len, hidden_dim = 4, 1, 640 - begin_state_shape = (batch_size, hidden_dim, 1, 1) - shape_dict = {"data": (seq_len, batch_size), - "cell_l0_begin_state_0": begin_state_shape, - "cell_l1_begin_state_0": begin_state_shape} - dtype_dict = {"data": "int32", - "cell_l0_begin_state_0": 'float32', - "cell_l1_begin_state_0": 'float32'} - else: - shape_dict = {"data": (env.BATCH, 3, 224, 224)} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_graph, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_graph = relay.quantize.quantize(relay_graph, params=params) + relay_prog = relay.quantize.quantize(relay_prog, params=params) + # Perform graph packing and constant folding for VTA target if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT - relay_graph = graph_pack( - relay_graph, + relay_prog = graph_pack( + relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH, start_name=opt.start_name, stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) - relay_graph = relay.ir_pass.fold_constant(relay_graph) - - # Compile Relay program. 
+ # Compile Relay program with AlterOpLayout disabled with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( - relay_graph, target=target, + relay_prog, target=target, params=params, target_host=target_host) else: with vta.build_config(): graph, lib, params = relay.build( - relay_graph, target=target, + relay_prog, target=target, params=params, target_host=target_host) - # Save the compiled inference graph library assert tvm.module.enabled("rpc") temp = util.tempdir() @@ -213,13 +148,52 @@ def run(device = "vta"): build_time = time.time() - build_start print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) + # If detailed runtime info is needed build with debug runtime if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) else: m = graph_runtime.create(graph, lib, ctx) - # Set the parameters + # Set the network parameters m.set_input(**params) - demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict) -run(opt.device) + # Read in ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + + # Read in test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)).resize((224, 224)) + + # Set the input + image = process_image(image) + image = np.repeat(image.asnumpy(), env.BATCH, axis=0) + m.set_input('data', image) + + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + + # Display profile information + if opt.debug_profile: + m.run() + + # Get classification results + tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) + top_categories = np.argsort(tvm_output.asnumpy()[0]) + + # Report top-5 classification results + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("%s Prediction" % opt.model) + print(" #1:", synset[top_categories[-1]]) + print(" #2:", synset[top_categories[-2]]) + print(" #3:", synset[top_categories[-3]]) + print(" #4:", synset[top_categories[-4]]) + print(" #5:", synset[top_categories[-5]]) + print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) From 6ba0fff45387fda0bac42066dcd2cc17dec15127 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 12:06:35 -0700 Subject: [PATCH 044/126] further refactor, cleanup --- vta/scripts/relay_to_vta.py | 123 ++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 59bc8095608f..d75deea432ae 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -15,47 +15,47 @@ from vta.testing import simulator from vta.top import graph_pack -parser = argparse.ArgumentParser(description='Train a model for image classification.') -parser.add_argument('--model', type=str, required=False, default='resnet18_v1', - help='Input model name.') -parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') -parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') -parser.add_argument('--debug-profile', 
action='store_true', - help='Show layer-wise time cost profiling results') -parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') -parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - -opt = parser.parse_args() - -# Helper function to read in image -# Takes in Image object, returns an ND array -def process_image(image): - # Convert to neural network input format - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return tvm.nd.array(image.astype("float32")) +def classification_demo(opt): + """Image classification demo. -if __name__ == '__main__': + Parameters + ---------- + opt: a dictionary obtained from argparse + """ + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") # Read in VTA environment env = vta.get_env() - # Measure build start time - reconfig_start = time.time() + # Download ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + + # Download test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + response = requests.get(image_url) + + # Prepare test image for inference + image = Image.open(BytesIO(response.content)).resize((224, 224)) + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + image = np.repeat(image, env.BATCH, axis=0) # We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET != "sim": - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") + # Measure build start time + reconfig_start = time.time() # Get remote from fleet node tracket_host = os.environ.get("TVM_TRACKER_HOST", None) @@ -65,12 +65,10 @@ def process_image(image): exit() remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. + # Reconfigure the JIT runtime and FPGA. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. + vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) # Report on reconfiguration time @@ -78,10 +76,10 @@ def process_image(image): print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) # In simulation mode, host the RPC server locally. 
- elif env.TARGET == "sim": + else: remote = rpc.LocalSession() - # TVM target and context + # Create a TVM target and execution context target = tvm.target.create("llvm -device={}".format(opt.device)) ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) @@ -134,45 +132,25 @@ def process_image(image): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") + # Send the inference library over to the remote RPC server temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) - - # Send the inference library over to the remote RPC server remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Measure build time - build_time = time.time() - build_start - print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - # If detailed runtime info is needed build with debug runtime if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) else: m = graph_runtime.create(graph, lib, ctx) - # Set the network parameters + # Set the network parameters and inputs m.set_input(**params) - - # Read in ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - - # Read in test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - response = requests.get(image_url) - image = Image.open(BytesIO(response.content)).resize((224, 224)) - - # Set the input - image = process_image(image) - image = np.repeat(image.asnumpy(), env.BATCH, axis=0) m.set_input('data', image) # Perform inference @@ -197,3 +175,24 @@ def process_image(image): print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + + opt = parser.parse_args() + + classification_demo(opt) From 4a34d4b1dcfebaa2b93d44d2250c7fe8e8123886 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 18:53:18 -0700 Subject: [PATCH 045/126] relay based task extraction working --- python/tvm/autotvm/task/relay_integration.py | 36 ++- python/tvm/relay/op/nn/_nn.py | 2 +- vta/scripts/tune_resnet.py | 232 +++++++++---------- vta/scripts/tune_resnet_nnvm.py | 231 ++++++++++++++++++ 4 files changed, 370 insertions(+), 131 deletions(-) create mode 100644 vta/scripts/tune_resnet_nnvm.py diff --git 
a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index c22496369637..cb18653d8f37 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -33,6 +33,24 @@ logger = logging.getLogger('autotvm') +def my_build(func, + target, + target_host, + params): + """ VTA compatible relay build. + """ + + from tvm import relay + + if "vta" in target.device_name: + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name == "vta": + import vta + with vta.build_config(): + return relay.build(func, target, target_host, params) + else: + return relay.build(func, target, target_host, params) + def extract_from_program(func, params, ops, target, target_host=None): """ Extract tuning tasks from a relay program. @@ -89,10 +107,11 @@ def extract_from_program(func, params, ops, target, target_host=None): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - target, - target_host, - params)) + build_thread = threading.Thread(target=my_build, + args=(func, + target, + target_host, + params)) build_thread.start() build_thread.join() @@ -166,10 +185,11 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): for func, param in zip(funcs, params): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - target, - target_host, - params)) + build_thread = threading.Thread(target=my_build, + args=(func, + target, + target_host, + params)) build_thread.start() build_thread.join() diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 6c8f8f88795c..e796995d5b42 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -119,7 +119,7 @@ def compute_conv2d(attrs, inputs, out_type, target): if groups == 1: out = topi.nn.conv2d( inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype=out_dtype) + dilation, layout, out_dtype) elif layout == "NCHW" and \ get_const_int(inputs[1].shape[0]) == groups and \ get_const_int(inputs[1].shape[1]) == 1: diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index b22a63e09df8..9a4cf3ce6845 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,21 +1,21 @@ -import argparse -import os -import time +"""Perform inference on VTA using Relay.""" + +import argparse, os +from mxnet.gluon.model_zoo import vision import numpy as np +from PIL import Image +import topi import tvm -from tvm import rpc, autotvm +from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, util -from tvm.contrib.download import download - -import topi -import nnvm.compiler +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime import vta -import vta.testing - -env = vta.get_env() +from vta.testing import simulator +from vta.top import graph_pack +from tvm.autotvm.task import extract_from_program def register_vta_tuning_tasks(): from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args @@ -50,69 +50,6 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] - - -def 
generate_graph(sym, params, target, target_host): - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): - with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - - return graph, lib, params - - -def extract_tasks(sym, params, target, target_host): - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - with vta.build_config(): - tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, - params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) - return tasks - - -def download_model(): - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = 'synset.txt' - graph_fn = 'resnet18_qt8.json' - params_fn = 'resnet18_qt8.params' - data_dir = '_data' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - - for file in [categ_fn, graph_fn, params_fn]: - if not os.path.isfile(file): - download(os.path.join(url, file), os.path.join(data_dir, file)) - - sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) - params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) - - return sym, params - - def tune_tasks(tasks, measure_option, tuner='xgb', @@ -158,8 +95,103 @@ def tune_tasks(tasks, autotvm.record.pick_best(tmp_log_file, log_filename) os.remove(tmp_log_file) + +def extract_tasks(opt, env, target): + """Compile network and extract tasks. 
+ + Parameters + ---------- + opt: a dictionary of parameters obtained from argparse + env: the VTA environment + target: the TVM target + + + Returns + ------- + task: Array of autotvm.task.Task collected tasks + """ + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Create a TVM target and execution context + target_host = env.target_host + + # Get tophub schedules + with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + # Perform task extraction on Relay program + tasks = extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=target_host) + + return tasks + + if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + + opt = parser.parse_args() + + # Read in VTA environment + env = vta.get_env() + + # Target + target = tvm.target.vta() + # Get tracker info from env tracket_host = os.environ.get("TVM_TRACKER_HOST", None) tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) @@ -167,6 +199,7 @@ def tune_tasks(tasks, print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + # Set tuner options tuning_opt = { 'log_filename': 'resnet-18.log', @@ -177,55 +210,10 @@ def tune_tasks(tasks, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, - number=4, repeat=3, timeout=60, + number=4, 
min_repeat_ms=150, repeat=3, timeout=60, check_correctness=True)) } - # download model - sym, params = download_model() + tasks = extract_tasks(opt, env, target) - # register VTA tuning tasks - register_vta_tuning_tasks() - - # extract tasks - print("Extract tasks...") - target = tvm.target.vta() - target_host = env.target_host - tasks = extract_tasks(sym, params, target, target_host) - - print("Tuning...") tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): - print("Compile...") - graph, lib, params = generate_graph(sym, params, target, target_host) - input_shape = (1, 3, 224, 224) - dtype = 'float32' - - # export library - tmp = util.tempdir() - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # upload parameters to device - ctx = remote.context(str(target), 0) - rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module = graph_runtime.create(graph, rlib, ctx) - module.set_input('data', data_tvm) - module.set_input(**rparams) - - # evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond - print("Mean inference time (std dev): %.2f ms (%.2f ms)" % - (np.mean(prof_res), np.std(prof_res))) - diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py new file mode 100644 index 000000000000..b22a63e09df8 --- /dev/null +++ b/vta/scripts/tune_resnet_nnvm.py @@ -0,0 +1,231 @@ +import argparse +import os +import time +import numpy as np + +import tvm +from tvm import rpc, autotvm +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util +from tvm.contrib.download import download + +import topi +import nnvm.compiler +import vta +import vta.testing + +env = vta.get_env() + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + + +def generate_graph(sym, params, target, target_host): + # Populate the shape and data type 
dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=target_host) + + return graph, lib, params + + +def extract_tasks(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + with vta.build_config(): + tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, + params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) + return tasks + + +def download_model(): + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = 'synset.txt' + graph_fn = 'resnet18_qt8.json' + params_fn = 'resnet18_qt8.params' + data_dir = '_data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + for file in [categ_fn, graph_fn, params_fn]: + if not os.path.isfile(file): + download(os.path.join(url, file), os.path.join(data_dir, file)) + + sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) + params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) + + return sym, params + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True, + try_winograd=True): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + n_trial_ = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial_, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial_, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +if __name__ == '__main__': + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not 
tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + tuning_opt = { + 'log_filename': 'resnet-18.log', + + 'tuner': 'random', + 'n_trial': 1e9, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + number=4, repeat=3, timeout=60, + check_correctness=True)) + } + + # download model + sym, params = download_model() + + # register VTA tuning tasks + register_vta_tuning_tasks() + + # extract tasks + print("Extract tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): + print("Compile...") + graph, lib, params = generate_graph(sym, params, target, target_host) + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + From 59f1c026be8c3f2fe791d07e7a11a9c3c2d7ccd7 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 11:37:22 -0700 Subject: [PATCH 046/126] autotuning script refactor --- vta/scripts/tune_resnet.py | 226 ++++++++++++++++++-------------- vta/scripts/tune_resnet_nnvm.py | 44 ++++--- 2 files changed, 154 insertions(+), 116 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 9a4cf3ce6845..72e1395f0af2 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -17,6 +17,29 @@ from vta.top import graph_pack from tvm.autotvm.task import extract_from_program +def parse_arguments(): + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements during AutoTVM search') + parser.add_argument('--tuner', 
type=str, default="random", + help='AutoTVM search strategy') + parser.add_argument('--log-filename', type=str, default="resnet-18.log", + help='AutoTVM log file name') + + return parser.parse_args() + + def register_vta_tuning_tasks(): from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args @@ -50,6 +73,40 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] + +def compile_network(opt, env, target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + + def tune_tasks(tasks, measure_option, tuner='xgb', @@ -58,6 +115,7 @@ def tune_tasks(tasks, log_filename='tuning.log', use_transfer_learning=True, try_winograd=True): + # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): @@ -95,101 +153,17 @@ def tune_tasks(tasks, autotvm.record.pick_best(tmp_log_file, log_filename) os.remove(tmp_log_file) +if __name__ == '__main__': -def extract_tasks(opt, env, target): - """Compile network and extract tasks. 
- - Parameters - ---------- - opt: a dictionary of parameters obtained from argparse - env: the VTA environment - target: the TVM target - + opt = parse_arguments() - Returns - ------- - task: Array of autotvm.task.Task collected tasks - """ - # Make sure that TVM was compiled with RPC=1 assert tvm.module.enabled("rpc") - # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # Register VTA tuning tasks - register_vta_tuning_tasks() - - # Create a TVM target and execution context - target_host = env.target_host - - # Get tophub schedules - with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - # Perform task extraction on Relay program - tasks = extract_from_program(func=relay_prog, - params=params, - ops=(tvm.relay.op.nn.conv2d,), - target=target, - target_host=target_host) - - return tasks - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - - opt = parser.parse_args() - # Read in VTA environment env = vta.get_env() - # Target + # VTA target target = tvm.target.vta() # Get tracker info from env @@ -198,22 +172,80 @@ def extract_tasks(opt, env, target): if not tracket_host or not tracket_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + + # Compile Relay program + print("Initial compile...") + relay_prog, params = compile_network(opt, env, target) - # Set tuner options - tuning_opt = { - 'log_filename': 'resnet-18.log', + # Register VTA tuning tasks + register_vta_tuning_tasks() - 'tuner': 'random', + # Perform task extraction on Relay program + print("Extracting tasks...") + tasks = 
extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # Perform Autotuning + print("Tuning...") + tuning_opt = { + 'log_filename': opt.log_filename, + 'tuner': opt.tuner, 'n_trial': 1e9, 'early_stopping': None, - - 'measure_option': autotvm.measure_option( + 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, - number=4, min_repeat_ms=150, repeat=3, timeout=60, + number=4, min_repeat_ms=150, repeat=opt.measurements, timeout=60, check_correctness=True)) } - - tasks = extract_tasks(opt, env, target) - tune_tasks(tasks, **tuning_opt) + + # Compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[opt.log_filename]): + + # ResNet parameters + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # Compile network + print("Compiling network with best tuning parameters...") + relay_prog, params = compile_network(opt, env, target) + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + + # Export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # Upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # Upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index b22a63e09df8..433951570372 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -167,6 +167,20 @@ def tune_tasks(tasks, print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + # Download model + sym, params = download_model() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Extract tasks + print("Extracting tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + # Perform Autotuning + print("Tuning...") tuning_opt = { 'log_filename': 'resnet-18.log', @@ -180,41 +194,33 @@ def tune_tasks(tasks, number=4, repeat=3, timeout=60, check_correctness=True)) } - - # download model - sym, params = download_model() - - # register VTA tuning tasks - register_vta_tuning_tasks() - - # extract tasks - print("Extract tasks...") - target = tvm.target.vta() - target_host = env.target_host - tasks = extract_tasks(sym, params, target, target_host) - - print("Tuning...") 
tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): - print("Compile...") + + # ResNet parameters + input_shape = (1, 3, 224, 224) + dtype = 'float32'\ + + # Compile network + print("Compiling network with best tuning parameters...") graph, lib, params = generate_graph(sym, params, target, target_host) input_shape = (1, 3, 224, 224) dtype = 'float32' - # export library + # Export library tmp = util.tempdir() filename = "net.tar" lib.export_library(tmp.relpath(filename)) - # upload module to device + # Upload module to device print("Upload...") remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - # upload parameters to device + # Upload parameters to device ctx = remote.context(str(target), 0) rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) @@ -222,7 +228,7 @@ def tune_tasks(tasks, module.set_input('data', data_tvm) module.set_input(**rparams) - # evaluate + # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond From 51f1ee09b037e095bf250da74ed50cb7a8172f96 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 17:18:21 -0700 Subject: [PATCH 047/126] refactoring, debug runtime --- vta/scripts/relay_to_vta.py | 18 +++--- vta/scripts/tune_resnet.py | 97 ++++++++++++++++++++------------- vta/scripts/tune_resnet_nnvm.py | 10 ++-- 3 files changed, 75 insertions(+), 50 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index d75deea432ae..c47ae09ef2a2 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -1,6 +1,6 @@ """Perform inference on VTA using Relay.""" -import argparse, json, requests, time +import argparse, json, os, requests, time from io import BytesIO from mxnet.gluon.model_zoo import vision import numpy as np @@ -50,6 +50,13 @@ def classification_demo(opt): image = image[np.newaxis, :] image = np.repeat(image, env.BATCH, axis=0) + # For tuning, make sure tracker variables are set + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + # We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET != "sim": @@ -58,12 +65,7 @@ def classification_demo(opt): reconfig_start = time.time() # Get remote from fleet node - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. 
# You can program the FPGA with your own custom bitstream @@ -84,7 +86,7 @@ def classification_demo(opt): ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Get tophub schedules - with autotvm.tophub.context(target): + with autotvm.tophub.context(target, extra_files=["resnet-18.log"]): # Measure build start time build_start = time.time() diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 72e1395f0af2..18dac6df01e7 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,6 +1,6 @@ """Perform inference on VTA using Relay.""" -import argparse, os +import argparse, os, time from mxnet.gluon.model_zoo import vision import numpy as np from PIL import Image @@ -163,15 +163,39 @@ def tune_tasks(tasks, # Read in VTA environment env = vta.get_env() - # VTA target - target = tvm.target.vta() - - # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: + # Get remote from fleet node + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + + # Get remote + if env.TARGET != "sim": + + # Measure build start time + reconfig_start = time.time() + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. 
+ else: + remote = rpc.LocalSession() + + # VTA target and execution context + target = tvm.target.vta() + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Compile Relay program print("Initial compile...") @@ -197,22 +221,18 @@ def tune_tasks(tasks, 'early_stopping': None, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port, number=4, min_repeat_ms=150, repeat=opt.measurements, timeout=60, check_correctness=True)) } tune_tasks(tasks, **tuning_opt) # Compile kernels with history best records - with autotvm.tophub.context(target, extra_files=[opt.log_filename]): - - # ResNet parameters - input_shape = (1, 3, 224, 224) - dtype = 'float32' + with autotvm.tophub.context(target, extra_files=[opt.log_filename]): # Compile network print("Compiling network with best tuning parameters...") - relay_prog, params = compile_network(opt, env, target) + # relay_prog, params = compile_network(opt, env, target) with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( @@ -225,27 +245,30 @@ def tune_tasks(tasks, params=params, target_host=env.target_host) # Export library - tmp = util.tempdir() - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # Upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # Upload parameters to device - ctx = remote.context(str(target), 0) - rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module = graph_runtime.create(graph, rlib, ctx) - module.set_input('data', data_tvm) - module.set_input(**rparams) - - # Evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # If detailed runtime info is needed build with debug runtime + if opt.debug_profile: + m = debug_runtime.create(graph, lib, ctx) + else: + m = graph_runtime.create(graph, lib, ctx) + + # Set the network parameters and synthetic input + image = tvm.nd.array( + (np.random.uniform(size=(1, 3, 224, 224))).astype('float32')) + m.set_input(**params) + m.set_input('data', image) + + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) + tcost = timer() + prof_res = np.array(tcost.results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) + + # Display profile information + if opt.debug_profile: + m.run() diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index 433951570372..3a6149df267c 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -161,9 +161,9 @@ def tune_tasks(tasks, if __name__ == '__main__': # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not 
tracket_port: + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() @@ -190,7 +190,7 @@ def tune_tasks(tasks, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port, number=4, repeat=3, timeout=60, check_correctness=True)) } @@ -216,7 +216,7 @@ def tune_tasks(tasks, # Upload module to device print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) From e92c0c2838476910612d78c6e33f687dea4db199 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 17:34:36 -0700 Subject: [PATCH 048/126] removing debug messages --- python/tvm/relay/quantize/quantize.py | 10 ---------- vta/python/vta/top/graphpack.py | 2 -- 2 files changed, 12 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index da881db26fb2..6fc3f9ed57fc 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -374,20 +374,10 @@ def quantize_vta(graph, params=None, dataset=None): # TODO(zhiics) Move this to the pass manager. graph = optimize(graph, params) - - print('original graph') - print(graph) graph = _quantize.rewrite_for_vta(graph) - print('after rewrite for vta') - print(graph) - graph = annotate(graph) graph = calibrate(graph, dataset) - print('after calibrate') - print(graph) graph = realize(graph) graph = _ir_pass.fold_constant(graph) - print('after realize') - print(graph) return graph diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 3ce50d06dbda..770dd380403d 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -265,13 +265,11 @@ def graph_pack(expr, """ assert isinstance(expr, relay.Function) expr = get_subgraph(expr, start_name, stop_name) - print("Before", expr.astext(show_meta_data=False)) expr = relay.ir_pass.infer_type(expr) packer = ExprPack( bfactor, cfactor, weight_bits) expr = packer.visit(expr) - print("After", expr.astext(show_meta_data=False)) assert not packer.start_pack return relay.ir_pass.infer_type(expr) From affd1581403a3f264b190d988149130d8860fa53 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 10:02:26 -0700 Subject: [PATCH 049/126] proper argparsing, and target setting --- vta/scripts/relay_to_vta.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index c47ae09ef2a2..c71e6f61f37a 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -82,11 +82,11 @@ def classification_demo(opt): remote = rpc.LocalSession() # Create a TVM target and execution context - target = tvm.target.create("llvm -device={}".format(opt.device)) + target = env.target if opt.device == "vta" else env.target_vta_cpu ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Get tophub schedules - with autotvm.tophub.context(target, extra_files=["resnet-18.log"]): + with autotvm.tophub.context(target): # Measure build 
start time build_start = time.time() @@ -182,7 +182,7 @@ def classification_demo(opt): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], help='Input model name.') parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -190,8 +190,8 @@ def classification_demo(opt): help='The name of the node where packing stops') parser.add_argument('--debug-profile', action='store_true', help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') parser.add_argument('--measurements', type=int, default=1, help='Number of measurements') From 8f277bf4125c769b1117e2abd8e79e0555f54ddb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 10:04:08 -0700 Subject: [PATCH 050/126] adding dense tuning --- vta/scripts/tune_resnet.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 18dac6df01e7..463187f3caa3 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -20,7 +20,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], help='Input model name.') parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -28,8 +28,8 @@ def parse_arguments(): help='The name of the node where packing stops') parser.add_argument('--debug-profile', action='store_true', help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') parser.add_argument('--measurements', type=int, default=1, help='Number of measurements during AutoTVM search') parser.add_argument('--tuner', type=str, default="random", @@ -74,6 +74,23 @@ def _topi_nn_conv2d(*args, **kwargs): return s, [A, W, res] + @autotvm.task.register("topi_nn_dense", override=True) + def _topi_nn_dense(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.dense(*args, **kwargs) + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [A, W, res] + + def compile_network(opt, env, target): # Populate the shape and data type dictionary @@ -194,7 +211,7 @@ def tune_tasks(tasks, remote = rpc.LocalSession() # VTA target and execution context - target = tvm.target.vta() + target = env.target if opt.device == "vta" else env.target_vta_cpu ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Compile Relay program @@ -208,7 +225,8 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = 
extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d,), + ops=(tvm.relay.op.nn.conv2d, + tvm.relay.op.nn.dense), target=target, target_host=env.target_host) From be4d3a1f5ab1a8ac0aa648c132e3b653adfea02f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 14:34:12 -0700 Subject: [PATCH 051/126] updated tutorial to use Relay --- vta/tutorials/resnet.py | 324 ++++++++++++++-------------------------- 1 file changed, 116 insertions(+), 208 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 13161586480e..d3ed0cebe79d 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -24,292 +24,200 @@ """ + ###################################################################### # Import Libraries # ---------------- -# We start by importing the tvm, vta, nnvm libraries to run this example. +# We start by importing libraries to run this example. from __future__ import absolute_import, print_function -import os -import time +import argparse, json, os, requests, time from io import BytesIO +from os.path import join, isfile +from PIL import Image +from mxnet.gluon.model_zoo import vision import numpy as np -import requests from matplotlib import pyplot as plt -from PIL import Image import tvm -from tvm import rpc, autotvm -from tvm.contrib import graph_runtime, util -from tvm.contrib.download import download -import nnvm.compiler -import vta -import vta.testing +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() +import vta +from vta.testing import simulator +from vta.top import graph_pack -# Helper to crop an image to a square (224, 224) -# Takes in an Image object, returns an Image object -def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - -# Helper function to read in image -# Takes in Image object, returns an ND array -def process_image(image): - # Convert to neural network input format - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - - return tvm.nd.array(image.astype("float32")) - -# Classification helper function -# Takes in the graph runtime, and an image, and returns top result and time -def classify(m, image): - m.set_input('data', image) - timer = m.module.time_evaluator("run", ctx, number=1) - tcost = timer() - tvm_output = m.get_output(0) - top = np.argmax(tvm_output.asnumpy()[0]) - tcost = "t={0:.2f}s".format(tcost.mean) - return tcost + " {}".format(synset[top]) +# Make sure that TVM was compiled with RPC=1 +assert tvm.module.enabled("rpc") ###################################################################### -# Download ResNet Model -# -------------------------------------------- -# Download the necessary files to run ResNet-18. -# - -# Obtain ResNet model and download them into _data dir -url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" -categ_fn = 'synset.txt' -graph_fn = 'resnet18_qt8.json' -params_fn = 'resnet18_qt8.params' +# Define the platform and model targets +# ---------------- +# Execute on CPU vs. VTA, and define the model. 
-# Create data dir -data_dir = "_data/" -if not os.path.exists(data_dir): - os.makedirs(data_dir) +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() -# Download files -for file in [categ_fn, graph_fn, params_fn]: - download(os.path.join(url, file), os.path.join(data_dir, file)) +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu -# Read in ImageNet Categories -synset = eval(open(os.path.join(data_dir, categ_fn)).read()) +# Name of Gluon model to compile +model = "resnet18_v1" +start_pack="nn.max_pool2d" +stop_pack="nn.global_avg_pool2d" ###################################################################### -# Setup the Pynq Board's RPC Server +# Obtain an execution remote # --------------------------------- -# Build the RPC server's VTA runtime and program the Pynq FPGA. - -# Measure build start time -reconfig_start = time.time() - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq": - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET != "sim": + + # Get remote from fleet node if environment variable is set + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, device_port) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) # In simulation mode, host the RPC server locally. -elif env.TARGET == "sim": +else: remote = rpc.LocalSession() +# Get execution context from remote +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the ResNet Runtime +# Build the inference runtime # ------------------------ -# Build the ResNet graph runtime, and configure the parameters. - -# Set ``device=vtacpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" - -# TVM target and context -target = tvm.target.create("llvm -device={}".format(device)) -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - -# TVM module -m = None +# Build ResNet from Gluon with Relay. 
+# Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): - graph_fn = os.path.join(data_dir, graph_fn) - params_fn= os.path.join(data_dir, params_fn) + # Populate the shape and data type dictionary for ResNet input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) # Measure build start time build_start = time.time() - # Load the ResNet-18 graph and parameters - sym = nnvm.graph.load_json(open(graph_fn).read()) - params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) + # Start front end compilation + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + print(relay_prog) + # exit() - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} + # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, + graph, lib, params = relay.build( + relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, + graph, lib, params = relay.build( + relay_prog, target=target, params=params, target_host=env.target_host) - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) # Send the inference library over to the remote RPC server + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Measure build time - build_time = time.time() - build_start - print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) - + # Graph runtime m = graph_runtime.create(graph, lib, ctx) - # Set the parameters - m.set_input(**params) - ###################################################################### -# Run ResNet-18 inference on a sample image -# ----------------------------------------- -# Perform image classification on test image. -# You can change the test image URL to any image of your choosing. 
+# Perform ResNet-18 inference +# ------------------------ +# We run classification on an image sample from ImageNet + +# Download ImageNet categories +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/synset.txt" +categ_fn = "synset.txt" +download.download(join(categ_url, categ_fn), categ_fn) +synset = eval(open(categ_fn).read()) -# Read in test image +# Download test image image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' -# Read in test image response = requests.get(image_url) + +# Prepare test image for inference image = Image.open(BytesIO(response.content)).resize((224, 224)) -# Show Image plt.imshow(image) plt.show() -# Set the input -image = process_image(image) +image = np.array(image) - np.array([123., 117., 104.]) +image /= np.array([58.395, 57.12, 57.375]) +image = image.transpose((2, 0, 1)) +image = image[np.newaxis, :] +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) m.set_input('data', image) # Perform inference -timer = m.module.time_evaluator("run", ctx, number=1) +timer = m.module.time_evaluator("run", ctx, number=4, repeat=3) tcost = timer() # Get classification results -tvm_output = m.get_output(0) +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) top_categories = np.argsort(tvm_output.asnumpy()[0]) # Report top-5 classification results -print("ResNet-18 Prediction #1:", synset[top_categories[-1]]) +std = np.std(tcost.results) * 1000 / env.BATCH +mean = tcost.mean * 1000 / env.BATCH +print("%s prediction" % model) +print(" #1:", synset[top_categories[-1]]) print(" #2:", synset[top_categories[-2]]) print(" #3:", synset[top_categories[-3]]) print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) -print("Performed inference in {0:.2f}s".format(tcost.mean)) - - -###################################################################### -# Run a Youtube Video Image Classifier -# ------------------------------------ -# Perform image classification on test stream on 1 frame every 48 frames. 
-# Comment the `if False:` out to run the demo - -# Early exit - remove for Demo -if False: - - import cv2 - import pafy - from IPython.display import clear_output - - # Helper to crop an image to a square (224, 224) - # Takes in an Image object, returns an Image object - def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - - # 16:16 inches - plt.rcParams['figure.figsize'] = [16, 16] - - # Stream the video in - url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" - video = pafy.new(url) - best = video.getbest(preftype="mp4") - cap = cv2.VideoCapture(best.url) - - # Process one frame out of every 48 for variety - count = 0 - guess = "" - while(count<2400): - - # Capture frame-by-frame - ret, frame = cap.read() - - # Process one every 48 frames - if count % 48 == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = Image.fromarray(frame) - # Crop and resize - thumb = np.array(thumbnailify(frame)) - image = process_image(thumb) - guess = classify(m, image) - - # Insert guess in frame - frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) - cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - - plt.imshow(thumb) - plt.axis('off') - plt.show() - if cv2.waitKey(1) & 0xFF == ord('q'): - break - clear_output(wait=True) - - count += 1 - - # When everything done, release the capture - cap.release() - cv2.destroyAllWindows() +print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) From 05b08fc8f33f97c8362382f6e4e9cf7f02f49949 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 11 Jun 2019 15:18:06 -0700 Subject: [PATCH 052/126] setup for colab --- package.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 package.sh diff --git a/package.sh b/package.sh new file mode 100755 index 000000000000..da227738637d --- /dev/null +++ b/package.sh @@ -0,0 +1,6 @@ +echo "Installing Dependencies ..." 
+echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 +sudo apt-get update +sudo apt-get install -y -q llvm-6.0 libtinfo-dev libffi-dev zlib1g-dev clinfo tree +sudo apt-get install verilator sbt From 1ef1e2509c2a03af3cc865623a80094361d1d6a5 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 11 Jun 2019 23:23:09 -0700 Subject: [PATCH 053/126] fix url --- vta/tutorials/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index d3ed0cebe79d..9caa6cdafbd6 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -180,7 +180,7 @@ # We run classification on an image sample from ImageNet # Download ImageNet categories -categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/synset.txt" +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" categ_fn = "synset.txt" download.download(join(categ_url, categ_fn), categ_fn) synset = eval(open(categ_fn).read()) From 1619a11a8ab2270ab6556a773bbffce9a1035537 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 00:00:05 -0700 Subject: [PATCH 054/126] dense operator placeholder --- vta/python/vta/top/vta_dense.py | 171 ++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 vta/python/vta/top/vta_dense.py diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py new file mode 100644 index 000000000000..f2fdbc7e93a4 --- /dev/null +++ b/vta/python/vta/top/vta_dense.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Dense operator declaration and schedule registration for VTA.""" + +import numpy as np +import tvm +from tvm import autotvm +import topi + +from ..environment import get_env + +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + +@autotvm.register_topi_compute(topi.nn.dense, 'vta', 'direct') +def _declaration_dense(cfg, + data, + weight, + bias=None, + out_dtype=None): + """Dense function declaration.""" + + # Make sure that the dense operator is packed + assert len(data.shape) == 4 + assert len(weight.shape) == 4 + # Derive output shape + oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) + + # Reduction axes (input channel) + assert(data.shape[1] == weight.shape[1]) + assert(data.shape[3] == weight.shape[3]) + k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') + k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') + + res = tvm.compute( + oshape, + lambda b_o, c_o, b_i, c_i: tvm.sum( + data[b_o, k_o, b_i, k_i].astype(out_dtype) * + weight[c_o, k_o, c_i, k_i].astype(out_dtype), + axis=[k_o, k_i]), + name="res", tag="packed_dense") + + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * + data.shape[1] * data.shape[3]) + return res + +@autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') +def _schedule_dense(cfg, outs): + """Packed dense schedule.""" + + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + dense_res = [] + assert "int" in output.op.input_tensors[0].dtype + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if len(op.axis) == 0: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + assert op.tag == "dense" + dense_res.append(op) + + _traverse(output.op) + assert len(dense_res) == 1 + dense_stage = dense_res[0].output(0) + s = tvm.create_schedule(output.op) + + ##### space definition begin ##### + b, co, _, _ = s[dense_stage].op.axis + ci, _ = s[dense_stage].op.reduce_axis + cfg.define_split('tile_b', b, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_knob('oc_nthread', [1, 2]) + cfg.define_knob('h_nthread', [1, 2]) + ###### space definition end ###### + + data, kernel = dense_stage.op.input_tensors + + env = get_env() + + cdata = s.cache_read(data, env.inp_scope, [dense_stage]) + ckernel = s.cache_read(kernel, env.wgt_scope, [dense_stage]) + s[dense_stage].set_scope(env.acc_scope) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + # set ewise scope + for op in ewise_ops: + s[op].set_scope(env.acc_scope) + s[op].pragma(s[op].op.axis[0], env.alu) + + for op in const_ops: + s[op].compute_inline() + + # tile + x_bo, x_co, x_bi, x_ci = s[output].op.axis + x_bo0, x_bo1 = cfg['tile_b'].apply(s, output, x_bo) + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + s[output].reorder(x_bo0, x_co0, x_bo1, x_co1, x_bi, x_ci) + store_pt = x_co0 + + # set all compute scopes + s[dense_stage].compute_at(s[output], store_pt) + for op in ewise_ops: + s[op].compute_at(s[output], store_pt) + + for tensor in cache_read_ewise: + s[tensor].compute_at(s[output], store_pt) 
+ s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + + # virtual threading along output channel axes + if cfg['oc_nthread'].val > 1: + _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + # virtual threading along spatial rows + if cfg['h_nthread'].val > 1: + _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis + k_o, k_i = s[dense_stage].op.reduce_axis + s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) + + k_o, _ = cfg['tile_ci'].apply(s, dense_stage, k_o) + s[cdata].compute_at(s[dense_stage], k_o) + s[ckernel].compute_at(s[dense_stage], k_o) + + # Use VTA instructions + s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) + s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[dense_stage].tensorize(x_bi, env.gemm) + s[output].pragma(x_co1, env.dma_copy) From e0e1bc7f8410094860778a53811716ef146aba2b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 12:52:50 -0700 Subject: [PATCH 055/126] fix support for pass manager --- python/tvm/relay/quantize/quantize.py | 43 +++++++-------------------- src/relay/pass/quantize.cc | 21 +++++++------ 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 6fc3f9ed57fc..df6f8b9e139c 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -279,6 +279,17 @@ def realize(): return _quantize.QuantizeRealize() +def rewrite_for_vta(): + """Performs rewriting for VTA target. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass for VTA rewrite. + """ + return _quantize.QuantizeRewriteForVTA() + + def _bind_params(func, params): """Bind the params to the expression. """ @@ -349,35 +360,3 @@ def quantize(graph, params=None, dataset=None): mod = optimize(mod) mod = quantize_seq(mod) return mod[mod.entry_func.name_hint] - -def quantize_vta(graph, params=None, dataset=None): - - """ The quantization procedure for VTA specifically. - - Parameters - --------- - graph: Function - The original graph. - - params : dict of str to NDArray - Input parameters to the graph that do not change - during inference time. Used for constant folding. - - dataset: list of dict of Var -> NDArray - The calibration dataset. - - Returns - ------- - ret: Function - The graph after quantization - """ - - # TODO(zhiics) Move this to the pass manager. 
- graph = optimize(graph, params) - graph = _quantize.rewrite_for_vta(graph) - graph = annotate(graph) - graph = calibrate(graph, dataset) - graph = realize(graph) - graph = _ir_pass.fold_constant(graph) - - return graph diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 8fbe290ad60b..cb64902d74f9 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -700,6 +700,18 @@ Pass QuantizeRealizePass() { TVM_REGISTER_API("relay._quantize.QuantizeRealize") .set_body_typed(QuantizeRealizePass); +Pass QuantizeRewriteForVTAPass() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + ForwardRewrite(f, "FQVtaRewrite", nullptr, nullptr)); + }; + return CreateFunctionPass(pass_func, 1, "QuantizeRewriteForVTA", {}); +} + +TVM_REGISTER_API("relay._quantize.QuantizeRewriteForVTA") +.set_body_typed(QuantizeRewriteForVTAPass); + // ============= // Insert stop_fusion for vta. @@ -715,18 +727,11 @@ QVtaExpr QVtaExprNode::make(Expr expr) { return QVtaExpr(rnode); } -TVM_REGISTER_API("relay._quantize.rewrite_for_vta") -.set_body_typed([] (const Expr& expr) { - return ForwardRewrite(expr, "FQVtaRewrite", nullptr, nullptr); -}); - - TVM_REGISTER_API("relay._quantize.make_vta_expr") .set_body([](TVMArgs args, TVMRetValue *ret) { *ret = QVtaExprNode::make(args[0]); }); - TVM_REGISTER_API("relay._quantize.make_stop_fusion") .set_body_typed([] (const Expr& expr) { return StopFusion(expr); @@ -740,8 +745,6 @@ TVM_REGISTER_API("relay._quantize.temp_expr_realize") }); - - } // namespace quantize } // namespace relay } // namespace tvm From 0b4addb416e163c4096947eb486fe5bf1d33c881 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 17:02:10 -0700 Subject: [PATCH 056/126] dense op benchmark --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/op.py | 21 +- vta/python/vta/top/vta_conv2d.py | 29 +-- vta/python/vta/top/vta_dense.py | 20 +- .../integration/test_benchmark_topi_conv2d.py | 3 +- .../integration/test_benchmark_topi_dense.py | 185 ++++++++++++++++++ 6 files changed, 229 insertions(+), 30 deletions(-) create mode 100644 vta/tests/python/integration/test_benchmark_topi_dense.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 5111035decd3..ee2b5ec21ef8 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -7,3 +7,4 @@ from . import nnvm_op from . import op from . import vta_conv2d +from . 
import vta_dense diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 7f3c58a46116..5d6cfadc34f4 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -31,6 +31,7 @@ def compute_clip(attrs, inputs, output_type, target): x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return [x] + @reg.register_compute("nn.conv2d", level=15) def compute_conv2d(attrs, inputs, output_type, target): """ Compute definition of conv2d """ @@ -41,13 +42,13 @@ def compute_conv2d(attrs, inputs, output_type, target): layout = attrs.data_layout out_dtype = attrs.out_dtype - assert dilation == (1, 1), "not support dilate now" + assert dilation == (1, 1), "support for dilation limited to (1, 1)" if is_packed_layout(layout): if groups == 1: assert groups == 1 env = get_env() assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" inputs = list(inputs) assert inputs[1].dtype == "int8" return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] @@ -57,6 +58,7 @@ def compute_conv2d(attrs, inputs, output_type, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) + @reg.register_schedule("nn.conv2d", level=15) def schedule_conv2d(attrs, outs, target): """ Schedule definition of conv2d """ @@ -77,3 +79,18 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + + +@reg.register_compute("nn.dense", level=15) +def compute_dense(attrs, inputs, out_type, target): + """Compute definition of dense""" + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + +@reg.register_schedule("nn.dense", level=15) +def schedule_dense(attrs, outputs, target): + """Schedule definition of dense""" + with target: + return topi.generic.schedule_dense(outputs) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index eef047965a56..15d45029af82 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +"""Conv2D operator declaration and schedule registration for VTA.""" import numpy as np import tvm @@ -32,14 +32,14 @@ def is_packed_layout(layout): return False @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') -def packed_conv2d(cfg, - data, - kernel, - strides, - padding, - dilation, - layout, - out_dtype): +def _declaration_conv2d(cfg, + data, + kernel, + strides, + padding, + dilation, + layout, + out_dtype): """ Packed conv2d function.""" if not is_packed_layout(layout): raise topi.InvalidShapeError() @@ -68,14 +68,14 @@ def packed_conv2d(cfg, pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), - name="res", tag="packed_conv2d") + name="res", tag="conv2d") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] * ishape[-1]) return res @autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') -def schedule_packed_conv2d(cfg, outs): +def _schedule_conv2d(cfg, outs): assert len(outs) == 1 output = outs[0] const_ops = [] @@ -97,7 +97,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "packed_conv2d" + assert op.tag == "conv2d" conv2d_res.append(op) _traverse(output.op) @@ -106,8 +106,8 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, h, w, bi, ci = s[conv2d_stage].op.axis - ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis + b, co, h, w, _, _ = s[conv2d_stage].op.axis + ci, _, _, _ = s[conv2d_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) cfg.define_split('tile_h', h, num_outputs=2) cfg.define_split('tile_w', w, num_outputs=2) @@ -192,4 +192,5 @@ def _traverse(op): s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) s[conv2d_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) + return s diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index f2fdbc7e93a4..69f96f1aba29 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -44,13 +44,12 @@ def _declaration_dense(cfg, assert len(weight.shape) == 4 # Derive output shape oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) - + # Reduction axes (input channel) - assert(data.shape[1] == weight.shape[1]) - assert(data.shape[3] == weight.shape[3]) + assert(int(data.shape[1]) == int(weight.shape[1])) + assert(int(data.shape[3]) == int(weight.shape[3])) k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') - res = tvm.compute( oshape, lambda b_o, c_o, b_i, c_i: tvm.sum( @@ -58,7 +57,7 @@ def _declaration_dense(cfg, weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), name="res", tag="packed_dense") - + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * data.shape[1] * data.shape[3]) return res @@ -88,7 +87,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "dense" + assert op.tag == "packed_dense" dense_res.append(op) _traverse(output.op) @@ -103,7 +102,6 @@ def _traverse(op): cfg.define_split('tile_co', co, num_outputs=2) cfg.define_split('tile_ci', ci, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) - cfg.define_knob('h_nthread', [1, 2]) ###### space definition end ###### data, kernel = dense_stage.op.input_tensors @@ -150,12 +148,6 @@ def _traverse(op): s[output].reorder(v_t, x_bo) 
s[output].bind(v_t, tvm.thread_axis("cthread")) - # virtual threading along spatial rows - if cfg['h_nthread'].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) - x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis k_o, k_i = s[dense_stage].op.reduce_axis s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) @@ -169,3 +161,5 @@ def _traverse(op): s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) s[dense_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) + + return s \ No newline at end of file diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index dc7b5d710c29..2aec47118e44 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Testing if we can generate code in topi style""" + +"""Testing topi conv2d operator for VTA""" import os import json diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py new file mode 100644 index 000000000000..6759cc19b292 --- /dev/null +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Testing topi gemm operator for VTA""" + +import os +import json +from collections import namedtuple + +import numpy as np + +import tvm +from tvm import autotvm +from tvm.contrib import util +from tvm.contrib.pickle_memoize import memoize +import topi +import topi.testing +import vta +from vta import program_fpga, reconfig_runtime +import vta.testing +from vta.testing import simulator + +# FIXME: we need a custom clip operator to circumvent a pattern detection limitation +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def run_gemm(env, remote, target, + batch_size, in_feat, out_feat, + check_correctness=True, print_ir=True, + samples=4): + + # Perform packing only if we are targeting the accelerator + if "arm_cpu" in target.keys: + data_pack = False + elif "vta" in target.keys: + data_pack = True + + # Derive shapes depending upon packing + a_shape = (batch_size, in_feat) + w_shape = (out_feat, in_feat) + if data_pack: + data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN, + env.BATCH, env.BLOCK_IN) + kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, + env.BLOCK_OUT, env.BLOCK_IN) + else: + data_shape = a_shape + kernel_shape = w_shape + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + # Define base computation schedule + with target: + res = topi.nn.dense( + data, kernel, None, env.acc_dtype) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) + res = topi.cast(res, env.out_dtype) + # Derive base schedule + s = topi.generic.schedule_dense([res]) + if print_ir: + print(vta.lower(s, [data, kernel, res], simple_mode=True)) + + # Derive number of ops + num_ops = 2 * batch_size * in_feat * out_feat + + # @memoize("vta.tests.test_benchmark_topi.dense.verify") + def get_ref_data(): + # derive min max for act, wgt types (max non inclusive) + a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) + w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) + a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) + w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) + + r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype) + return a_np, w_np, r_np + + # Data in original format + data_np, kernel_np, res_ref = get_ref_data() + if data_pack: + data_np = data_np.reshape( + batch_size//env.BATCH, env.BATCH, + in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + kernel_np = kernel_np.reshape( + out_feat//env.BLOCK_OUT, env.BLOCK_OUT, + in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + + # Build + if "vta" in target.keys: + mod = vta.build(s, [data, kernel, res], + target=target, + target_host=env.target_host, + name="dense") + else: + mod = tvm.build(s, [data, kernel, res], + target=target, + target_host=env.target_host, + name="dense") + temp = util.tempdir() + mod.save(temp.relpath("dense.o")) + remote.upload(temp.relpath("dense.o")) + f = remote.load_module("dense.o") + ctx = remote.context(str(target)) + + res_np = 
np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + data_arr = tvm.nd.array(data_np, ctx) + kernel_arr = tvm.nd.array(kernel_np, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=samples) + + # In vta sim mode, collect simulator runtime statistics + stats = {} + cost = None + if env.TARGET == "sim": + # Check if we're in local RPC mode (allows us to rebuild the + # runtime on the fly when varying the VTA designs) + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote.get_function("vta.simulator.profiler_clear")() + cost = time_f(data_arr, kernel_arr, res_arr) + stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) + else: + simulator.clear_stats() + cost = time_f(data_arr, kernel_arr, res_arr) + stats = simulator.stats() + else: + cost = time_f(data_arr, kernel_arr, res_arr) + + # Check correctness + correct = False + if check_correctness: + res_orig = res_arr.asnumpy() + if data_pack: + res_orig = res_orig.reshape(batch_size, out_feat) + res_ref = res_ref >> 8 + res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) + res_ref = res_ref.astype(env.out_dtype) + correct = np.allclose(res_orig, res_ref) + + gops = (num_ops / cost.mean) / float(10 ** 9) + status = "PASSED" if correct else "FAILED" + if "arm_cpu" in target.keys: + device = "CPU" + elif "vta" in target.keys: + device = "VTA" + print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + + return correct, cost, stats + +def test_gemm(device="vta", batch=128, in_feat=128, out_feat=128): + def _run(env, remote): + if device == "vta": + target = env.target + if env.TARGET != "sim": + assert tvm.module.enabled("rpc") + program_fpga(remote, bitstream=None) + reconfig_runtime(remote) + elif device == "arm_cpu": + target = env.target_vta_cpu + with autotvm.tophub.context(target): # load pre-tuned schedule parameters + run_gemm(env, remote, target, batch, in_feat, out_feat) + vta.testing.run(_run) + +if __name__ == "__main__": + test_gemm("vta", 1, 16, 16) From d16f91cbf8750f31c380ea957dc6790675464d79 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:24:55 -0700 Subject: [PATCH 057/126] getting rid of kwargs usage --- python/tvm/relay/op/nn/_nn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e796995d5b42..3778a56aa9ca 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -56,7 +56,7 @@ def compute_dense(attrs, inputs, out_type, target): """Compute definition of dense""" out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)] + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] @reg.register_schedule("nn.dense") @@ -124,16 +124,16 @@ def compute_conv2d(attrs, inputs, out_type, target): get_const_int(inputs[1].shape[0]) == groups and \ get_const_int(inputs[1].shape[1]) == 1: out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) elif layout == "NHWC" and \ kernel_layout == "HWOI" and\ get_const_int(inputs[1].shape[2]) == groups and \ get_const_int(inputs[1].shape[3]) == 1: out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, 
dilation, out_dtype) elif layout in ['NCHW', 'NCHW4c']: out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype=out_dtype) + out_dtype) else: raise ValueError("not support arbitrary group number for now") return [out] From 5e8173289d521d73686a9c1ea1f6d777aea5d9de Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:27:07 -0700 Subject: [PATCH 058/126] registration of dense definition and schedule for vta --- vta/python/vta/top/op.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 5d6cfadc34f4..abb529dbe7f1 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -75,7 +75,7 @@ def schedule_conv2d(attrs, outs, target): elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) else: - raise RuntimeError("not support target %s" % target) + raise RuntimeError("Target %s is not supported" % target) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) @@ -86,11 +86,26 @@ def compute_dense(attrs, inputs, out_type, target): """Compute definition of dense""" out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + if inputs[0].shape == 4: # this implies the layout is packed + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) -def schedule_dense(attrs, outputs, target): +def schedule_dense(attrs, outs, target): """Schedule definition of dense""" - with target: - return topi.generic.schedule_dense(outputs) + + if outs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + if target.device_name == "vta": + return topi.generic.schedule_dense(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("Target %s is not supported" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) From 52880c932735d8483c51234efe8bd5a970b5b26f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:27:35 -0700 Subject: [PATCH 059/126] error reporting --- vta/python/vta/top/vta_dense.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 69f96f1aba29..63fed639ca8c 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -40,8 +40,9 @@ def _declaration_dense(cfg, """Dense function declaration.""" # Make sure that the dense operator is packed - assert len(data.shape) == 4 - assert len(weight.shape) == 4 + if len(data.shape) != 4 or len(weight.shape) != 4: + raise topi.InvalidShapeError() + # Derive output shape oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) From d0b2ade79a6ff987304a73168c29e386efb41ad1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:28:22 -0700 Subject: [PATCH 060/126] dense support --- vta/scripts/tune_resnet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 463187f3caa3..46cbffc86a35 100644 
--- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -82,9 +82,12 @@ def _topi_nn_dense(*args, **kwargs): with tvm.target.vta(): res = topi.nn.dense(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_conv2d_nchw([res]) + s = topi.generic.schedule_dense([res]) else: s = tvm.create_schedule([res.op]) From 5e100b5754c06c49cb983769f967ffda12db4ecf Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:29:57 -0700 Subject: [PATCH 061/126] remove use of kwargs --- vta/tests/python/integration/test_benchmark_topi_dense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 6759cc19b292..656a939302f9 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -72,7 +72,7 @@ def run_gemm(env, remote, target, # Define base computation schedule with target: res = topi.nn.dense( - data, kernel, None, env.acc_dtype) + data, kernel, out_dtype=env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) From 28b976fd69cf2ceb53292fabd11c5afa2a892a9f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 16:50:44 -0700 Subject: [PATCH 062/126] update dense schedule --- vta/python/vta/top/vta_dense.py | 36 +++++++++---------- .../integration/test_benchmark_topi_dense.py | 6 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 63fed639ca8c..3c4b0e009e03 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -57,7 +57,7 @@ def _declaration_dense(cfg, data[b_o, k_o, b_i, k_i].astype(out_dtype) * weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), - name="res", tag="packed_dense") + name="res", tag="dense") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * data.shape[1] * data.shape[3]) @@ -88,7 +88,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "packed_dense" + assert op.tag == "dense" dense_res.append(op) _traverse(output.op) @@ -100,17 +100,17 @@ def _traverse(op): b, co, _, _ = s[dense_stage].op.axis ci, _ = s[dense_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) ###### space definition end ###### - data, kernel = dense_stage.op.input_tensors + data, weight = dense_stage.op.input_tensors env = get_env() cdata = s.cache_read(data, env.inp_scope, [dense_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [dense_stage]) + cweight = s.cache_read(weight, env.wgt_scope, [dense_stage]) s[dense_stage].set_scope(env.acc_scope) # cache read input @@ -127,12 +127,12 @@ def _traverse(op): for op in const_ops: s[op].compute_inline() - # tile - x_bo, x_co, x_bi, x_ci = s[output].op.axis - x_bo0, x_bo1 = cfg['tile_b'].apply(s, output, x_bo) - x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) - s[output].reorder(x_bo0, x_co0, x_bo1, x_co1, x_bi, x_ci) - store_pt = x_co0 + # apply tiling for SRAM reuse + x_b, x_c, _, _ = s[output].op.axis + x_bo, x_bi = cfg['tile_b'].apply(s, output, 
x_b) + x_co, x_ci = cfg['tile_co'].apply(s, output, x_c) + s[output].reorder(x_bo, x_co, x_bi, x_ci) + store_pt = x_co # set all compute scopes s[dense_stage].compute_at(s[output], store_pt) @@ -145,22 +145,22 @@ def _traverse(op): # virtual threading along output channel axes if cfg['oc_nthread'].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) + _, v_t = s[output].split(x_co, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) - x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis - k_o, k_i = s[dense_stage].op.reduce_axis - s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) + x_bo, x_co, x_bi, _ = s[dense_stage].op.axis + k_o, _ = s[dense_stage].op.reduce_axis + s[dense_stage].reorder(x_bo, k_o, x_co) k_o, _ = cfg['tile_ci'].apply(s, dense_stage, k_o) s[cdata].compute_at(s[dense_stage], k_o) - s[ckernel].compute_at(s[dense_stage], k_o) + s[cweight].compute_at(s[dense_stage], k_o) # Use VTA instructions s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[cweight].pragma(s[cweight].op.axis[0], env.dma_copy) s[dense_stage].tensorize(x_bi, env.gemm) - s[output].pragma(x_co1, env.dma_copy) + s[output].pragma(x_ci, env.dma_copy) return s \ No newline at end of file diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 656a939302f9..12fbc45c1c4b 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -126,7 +126,7 @@ def get_ref_data(): data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + time_f = f.time_evaluator("dense", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} @@ -163,7 +163,7 @@ def get_ref_data(): device = "CPU" elif "vta" in target.keys: device = "VTA" - print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats @@ -182,4 +182,4 @@ def _run(env, remote): vta.testing.run(_run) if __name__ == "__main__": - test_gemm("vta", 1, 16, 16) + test_gemm("vta", 16, 512, 1008) From eafd93e8ac77c2b042a5081a77be80405b1b2d4b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 17 Jun 2019 19:29:08 -0700 Subject: [PATCH 063/126] fix API change from PR3353 --- vta/scripts/relay_to_vta.py | 4 ++-- vta/scripts/tune_resnet.py | 4 ++-- vta/tutorials/resnet.py | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index c71e6f61f37a..a0b8a5fa6998 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -101,7 +101,7 @@ def classification_demo(opt): # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -109,7 +109,7 @@ def classification_demo(opt): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, 
params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 46cbffc86a35..6f3a688074f8 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -102,7 +102,7 @@ def compile_network(opt, env, target): # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -110,7 +110,7 @@ def compile_network(opt, env, target): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 9caa6cdafbd6..af86e8a32c04 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -125,9 +125,7 @@ build_start = time.time() # Start front end compilation - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - print(relay_prog) - # exit() + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -135,7 +133,7 @@ # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": From a333a0747cba044035f691a209acaa425bd01220 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:41:13 -0700 Subject: [PATCH 064/126] fixing flop derivation bug --- vta/python/vta/top/vta_conv2d.py | 5 +++-- vta/python/vta/top/vta_dense.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 15d45029af82..e588a2ff0404 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -68,10 +68,11 @@ def _declaration_conv2d(cfg, pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), - name="res", tag="conv2d") + name="res", tag="conv2d_dense") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] * ishape[-1]) + return res @autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') @@ -97,7 +98,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "conv2d" + assert op.tag == "conv2d_dense" conv2d_res.append(op) _traverse(output.op) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 3c4b0e009e03..0b4d907853e4 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -43,24 +43,27 @@ def _declaration_dense(cfg, if len(data.shape) != 4 or len(weight.shape) != 4: raise topi.InvalidShapeError() - # Derive output shape + # Derive 
shapes + ishape = topi.util.get_const_tuple(data.shape) + wshape = topi.util.get_const_tuple(weight.shape) oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) # Reduction axes (input channel) - assert(int(data.shape[1]) == int(weight.shape[1])) - assert(int(data.shape[3]) == int(weight.shape[3])) - k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') - k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') + assert(ishape[1] == wshape[1]) + assert(ishape[3] == wshape[3]) + k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') + k_i = tvm.reduce_axis((0, ishape[3]), name='k_i') res = tvm.compute( oshape, lambda b_o, c_o, b_i, c_i: tvm.sum( data[b_o, k_o, b_i, k_i].astype(out_dtype) * weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), - name="res", tag="dense") + name="res", tag="dense_pack") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * - data.shape[1] * data.shape[3]) + ishape[1] * ishape[3]) + return res @autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') @@ -88,7 +91,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "dense" + assert op.tag == "dense_pack" dense_res.append(op) _traverse(output.op) From 1c4e950132d8ec0a58b0223223c86f347bcc5c79 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:41:24 -0700 Subject: [PATCH 065/126] dense operator tuning --- vta/scripts/tune_dense.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 vta/scripts/tune_dense.py diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py new file mode 100644 index 000000000000..5f38e1bb70f8 --- /dev/null +++ b/vta/scripts/tune_dense.py @@ -0,0 +1,87 @@ +"""Tuning a single conv2d operator""" +from collections import namedtuple +import logging +import os + +import tvm +from tvm import autotvm +from tvm.contrib.util import get_lower_ir +import topi +import vta +import vta.testing + +env = vta.get_env() + +Workload = namedtuple("DenseWorkload", + ['batch', 'in_filter', 'out_filter']) + +resnet_wkls = [ + # Workloads of resnet18 on imagenet + ('resnet-18.dense', Workload(16, 512, 1024)), +] + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def dense(N, CI, CO): + data_shape = (N//env.BATCH, CI//env.BLOCK_IN, env.BATCH, env.BLOCK_IN) + kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with tvm.target.vta(): + res = topi.nn.dense(data, kernel, None, 'int32') + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_dense([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [data, kernel, res] + +if __name__ == '__main__': + + # Logging config (for printing tuning log to the screen) + logging.basicConfig() + logging.getLogger('autotvm').setLevel(logging.DEBUG) + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = 
int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + for wl_name, wl in resnet_wkls: + + # Workload parameters + N = wl.batch + CI = wl.in_filter + CO = wl.out_filter + + task = autotvm.task.create(dense, args=(N, CI, CO), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + measure_option = autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) + + tuner = autotvm.tuner.RandomTuner(task) + tuner.tune(n_trial=len(task.config_space), + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + + print("\nBest tuner config:") + print(tuner.best_config) From af5cfd441ee709c8aa8a9eaac97b8299e2616b04 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:50:56 -0700 Subject: [PATCH 066/126] tuning conv2d only --- vta/scripts/tune_resnet.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 6f3a688074f8..c715a3883add 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -73,7 +73,6 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] - @autotvm.task.register("topi_nn_dense", override=True) def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" @@ -228,8 +227,7 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d, - tvm.relay.op.nn.dense), + ops=(tvm.relay.op.nn.conv2d,), target=target, target_host=env.target_host) From a04a3cb2c373a7e0ce3eef5d265f8a0f7a63145c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 17:52:18 -0700 Subject: [PATCH 067/126] skip dense layer in quant, cleanup --- python/tvm/relay/quantize/_annotate.py | 20 +++++--------------- python/tvm/relay/quantize/quantize.py | 18 ++++++++++++++++++ src/relay/pass/quantize.cc | 2 ++ src/relay/pass/quantize.h | 4 ++++ vta/tutorials/resnet.py | 2 +- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 799b553a702c..8edc690daa29 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -171,9 +171,6 @@ def conv2d_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - # print('conv2d lhs kind: {0}'.format(lhs_kind)) - # print('conv2d lhs: \n{0}'.format(lhs_expr)) - # print('\n\n\n') if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) @@ -181,6 +178,7 @@ def conv2d_rewrite(ref_call, new_args, ctx): rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -200,6 +198,8 @@ def dense_rewrite(ref_call, new_args, ctx): if check_to_skip(): return None + _set_dense_counter(cnt + 1) + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -243,8 +243,6 @@ def add_rewrite(ref_call, new_args, 
ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - # print('add lhs kind: {0}'.format(lhs_kind)) - # print('add rhs kind: {0}'.format(rhs_kind)) if lhs_kind is None and rhs_kind is None: return None @@ -254,7 +252,6 @@ def add_rewrite(ref_call, new_args, ctx): assert rhs_kind == QAnnotateKind.INPUT lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - # print('execute add with INPUT') return QAnnotateExpr(expr, QAnnotateKind.INPUT) if lhs_kind is not None and rhs_kind is None: @@ -272,12 +269,10 @@ def add_rewrite(ref_call, new_args, ctx): if lhs_kind is not None and rhs_kind is not None: if lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.INPUT: expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - # print('execute add with INPUT') return QAnnotateExpr(expr, QAnnotateKind.INPUT) if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -376,6 +371,7 @@ def _register(func): return _op.op._Register(op_name, "FQVtaRewrite", func, level) return _register(frewrite) if frewrite is not None else _register + @register_relay_node class QVtaExpr(_expr.TempExpr): def __init__(self, expr): @@ -391,8 +387,6 @@ def vta_expr_check(expr): return True, expr.expr return False, expr -# def _stop_fusion(expr): -# return _quantize.make_stop_fusion(expr) @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): @@ -402,7 +396,6 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): return None _set_conv_counter(cnt + 1) - data_cond, data = vta_expr_check(new_args[0]) kernel_cond, kernel = vta_expr_check(new_args[1]) @@ -412,6 +405,7 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): ret = _forward_op(ref_call, [data, kernel]) return QVtaExpr(ret) + def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: @@ -423,10 +417,6 @@ def identity_vta_rewrite(ref_call, new_args, ctx): register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) -# @register_vta_rewrite("nn.max_pool2d") -# def pool_vta_rewrite(ref_call, new_args, ctx): -# pass - @register_vta_rewrite("add") def add_vta_rewrite(ref_call, new_args, ctx): lhs_cond, lhs = vta_expr_check(new_args[0]) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index df6f8b9e139c..487bb1c3d47e 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -140,6 +140,10 @@ def qconfig(**kwargs): Specifying which layers to be skipped. Provide a list of indices that indicate which conv2d layers to leave untouched. + skip_dense_layers: list + Specifies which dense layers to avoid. Provide a list of indices + that indicate which conv2d layers to leave untouched. + round_for_shift: boolean Whether to add bias for rounding during shift. 
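[Editor's note] As a usage sketch for the dense-skipping option documented above, the qconfig call used in the VTA tutorial changes later in this same patch looks roughly like the following; `mod` and `params` are assumed to come from `relay.frontend.from_mxnet`, and the scale/skip values are illustrative only:

    from tvm import relay

    # Assumed: mod, params produced by relay.frontend.from_mxnet(gluon_model, shape_dict)
    # Leave the first conv2d and the first dense layer unquantized (illustrative values).
    with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1):
        relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params)
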
@@ -193,6 +197,20 @@ def annotate_context(): return AnnotateContext.Current +DENSE_COUNTER = 0 + + +def _dense_counter(): + """Get the global counter for dense.""" + return DENSE_COUNTER + + +def _set_dense_counter(n): + """Set the value of the global dense counter.""" + global DENSE_COUNTER + DENSE_COUNTER = n + + def calibrate(graph, mod=None, ctx=None): """The calibrate procedure will try to calculate the content of dom_scale, nbit, clip_min, clip_max for every `simulated_quantize` diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index cb64902d74f9..72e9da681f86 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -649,6 +649,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "nbit_activation=" << op->nbit_activation << ", "; p->stream << "global_scale=" << op->global_scale << ", "; p->stream << "skip_conv_layers==" << op->skip_conv_layers << ", "; + p->stream << "skip_k_dense==" << op->skip_k_dense << ", "; + p->stream << "skip_dense_layers==" << op->skip_dense_layers << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index fce98e54459c..318ebe57e2af 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -150,6 +150,8 @@ class QConfigNode : public Node { DataType dtype_activation = Int(32); double global_scale = 8.0; Array skip_conv_layers = Array(NodePtr(nullptr)); + int skip_k_dense = 0; + Array skip_dense_layers = Array(NodePtr(nullptr)); bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); @@ -163,6 +165,8 @@ class QConfigNode : public Node { v->Visit("dtype_activation", &dtype_activation); v->Visit("global_scale", &global_scale); v->Visit("skip_conv_layers", &skip_conv_layers); + v->Visit("skip_k_dense", &skip_k_dense); + v->Visit("skip_dense_layers", &skip_dense_layers); v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index af86e8a32c04..c58f5412d974 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -132,7 +132,7 @@ dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From db7462def68339f3bb82461b55d71faec5fa274f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 18:37:32 -0700 Subject: [PATCH 068/126] support for callable build func --- python/tvm/autotvm/measure/measure_methods.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index dcdd46728e3e..1ed990f394ba 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,10 +86,9 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - - # FIXME: right now we're circumventing the 
wrap_build_func - # self.build_func = _wrap_build_func(build_func) - self.build_func = build_func + self.build_func = _wrap_build_func(build_func) + else: + self.build_func = build_func self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From ae413e5fa54cf6838a7f91681aef7f13406bf439 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:27:48 -0700 Subject: [PATCH 069/126] multiprocessing bug fix --- python/tvm/autotvm/measure/measure_methods.py | 13 +--------- python/tvm/autotvm/task/nnvm_integration.py | 24 ++++++++++++++++--- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 1ed990f394ba..7ddc6cd9ea5f 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -224,18 +224,7 @@ def set_task(self, task): for x in arg_bufs] func = build(s, arg_bufs, "llvm") tvm_buf = [nd.array(x) for x in self.ref_input] - - def _run_func(): - """Run tvm function in a thread. - Because there is some issues with python multiprocessing and the thread pool in tvm - """ - func(*tvm_buf) - - thread = threading.Thread(target=_run_func) - thread.start() - thread.join() - del thread - + func(*tvm_buf) self.ref_output = [x.asnumpy() for x in tvm_buf] def get_build_kwargs(self): diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 251a310cf7aa..9c64761662d3 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -19,6 +19,7 @@ Decorator and utilities for the integration with TOPI and NNVM """ +import threading import warnings import logging @@ -90,8 +91,16 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host logger.disabled = True nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, - target_host=target_host, params=params) + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() logger.disabled = old_state @@ -169,7 +178,16 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, for graph, shape, dtype in zip(graphs, shapes, dtypes): nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() logger.disabled = old_state From 6e3e5b83a3abfeaf6f18e04ce4d538a136264ddb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:37:31 -0700 Subject: [PATCH 070/126] doc --- python/tvm/relay/quantize/quantize.py | 4 ++-- src/codegen/build_module.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 487bb1c3d47e..c127484f9b54 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -141,8 +141,8 @@ def qconfig(**kwargs): that indicate which conv2d layers to leave untouched. skip_dense_layers: list - Specifies which dense layers to avoid. 
Provide a list of indices - that indicate which conv2d layers to leave untouched. + Different way of specifying which dense layers to avoid. + Provide a list of indices that indicate which conv2d layers to leave untouched. round_for_shift: boolean Whether to add bias for rounding during shift. diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 488baa9bce46..6917200ff920 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -80,9 +80,9 @@ Target CreateTarget(const std::string& target_name, } } - // if (t->device_name.length() > 0) { - // t->keys_array.push_back(ir::StringImm::make(t->device_name)); - // } + if (t->device_name.length() > 0) { + t->keys_array.push_back(ir::StringImm::make(t->device_name)); + } t->device_type = kDLCPU; t->thread_warp_size = 1; if (target_name == "c" || target_name == "llvm") { From 432a2cc240fcfcc592f5fab0902c0d2788d9e1b1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:52:42 -0700 Subject: [PATCH 071/126] skip dense layer --- vta/scripts/relay_to_vta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index a0b8a5fa6998..6d2855a83a76 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -108,7 +108,7 @@ def classification_demo(opt): dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From 794ce529dc68c30384216e12f916aa605be65d55 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:58:25 -0700 Subject: [PATCH 072/126] cleanup --- package.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 package.sh diff --git a/package.sh b/package.sh deleted file mode 100755 index da227738637d..000000000000 --- a/package.sh +++ /dev/null @@ -1,6 +0,0 @@ -echo "Installing Dependencies ..." 
-echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 -sudo apt-get update -sudo apt-get install -y -q llvm-6.0 libtinfo-dev libffi-dev zlib1g-dev clinfo tree -sudo apt-get install verilator sbt From 80c4f6b209d17dce6393ce8ce2388b5df8e59e08 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 20:06:16 -0700 Subject: [PATCH 073/126] clean up --- src/relay/pass/quantize.cc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 72e9da681f86..c41ee6ac0935 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -111,14 +111,13 @@ TVM_REGISTER_API("relay._quantize.simulated_quantize") Expr QAnnotateExprNode::Realize() const { const auto& cfg = QConfig::Current(); - return expr; - // if (cfg->store_lowbit_output) { - // // store low bit output back for VTA - // const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); - // return (*f)(this->expr, static_cast(kQInput)); - // } else { - // return expr; - // } + if (cfg->store_lowbit_output) { + // store low bit output back for VTA + const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + return (*f)(this->expr, static_cast(kQInput)); + } else { + return expr; + } } QAnnotateExpr QAnnotateExprNode::make(Expr expr, QAnnotateKind kind) { From ab1f6cd956c9be385852b9617843f16e27f744fc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:18:52 -0700 Subject: [PATCH 074/126] this ensures that relay to vta compilation works for renset-18 --- .../test_benchmark_resnet18_relay.py} | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) rename vta/{scripts/relay_to_vta.py => tests/python/integration/test_benchmark_resnet18_relay.py} (92%) diff --git a/vta/scripts/relay_to_vta.py b/vta/tests/python/integration/test_benchmark_resnet18_relay.py similarity index 92% rename from vta/scripts/relay_to_vta.py rename to vta/tests/python/integration/test_benchmark_resnet18_relay.py index 6d2855a83a76..f9cfb5a34f2b 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/tests/python/integration/test_benchmark_resnet18_relay.py @@ -16,8 +16,8 @@ from vta.top import graph_pack -def classification_demo(opt): - """Image classification demo. +def classification_test(opt): + """ResNet-18 classification test. 
Parameters ---------- @@ -167,7 +167,18 @@ def classification_demo(opt): tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) top_categories = np.argsort(tvm_output.asnumpy()[0]) - # Report top-5 classification results + # This just checks that one of the 5 top categories + # is one variety of cat; this is by no means an accurate + # assessment of how quantization affects classification + # accuracy but is meant to catch changes to the quantization + # pass that would break basic correctness + cat_detected = False + for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True + assert(cat_detected) + + # Report latency and top-5 classification results std = np.std(tcost.results) * 1000 / env.BATCH mean = tcost.mean * 1000 / env.BATCH print("%s Prediction" % opt.model) @@ -197,4 +208,4 @@ def classification_demo(opt): opt = parser.parse_args() - classification_demo(opt) + classification_test(opt) From cce05daa65191a3f55c4f25bfd481f77e473c595 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:29:23 -0700 Subject: [PATCH 075/126] autotvm task extraction test for VTA --- .../test_autotvm_task_extraction.py | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 vta/tests/python/integration/test_autotvm_task_extraction.py diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py new file mode 100644 index 000000000000..995ea411bfbd --- /dev/null +++ b/vta/tests/python/integration/test_autotvm_task_extraction.py @@ -0,0 +1,188 @@ +"""Perform inference on VTA using Relay.""" + +import argparse, os, time +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import rpc, autotvm, relay +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.autotvm.task import extract_from_program + +def parse_arguments(): + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements during AutoTVM search') + parser.add_argument('--tuner', type=str, default="random", + help='AutoTVM search strategy') + parser.add_argument('--log-filename', type=str, default="resnet-18.log", + help='AutoTVM log file name') + + return parser.parse_args() + + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + 
const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + @autotvm.task.register("topi_nn_dense", override=True) + def _topi_nn_dense(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.dense(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_dense([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [A, W, res] + + +def compile_network(opt, env, target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + +if __name__ == '__main__': + + opt = parse_arguments() + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Read in VTA environment + env = vta.get_env() + + # Get remote from fleet node + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + # Get remote + if env.TARGET != "sim": + + # Measure build start time + reconfig_start = time.time() + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. 
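+        # For example, with a hypothetical path to a custom bitstream file:
+        #   vta.program_fpga(remote, bitstream="/path/to/custom_vta.bit")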
+ vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. + else: + remote = rpc.LocalSession() + + # VTA target and execution context + target = env.target if opt.device == "vta" else env.target_vta_cpu + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Compile Relay program + relay_prog, params = compile_network(opt, env, target) + + # Perform task extraction on Relay program + tasks = extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # Check that we have extracted the right number of tasks + assert opt.model == "resnet18_v1" and len(tasks) == 10 + + print("Task extraction passed!") From 4be3cbcfcbff4162d62408fd222933a0f8ae0bfe Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:37:15 -0700 Subject: [PATCH 076/126] adding headers --- vta/scripts/tune_conv2d.py | 18 +++++++++++++++ vta/scripts/tune_dense.py | 22 +++++++++++++++++-- vta/scripts/tune_resnet.py | 19 +++++++++++++++- vta/scripts/tune_resnet_nnvm.py | 19 ++++++++++++++++ .../test_autotvm_task_extraction.py | 17 ++++++++++++++ .../test_benchmark_resnet18_relay.py | 17 ++++++++++++++ 6 files changed, 109 insertions(+), 3 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 0113060a77da..f55c7e985716 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -1,4 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Tuning a single conv2d operator""" + from collections import namedtuple import logging import os diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py index 5f38e1bb70f8..237ca2754512 100644 --- a/vta/scripts/tune_dense.py +++ b/vta/scripts/tune_dense.py @@ -1,4 +1,22 @@ -"""Tuning a single conv2d operator""" +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tuning a single dense operator""" + from collections import namedtuple import logging import os @@ -81,7 +99,7 @@ def dense(N, CI, CO): tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=len(task.config_space), measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + callbacks=[autotvm.callback.log_to_file('dense.log')]) print("\nBest tuner config:") print(tuner.best_config) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index c715a3883add..e89de92af531 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,4 +1,21 @@ -"""Perform inference on VTA using Relay.""" +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Perform ResNet autoTVM tuning on VTA using Relay.""" import argparse, os, time from mxnet.gluon.model_zoo import vision diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index 3a6149df267c..22a4dd5dfc78 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -1,3 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Perform ResNet autoTVM tuning on VTA using NNVM.""" + import argparse import os import time diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py index 995ea411bfbd..e276b5c0672f 100644 --- a/vta/tests/python/integration/test_autotvm_task_extraction.py +++ b/vta/tests/python/integration/test_autotvm_task_extraction.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Perform inference on VTA using Relay.""" import argparse, os, time diff --git a/vta/tests/python/integration/test_benchmark_resnet18_relay.py b/vta/tests/python/integration/test_benchmark_resnet18_relay.py index f9cfb5a34f2b..ced6e9db3fc7 100644 --- a/vta/tests/python/integration/test_benchmark_resnet18_relay.py +++ b/vta/tests/python/integration/test_benchmark_resnet18_relay.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Perform inference on VTA using Relay.""" import argparse, json, os, requests, time From ab3069ed1daae5ff2ce60e051c323cc27ca4e1ff Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:54:54 -0700 Subject: [PATCH 077/126] missing headers --- vta/python/vta/top/bitpack.py | 17 +++++++++++++++++ vta/python/vta/top/graphpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_bitpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_graphpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_op.py | 17 +++++++++++++++++ 5 files changed, 85 insertions(+) diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 2265af4518b4..b39a96fa263a 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """Bit packing operators""" from __future__ import absolute_import as _abs diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 770dd380403d..650465b066d0 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """A Relay implementation of graph packing.""" from tvm import relay diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py index 7b09ffbf43c0..52b3fa7d9899 100644 --- a/vta/python/vta/top/nnvm_bitpack.py +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Bit packing operators""" from __future__ import absolute_import as _abs diff --git a/vta/python/vta/top/nnvm_graphpack.py b/vta/python/vta/top/nnvm_graphpack.py index 1f713acd3e27..427001ffa5ed 100644 --- a/vta/python/vta/top/nnvm_graphpack.py +++ b/vta/python/vta/top/nnvm_graphpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """An NNVM implementation of graph packing.""" import nnvm diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py index ce69b2b438d1..d9c2efb550f2 100644 --- a/vta/python/vta/top/nnvm_op.py +++ b/vta/python/vta/top/nnvm_op.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs From 67ae8d13b0c63c5c4ef82c4927a196020e6cffd0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 23:49:51 -0700 Subject: [PATCH 078/126] header --- vta/python/vta/top/op.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index abb529dbe7f1..dc4dd08c4c50 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs From 5c86609a1ae99f7b1c49cd61e342796750640ac9 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 00:08:54 -0700 Subject: [PATCH 079/126] rename test file --- .../{test_benchmark_resnet18_relay.py => test_resnet18.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vta/tests/python/integration/{test_benchmark_resnet18_relay.py => test_resnet18.py} (100%) diff --git a/vta/tests/python/integration/test_benchmark_resnet18_relay.py b/vta/tests/python/integration/test_resnet18.py similarity index 100% rename from vta/tests/python/integration/test_benchmark_resnet18_relay.py rename to vta/tests/python/integration/test_resnet18.py From 19f51fc6c7be89d58ad71f011f16b71f5102f8ce Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 00:25:38 -0700 Subject: [PATCH 080/126] lint fix --- python/tvm/autotvm/task/nnvm_integration.py | 4 ---- python/tvm/autotvm/task/relay_integration.py | 24 +++++++++----------- python/tvm/autotvm/task/topi_integration.py | 9 +++----- python/tvm/relay/quantize/_annotate.py | 12 ++++++---- src/relay/op/annotation/annotation.cc | 2 +- 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 9c64761662d3..d945abb054e2 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -24,11 +24,8 @@ import logging -from ... import target as _target - from .task import create from .topi_integration import TaskExtractEnv -from .dispatcher import ApplyHistoryBest logger = logging.getLogger('autotvm') @@ -203,4 +200,3 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, print("[Warning] Invalid shape during AutoTVM task creation") return tasks - diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index cb18653d8f37..ff55055b3c10 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -25,19 +25,17 @@ import logging -from ... import target as _target - from .task import create from .topi_integration import TaskExtractEnv logger = logging.getLogger('autotvm') -def my_build(func, - target, - target_host, - params): - """ VTA compatible relay build. +def _build(func, + target, + target_host, + params): + """ Helper to build VTA properly. """ from tvm import relay @@ -48,8 +46,8 @@ def my_build(func, import vta with vta.build_config(): return relay.build(func, target, target_host, params) - else: - return relay.build(func, target, target_host, params) + # default case + return relay.build(func, target, target_host, params) def extract_from_program(func, params, ops, target, target_host=None): """ Extract tuning tasks from a relay program. 
@@ -107,7 +105,7 @@ def extract_from_program(func, params, ops, target, target_host=None): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=my_build, + build_thread = threading.Thread(target=_build, args=(func, target, target_host, @@ -187,9 +185,9 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): # wrap build call in thread to avoid multiprocessing problems build_thread = threading.Thread(target=my_build, args=(func, - target, - target_host, - params)) + target, + target_host, + params)) build_thread.start() build_thread.join() diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index ed85504e4c0a..f41d7ee934c5 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -27,10 +27,7 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ -import warnings -import sys - -from ... import _api_internal, tensor, placeholder, create_schedule +from ... import _api_internal, tensor, placeholder from .task import args_to_workload, dispatcher, register from ..util import get_const_tuple @@ -148,8 +145,8 @@ def _tracing_wrapper(*args, **kwargs): return compute_func(*args, **kwargs) - self.func_to_reflection[topi_compute](_tracing_wrapper) - self.modified_funcs.append(topi_compute) + self.func_to_reflection[compute_func](_tracing_wrapper) + self.modified_funcs.append(compute_func) _local_scope(topi_compute) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 8edc690daa29..e98f45ef96b0 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-#pylint: disable=unused-argument +#pylint: disable=unused-argument,inconsistent-return-statements """Internal module for registering attribute for annotation.""" from __future__ import absolute_import import warnings @@ -329,8 +329,10 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) + @register_annotate_function("force_cast") def force_cast_rewrite(ref_call, new_args, ctx): + """Rewrite function to force cast""" if _conv_counter() <= current_qconfig().skip_k_conv: return None expr, x_kind = _get_expr_kind(new_args[0]) @@ -390,6 +392,7 @@ def vta_expr_check(expr): @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv2d for VTA target""" cnt = _conv_counter() if cnt < current_qconfig().skip_k_conv: _set_conv_counter(cnt + 1) @@ -410,8 +413,7 @@ def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: return QVtaExpr(_forward_op(ref_call, [expr])) - else: - return None + return None register_vta_rewrite("nn.relu", identity_vta_rewrite) register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) @@ -419,6 +421,7 @@ def identity_vta_rewrite(ref_call, new_args, ctx): @register_vta_rewrite("add") def add_vta_rewrite(ref_call, new_args, ctx): + """Rewrite function for ewise add for VTA target""" lhs_cond, lhs = vta_expr_check(new_args[0]) rhs_cond, rhs = vta_expr_check(new_args[1]) if lhs_cond and rhs_cond: @@ -427,5 +430,4 @@ def add_vta_rewrite(ref_call, new_args, ctx): return _forward_op(ref_call, [lhs, rhs]) elif lhs_cond and not rhs_cond: return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) - else: - return None + return None diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 789c85e39074..e6d41073e473 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -89,7 +89,7 @@ Expr ForceCast(Expr data) { } RELAY_REGISTER_OP("force_cast") -.describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" +.describe(R"code(Annotate an expression to force a cast.)code" TVM_ADD_FILELINE) .set_num_inputs(1) .add_argument("data", "Tensor", "The input data.") From 49689bcc5f90865c46254c83a373123e9db7094d Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:09:19 -0700 Subject: [PATCH 081/126] another set of lint fixes --- vta/python/vta/build_module.py | 12 ++++----- vta/python/vta/testing/util.py | 5 +++- vta/python/vta/top/bitpack.py | 3 ++- vta/python/vta/top/graphpack.py | 42 ++++++++++++++++++++---------- vta/python/vta/top/nnvm_bitpack.py | 5 ++-- vta/python/vta/top/op.py | 27 ++++++++++++------- vta/python/vta/top/vta_conv2d.py | 14 +++++----- vta/python/vta/top/vta_dense.py | 19 +++++++------- 8 files changed, 75 insertions(+), 52 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 91e3c4a7e0d8..71fc0d3283c6 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=unused-argument """VTA specific buildin for runtime.""" from __future__ import absolute_import as _abs @@ -129,8 +130,6 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError tic = time.time() - # simulator stats - stats = {} try: filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) target, task, config = measure_input @@ -143,7 +142,7 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): func = build(s, args, target_host=task.target_host) sim = build(s, args) - arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) + arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) func.export_library(filename) # When targeting VTA test the schedule on simulator first @@ -164,16 +163,15 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): f = remote.load_module(os.path.split(sim_path)[1]) ctx = remote.context(str(measure_input.target), 0) args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - simulator.clear_stats() + # Skip execution just to verify correctness simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) f(*args) - stats = simulator.stats() # check by local simulator ctx = tvm.context(str(target)) args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] sim(*args) - except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) + except Exception as ex: # pylint: disable=broad-except + return BuildResult(None, None, ex, time.time() - tic) return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index b009b7f27fd3..30760409733c 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -60,7 +60,10 @@ def run(run_func): pynq_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) # Run device from fleet node if env variables are defined if tracket_host and tracket_port: - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, + tracket_host, + tracket_port, + timeout=10000) run_func(env, remote) else: # Next, run on PYNQ if env variables are defined diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index b39a96fa263a..d4748faad6a7 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=ungrouped-imports """Bit packing operators""" from __future__ import absolute_import as _abs @@ -76,7 +77,7 @@ def _bitpack(*indices): @register_compute("bitpack", level=15) -def compute_bitpack(attrs, inputs, output_type, target): +def compute_bitpack(attrs, inputs): lanes = attrs.lanes dtype = inputs[0].dtype assert dtype == "int8" diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 650465b066d0..c8f39c87a9c6 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +# pylint: disable=unused-argument """A Relay implementation of graph packing.""" from tvm import relay @@ -54,8 +54,8 @@ def _pack_weight(data, dshape, cfactor): assert int(dshape[1]) % cfactor == 0 data = op.reshape(data, newshape=(int(dshape[0]) // cfactor, cfactor, - int(dshape[1]) // cfactor, cfactor, - int(dshape[2]), int(dshape[3]))) + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) data = op.transpose( data, axes=(0, 2, 4, 5, 1, 3)) return data @@ -92,8 +92,8 @@ def _pack_bias(data, dshape, dtype, bfactor, cfactor): # broadcast batch dimension to bfactor data = op.broadcast_to( - data, - shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) return data @@ -103,6 +103,8 @@ def _get_shape(node): return _to_shape(node.checked_type.shape) class ExprPack(ExprMutator): + """Visitor to perform graph packing on an AST. + """ def __init__(self, bfactor, cfactor, weight_bits): self.bfactor = bfactor self.cfactor = cfactor @@ -196,13 +198,22 @@ def visit_call(self, call): pass elif call.op == self.add and len(input_types[1].shape) == 3: data, bias = args - bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + bias = _pack_bias(bias, + _to_shape(input_types[1].shape), + input_types[1].dtype, + self.bfactor, + self.cfactor) return relay.Call(self.add, [data, bias]) elif self.start_pack and call.op == self.bias_add: data, bias = args - bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + bias = _pack_bias(bias, + _to_shape(input_types[1].shape), + input_types[1].dtype, + self.bfactor, + self.cfactor) return relay.Call(self.add, [data, bias]) - elif self.start_pack and call.op == op.op.get('cast') and input_types[0].dtype == 'int32': + elif self.start_pack and call.op == op.op.get('cast') and \ + input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) return relay.Call(op.op.get('copy'), [cast]) @@ -214,15 +225,18 @@ def visit_call(self, call): class BT(Exception): pass def get_subgraph(expr, start_name, stop_name): - "we assume stop_name only appear once for simplicity." - "this constraint will be lifted in the future." - "bitpack_start and bitpack_end is both inclusive" + """ We assume stop_name only appears once for simplicity. + This constraint will be lifted in the future. 
+ bitpack_start and bitpack_end are both inclusive + """ bitpack_start = op.op.get('bitpack_start') bitpack_end = op.op.get('bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) def recursion(anf, start_found, stop_found): if isinstance(anf, relay.expr.Function): - return relay.expr.Function(anf.params, recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) + return relay.expr.Function(anf.params, + recursion(anf.body, start_found, stop_found), + anf.ret_type, anf.type_params, anf.attrs) elif isinstance(anf, relay.expr.Let): value = anf.value if isinstance(value, relay.expr.Call): @@ -239,7 +253,8 @@ def recursion(anf, start_found, stop_found): assert not stop_found stop_found = True value = relay.expr.Call(bitpack_end, [value]) - return relay.expr.Let(anf.var, value, anf.body) # todo: check anf.body has no more stop_name beside that one + # todo: check anf.body has no more stop_name beside that one + return relay.expr.Let(anf.var, value, anf.body) else: assert start_found assert stop_found @@ -289,4 +304,3 @@ def graph_pack(expr, expr = packer.visit(expr) assert not packer.start_pack return relay.ir_pass.infer_type(expr) - diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py index 52b3fa7d9899..0dc241330339 100644 --- a/vta/python/vta/top/nnvm_bitpack.py +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +# pylint: disable=unused-argument """Bit packing operators""" from __future__ import absolute_import as _abs @@ -22,7 +22,6 @@ from topi import util from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn from nnvm.top.tensor import _fschedule_broadcast def bitpack(data, bits, pack_type="int8", name="bitpack"): @@ -84,4 +83,4 @@ def compute_bitpack(attrs, inputs, out): return bitpack(inputs[0], bits, dtype) reg.register_schedule("bitpack", _fschedule_broadcast) -reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file +reg.register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index dc4dd08c4c50..da3d7eb900ef 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -14,12 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +# pylint: disable=unused-argument, ungrouped-imports """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs -import logging - import tvm import topi @@ -68,9 +66,20 @@ def compute_conv2d(attrs, inputs, output_type, target): assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] - else: - return [topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype)] + return [topi.nn.conv2d(inputs[0], + inputs[1], + strides, + padding, + dilation, + layout, + out_dtype)] + return [topi.nn.group_conv2d_nchw(inputs[0], + inputs[1], + strides, + padding, + dilation, + groups, + out_dtype)] with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) @@ -87,12 +96,10 @@ def schedule_conv2d(attrs, outs, target): if target.device_name == "vta": if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) - else: - return topi.generic.schedule_group_conv2d_nchw(outs) + return topi.generic.schedule_group_conv2d_nchw(outs) elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("Target %s is not supported" % target) + raise RuntimeError("Target %s is not supported" % target) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index e588a2ff0404..c455f535d93c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -88,7 +88,7 @@ def _schedule_conv2d(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - if len(op.axis) == 0: + if not op.axis: const_ops.append(op) else: ewise_ops.append(op) @@ -107,13 +107,13 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, h, w, _, _ = s[conv2d_stage].op.axis - ci, _, _, _ = s[conv2d_stage].op.reduce_axis + b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis + c_i, _, _, _ = s[conv2d_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_h', h, num_outputs=2) - cfg.define_split('tile_w', w, num_outputs=2) - cfg.define_split('tile_ci', ci, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_h', x_i, num_outputs=2) + cfg.define_split('tile_w', x_j, num_outputs=2) + cfg.define_split('tile_ci', c_i, num_outputs=2) + cfg.define_split('tile_co', c_o, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) cfg.define_knob('h_nthread', [1, 2]) ###### space definition end ###### diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 0b4d907853e4..9d6c19c5af20 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=unused-argument """Dense operator declaration and schedule registration for VTA.""" import numpy as np @@ -49,8 +50,8 @@ def _declaration_dense(cfg, oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) # Reduction axes (input channel) - assert(ishape[1] == wshape[1]) - assert(ishape[3] == wshape[3]) + assert ishape[1] == wshape[1] + assert ishape[3] == wshape[3] k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') k_i = tvm.reduce_axis((0, ishape[3]), name='k_i') res = tvm.compute( @@ -69,7 +70,7 @@ def _declaration_dense(cfg, @autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') def _schedule_dense(cfg, outs): """Packed dense schedule.""" - + assert len(outs) == 1 output = outs[0] const_ops = [] @@ -81,7 +82,7 @@ def _schedule_dense(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - if len(op.axis) == 0: + if not op.axis: const_ops.append(op) else: ewise_ops.append(op) @@ -100,11 +101,11 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, _, _ = s[dense_stage].op.axis - ci, _ = s[dense_stage].op.reduce_axis + b, c_o, _, _ = s[dense_stage].op.axis + c_i, _ = s[dense_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_ci', ci, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_ci', c_i, num_outputs=2) + cfg.define_split('tile_co', c_o, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) ###### space definition end ###### @@ -166,4 +167,4 @@ def _traverse(op): s[dense_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_ci, env.dma_copy) - return s \ No newline at end of file + return s From e6f2187e35e336d58df0d629b5fd8dd1b9edc3cc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:14:24 -0700 Subject: [PATCH 082/126] lint fix --- vta/python/vta/build_module.py | 4 ++-- vta/python/vta/top/graphpack.py | 16 +++++++++------- vta/python/vta/top/nnvm_op.py | 10 +++++----- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 71fc0d3283c6..854dd4daf14a 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -172,6 +172,6 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] sim(*args) - except Exception as ex: # pylint: disable=broad-except - return BuildResult(None, None, ex, time.time() - tic) + except Exception as exc: # pylint: disable=broad-except + return BuildResult(None, None, exc, time.time() - tic) return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index c8f39c87a9c6..c6cc49748bac 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -70,8 +70,8 @@ def _pack_weight_conv2d_transpose(data, dshape, cfactor): assert dshape[1] % cfactor == 0 data = op.reshape(data, newshape=(dshape[0] // cfactor, cfactor, - dshape[1] // cfactor, cfactor, - dshape[2], dshape[3])) + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) data = op.transpose( data, axes=(2, 0, 4, 5, 3, 1)) return data @@ -227,15 +227,17 @@ class BT(Exception): def get_subgraph(expr, start_name, stop_name): """ We assume stop_name only appears once for simplicity. This constraint will be lifted in the future. 
- bitpack_start and bitpack_end are both inclusive + bitpack_start and bitpack_end are both inclusive. """ bitpack_start = op.op.get('bitpack_start') bitpack_end = op.op.get('bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) - def recursion(anf, start_found, stop_found): + def _recursion(anf, start_found, stop_found): + """ Helper to obtain the subgraph. + """ if isinstance(anf, relay.expr.Function): return relay.expr.Function(anf.params, - recursion(anf.body, start_found, stop_found), + _recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) elif isinstance(anf, relay.expr.Let): value = anf.value @@ -247,7 +249,7 @@ def recursion(anf, start_found, stop_found): elif value.op.name == stop_name: raise BT() try: - return relay.expr.Let(anf.var, value, recursion(anf.body, start_found, stop_found)) + return relay.expr.Let(anf.var, value, _recursion(anf.body, start_found, stop_found)) except BT: assert start_found assert not stop_found @@ -259,7 +261,7 @@ def recursion(anf, start_found, stop_found): assert start_found assert stop_found return anf - annotated = recursion(anf, False, False) + annotated = _recursion(anf, False, False) return relay.ir_pass.infer_type(relay.ir_pass.to_graph_normal_form(annotated)) def graph_pack(expr, diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py index d9c2efb550f2..a38b2172671b 100644 --- a/vta/python/vta/top/nnvm_op.py +++ b/vta/python/vta/top/nnvm_op.py @@ -92,9 +92,10 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) - else: - return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + return topi.nn.conv2d(inputs[0], inputs[1], strides, + padding, dilation, layout, out_dtype) + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, + padding, dilation, groups, out_dtype) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, out) @@ -110,8 +111,7 @@ def schedule_conv2d(attrs, outs, target): if target.device_name == "vta": if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) - else: - return topi.generic.schedule_group_conv2d_nchw(outs) + return topi.generic.schedule_group_conv2d_nchw(outs) elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) else: From 2a1b76ea081f4e781985e2c4295b8ac2e74b84ee Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:24:13 -0700 Subject: [PATCH 083/126] compiler warnings --- src/relay/pass/quantize.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index c41ee6ac0935..3d5802307af3 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,7 +388,6 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { - static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; From b0fdab0127680a5d4021b10d0b127fd09c5a7594 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 11:33:42 -0700 Subject: [PATCH 084/126] removing ci tests for now that require changes to the packages on the test machine --- .../test_autotvm_task_extraction.py | 205 ---------------- 
vta/tests/python/integration/test_resnet18.py | 228 ------------------ 2 files changed, 433 deletions(-) delete mode 100644 vta/tests/python/integration/test_autotvm_task_extraction.py delete mode 100644 vta/tests/python/integration/test_resnet18.py diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py deleted file mode 100644 index e276b5c0672f..000000000000 --- a/vta/tests/python/integration/test_autotvm_task_extraction.py +++ /dev/null @@ -1,205 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Perform inference on VTA using Relay.""" - -import argparse, os, time -from mxnet.gluon.model_zoo import vision -import numpy as np -from PIL import Image - -import topi -import tvm -from tvm import rpc, autotvm, relay -from tvm.autotvm.measure.measure_methods import request_remote -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -import vta -from vta.testing import simulator -from vta.top import graph_pack -from tvm.autotvm.task import extract_from_program - -def parse_arguments(): - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], - help='Select device target') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements during AutoTVM search') - parser.add_argument('--tuner', type=str, default="random", - help='AutoTVM search strategy') - parser.add_argument('--log-filename', type=str, default="resnet-18.log", - help='AutoTVM log file name') - - return parser.parse_args() - - -def register_vta_tuning_tasks(): - from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args - - @tvm.tag_scope(tag=topi.tag.ELEMWISE) - def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - - # init autotvm env to register VTA 
operator - TaskExtractEnv() - - @autotvm.task.register("topi_nn_conv2d", override=True) - def _topi_nn_conv2d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - - with tvm.target.vta(): - res = topi.nn.conv2d(*args, **kwargs) - res = topi.right_shift(res, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_conv2d_nchw([res]) - else: - s = tvm.create_schedule([res.op]) - return s, [A, W, res] - - @autotvm.task.register("topi_nn_dense", override=True) - def _topi_nn_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - - with tvm.target.vta(): - res = topi.nn.dense(*args, **kwargs) - res = topi.right_shift(res, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_dense([res]) - else: - s = tvm.create_schedule([res.op]) - - return s, [A, W, res] - - -def compile_network(opt, env, target): - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - return relay_prog, params - -if __name__ == '__main__': - - opt = parse_arguments() - - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - - # Read in VTA environment - env = vta.get_env() - - # Get remote from fleet node - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # Get remote - if env.TARGET != "sim": - - # Measure build start time - reconfig_start = time.time() - - # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - - # In simulation mode, host the RPC server locally. 
- else: - remote = rpc.LocalSession() - - # VTA target and execution context - target = env.target if opt.device == "vta" else env.target_vta_cpu - ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) - - # Register VTA tuning tasks - register_vta_tuning_tasks() - - # Compile Relay program - relay_prog, params = compile_network(opt, env, target) - - # Perform task extraction on Relay program - tasks = extract_from_program(func=relay_prog, - params=params, - ops=(tvm.relay.op.nn.conv2d,), - target=target, - target_host=env.target_host) - - # Check that we have extracted the right number of tasks - assert opt.model == "resnet18_v1" and len(tasks) == 10 - - print("Task extraction passed!") diff --git a/vta/tests/python/integration/test_resnet18.py b/vta/tests/python/integration/test_resnet18.py deleted file mode 100644 index ced6e9db3fc7..000000000000 --- a/vta/tests/python/integration/test_resnet18.py +++ /dev/null @@ -1,228 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Perform inference on VTA using Relay.""" - -import argparse, json, os, requests, time -from io import BytesIO -from mxnet.gluon.model_zoo import vision -import numpy as np -from os.path import join, isfile -from PIL import Image - -import tvm -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -import vta -from vta.testing import simulator -from vta.top import graph_pack - - -def classification_test(opt): - """ResNet-18 classification test. 
- - Parameters - ---------- - opt: a dictionary obtained from argparse - """ - - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - - # Read in VTA environment - env = vta.get_env() - - # Download ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - - # Download test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - response = requests.get(image_url) - - # Prepare test image for inference - image = Image.open(BytesIO(response.content)).resize((224, 224)) - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - image = np.repeat(image, env.BATCH, axis=0) - - # For tuning, make sure tracker variables are set - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # We configure both the bitstream and the runtime system on the Pynq - # to match the VTA configuration specified by the vta_config.json file. - if env.TARGET != "sim": - - # Measure build start time - reconfig_start = time.time() - - # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - - # In simulation mode, host the RPC server locally. 
- else: - remote = rpc.LocalSession() - - # Create a TVM target and execution context - target = env.target if opt.device == "vta" else env.target_vta_cpu - ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) - - # Get tophub schedules - with autotvm.tophub.context(target): - - # Measure build start time - build_start = time.time() - - # Derive the LLVM compiler flags - # When targetting the Pynq/Ultra-96, cross-compile to ARM ISA - target_host = env.target_host - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): - relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=target_host) - else: - with vta.build_config(): - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Send the inference library over to the remote RPC server - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) - remote.upload(temp.relpath("graphlib.o")) - lib = remote.load_module("graphlib.o") - - # If detailed runtime info is needed build with debug runtime - if opt.debug_profile: - m = debug_runtime.create(graph, lib, ctx) - else: - m = graph_runtime.create(graph, lib, ctx) - - # Set the network parameters and inputs - m.set_input(**params) - m.set_input('data', image) - - # Perform inference - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - - # Display profile information - if opt.debug_profile: - m.run() - - # Get classification results - tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) - top_categories = np.argsort(tvm_output.asnumpy()[0]) - - # This just checks that one of the 5 top categories - # is one variety of cat; this is by no means an accurate - # assessment of how quantization affects classification - # accuracy but is meant to catch changes to the quantization - # pass that would break basic correctness - cat_detected = False - for k in top_categories[-5:]: - if "cat" in synset[k]: - cat_detected = True - assert(cat_detected) - - # Report latency and top-5 classification results - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("%s Prediction" % opt.model) - print(" #1:", synset[top_categories[-1]]) - 
print(" #2:", synset[top_categories[-2]]) - print(" #3:", synset[top_categories[-3]]) - print(" #4:", synset[top_categories[-4]]) - print(" #5:", synset[top_categories[-5]]) - print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], - help='Select device target') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - - opt = parser.parse_args() - - classification_test(opt) From a6ffab3a2ef188a3e235cf4c9ef05c941ba928f7 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 12:21:40 -0700 Subject: [PATCH 085/126] ci fix due to TaskExtractEnv API change --- .../graph_tuner/utils/traverse_graph.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 08f1017e7fb8..dfdbfe31e5e3 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -65,18 +65,19 @@ def expr2graph(expr, target_ops, node_dict, node_list): % op_name) topi_funcs += OP2COMPUTE[op_name] env.reset(topi_funcs) - _expr2graph_impl(expr, target_ops, node_dict, node_list) - task_pos = 0 - for node_entry in node_list: - if node_entry["op"] in target_ops: - task_name, args = env.task_collection[task_pos] - task = autotvm.task.create(task_name, args, - target="llvm", - target_host=None, - template_key='direct') - node_entry["workloads"] = [task.workload] - node_entry["topi_op"] = [task_name] - task_pos += 1 + with env: + _expr2graph_impl(expr, target_ops, node_dict, node_list) + task_pos = 0 + for node_entry in node_list: + if node_entry["op"] in target_ops: + task_name, args = env.task_collection[task_pos] + task = autotvm.task.create(task_name, args, + target="llvm", + target_host=None, + template_key='direct') + node_entry["workloads"] = [task.workload] + node_entry["topi_op"] = [task_name] + task_pos += 1 def _expr2graph_impl(expr, target_ops, node_dict, node_list): From 30e8ad0db4b766e8d6f3a52258f8e66eeeb43892 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 12:28:33 -0700 Subject: [PATCH 086/126] lint fix --- python/tvm/autotvm/graph_tuner/utils/traverse_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index dfdbfe31e5e3..c0debaedede0 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -72,9 +72,9 @@ def expr2graph(expr, target_ops, node_dict, node_list): if node_entry["op"] in target_ops: task_name, args = env.task_collection[task_pos] task = autotvm.task.create(task_name, args, - target="llvm", - target_host=None, - 
template_key='direct') + target="llvm", + target_host=None, + template_key='direct') node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 From 07eb36e239b8c4508ed7c3ef1298b0665997c5d0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:33:22 -0700 Subject: [PATCH 087/126] reorganize vta tutorial page; added more comments to e2e resnet --- docs/conf.py | 4 +- vta/tutorials/README.txt | 1 + vta/tutorials/frontend/README.txt | 4 ++ .../deploy_resnet_on_vta.py} | 68 +++++++++++++++---- .../{ => optimize}/convolution_opt.py | 0 .../{ => optimize}/matrix_multiply_opt.py | 0 6 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 vta/tutorials/frontend/README.txt rename vta/tutorials/{resnet.py => frontend/deploy_resnet_on_vta.py} (75%) rename vta/tutorials/{ => optimize}/convolution_opt.py (100%) rename vta/tutorials/{ => optimize}/matrix_multiply_opt.py (100%) diff --git a/docs/conf.py b/docs/conf.py index a1b66325a527..d9eea045a97b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -215,7 +215,9 @@ def run_doxygen(folder): '../tutorials/autotvm', '../tutorials/dev', '../tutorials/topi', - '../tutorials/deployment']) + '../tutorials/deployment', + '../vta/tutorials/frontend', + '../vta/tutorials/optimize']) def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" diff --git a/vta/tutorials/README.txt b/vta/tutorials/README.txt index 1ba48b0b1fad..3d3858b111ba 100644 --- a/vta/tutorials/README.txt +++ b/vta/tutorials/README.txt @@ -1,2 +1,3 @@ VTA Tutorials ============= +This page contains tutorials about VTA and how to use TVM/Relay to target VTA. diff --git a/vta/tutorials/frontend/README.txt b/vta/tutorials/frontend/README.txt new file mode 100644 index 000000000000..319506d21f8f --- /dev/null +++ b/vta/tutorials/frontend/README.txt @@ -0,0 +1,4 @@ +.. _tutorial-frontend: + +Compile Deep Learning Models +---------------------------- diff --git a/vta/tutorials/resnet.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py similarity index 75% rename from vta/tutorials/resnet.py rename to vta/tutorials/frontend/deploy_resnet_on_vta.py index c58f5412d974..e2b536b798ad 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -15,20 +15,28 @@ # specific language governing permissions and limitations # under the License. """ -ResNet Inference Example -======================== +Deploy Pretrained ResNet Model from MxNet on VTA +================================================ **Author**: `Thierry Moreau `_ This tutorial provides an end-to-end demo, on how to run ResNet-18 inference onto the VTA accelerator design to perform ImageNet classification tasks. - +It showcases Relay as a front end compiler that can perform quantization (VTA +only supports int8/32 inference) as well as graph packing (in order to enable +tensorization in the core) to massage the compute graph for the hardware target. """ - ###################################################################### -# Import Libraries -# ---------------- -# We start by importing libraries to run this example. +# Install dependencies +# -------------------- +# To use the autotvm package in tvm, we need to install some extra dependencies. +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user mxnet requests pillow +# +# Now return to the python code. Import packages. 
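As a rough illustration of the graph packing mentioned in the tutorial introduction above (not part of the patch itself), the packed NCHW1n16c layout consumed by VTA's GEMM core can be reproduced with numpy, assuming the default configuration used elsewhere in this series (BATCH=1, BLOCK_IN=BLOCK_OUT=16):

import numpy as np

# A conv2d input in NCHW layout, e.g. (1, 64, 56, 56), is split along the batch
# and channel axes and transposed into the packed NCHW1n16c layout that shows up
# in the extracted conv2d tasks later in this series:
x = np.zeros((1, 64, 56, 56), dtype="int8")
x_packed = x.reshape(1, 1, 4, 16, 56, 56).transpose(0, 2, 4, 5, 1, 3)
assert x_packed.shape == (1, 4, 56, 56, 1, 16)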
from __future__ import absolute_import, print_function @@ -56,7 +64,7 @@ ###################################################################### # Define the platform and model targets -# ---------------- +# ------------------------------------- # Execute on CPU vs. VTA, and define the model. # Load VTA parameters from the vta/config/vta_config.json file @@ -68,6 +76,9 @@ target = env.target if device == "vta" else env.target_vta_cpu # Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. model = "resnet18_v1" start_pack="nn.max_pool2d" stop_pack="nn.global_avg_pool2d" @@ -80,9 +91,14 @@ if env.TARGET != "sim": - # Get remote from fleet node if environment variable is set + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. tracker_host = os.environ.get("TVM_TRACKER_HOST", None) tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) if not tracker_host or not tracker_port: @@ -107,9 +123,19 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the inference runtime -# ------------------------ -# Build ResNet from Gluon with Relay. +# Build the inference graph runtime +# --------------------------------- +# Grab ResNet-18 model from Gluon model zoo and compile with Relay. +# The compilation steps are: +# 1) Front end translation from MxNet into Relay module. +# 2) Apply 8-bit quantization: here we skip the first conv layer, +# and dense layer which will both be executed in fp32 on the CPU. +# 3) Perform graph packing to alter the data layout for tensorization. +# 4) Perform constant folding to reduce number of operators (e.g. eliminate +# batch norm multiply). +# 5) Perform relay build to object file. +# 6) Load the object file onto remote (FPGA device). +# 7) Generate graph runtime, `m`. # Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): @@ -174,8 +200,10 @@ ###################################################################### # Perform ResNet-18 inference -# ------------------------ +# --------------------------- # We run classification on an image sample from ImageNet +# We just need to download the categories files, `synset.txt` +# and an input test image. 
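A condensed sketch of steps 2 through 5 enumerated above, reusing the `mod`, `params`, `start_pack`, `stop_pack`, `env` and `target` names defined in this tutorial (an illustrative summary, not the file's exact content):

# Step 2: quantize to int8 in Relay
with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1):
    relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params)

# Steps 3 and 4: pack the graph for VTA tensorization, then fold constants
relay_prog = graph_pack(relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH,
                        start_name=start_pack, stop_name=stop_pack)
relay_prog = relay.ir_pass.fold_constant(relay_prog)

# Step 5: build to an object file for the VTA target
with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    with vta.build_config():
        graph, lib, params = relay.build(relay_prog, target=target,
                                         params=params, target_host=env.target_host)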
# Download ImageNet categories categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" @@ -201,7 +229,8 @@ m.set_input(**params) m.set_input('data', image) -# Perform inference +# Perform inference: we run the module 4 times, +# and repeat 3 times to get error bounds timer = m.module.time_evaluator("run", ctx, number=4, repeat=3) tcost = timer() @@ -219,3 +248,14 @@ print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + +# This just checks that one of the 5 top categories +# is one variety of cat; this is by no means an accurate +# assessment of how quantization affects classification +# accuracy but is meant to catch changes to the +# quantization pass that would accuracy in the CI. +cat_detected = False +for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True +assert(cat_detected) \ No newline at end of file diff --git a/vta/tutorials/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py similarity index 100% rename from vta/tutorials/convolution_opt.py rename to vta/tutorials/optimize/convolution_opt.py diff --git a/vta/tutorials/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py similarity index 100% rename from vta/tutorials/matrix_multiply_opt.py rename to vta/tutorials/optimize/matrix_multiply_opt.py From f18de91d5737d6405423ab9e44880c9a6cd94b2b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:34:16 -0700 Subject: [PATCH 088/126] missing readme file for sphynx gallery --- vta/tutorials/optimize/README.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 vta/tutorials/optimize/README.txt diff --git a/vta/tutorials/optimize/README.txt b/vta/tutorials/optimize/README.txt new file mode 100644 index 000000000000..b051548c5351 --- /dev/null +++ b/vta/tutorials/optimize/README.txt @@ -0,0 +1,2 @@ +Optimize Tensor Operators +------------------------- From 0985a2135bb6c5af68e3b2f25291b9bc9b0f5e34 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:45:10 -0700 Subject: [PATCH 089/126] ci fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index ff55055b3c10..e71c076e26d0 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -40,7 +40,7 @@ def _build(func, from tvm import relay - if "vta" in target.device_name: + if "vta" in str(target): with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name == "vta": import vta From 0d454d819e33cbf435c48436bf211467cb56df8d Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 14:38:41 -0700 Subject: [PATCH 090/126] quantization ci fix --- python/tvm/relay/quantize/quantize.py | 32 ++++++++++++------- vta/scripts/tune_resnet.py | 5 ++- .../frontend/deploy_resnet_on_vta.py | 5 ++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index c127484f9b54..4f3ff60a8c06 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -156,6 +156,9 @@ def qconfig(**kwargs): is None, which means will try to call all operartors' annotate rewrite function. + target_vta: boolean + Whether we are performing quantization for VTA. 
+ Returns ------- config: QConfig @@ -355,6 +358,8 @@ def quantize(graph, params=None, dataset=None): if params: graph = _bind_params(graph, params) + cfg = current_qconfig() + mod = _module.Module.from_expr(graph) # Perform "SimplifyInference", "FoldScaleAxis", "FoldConstant", and # "CanonicalizeOps" optimization before quantization. @@ -366,15 +371,20 @@ def quantize(graph, params=None, dataset=None): calibrate_pass = _transform.function_pass(calibrate, opt_level=1, name="QuantizeCalibrate") - quantize_seq = _transform.Sequential([annotate(), - calibrate_pass, - realize(), - _transform.FoldConstant()]) - with annotate_context(): - with _transform.PassContext(opt_level=3, - required_pass=["QuantizeAnnotate", - "QuantizeCalibrate", - "QuantizeRealize"]): - mod = optimize(mod) - mod = quantize_seq(mod) + # Quantize pass list + quant_passes = [annotate(), + calibrate_pass, + realize(), + _transform.FoldConstant()] + # Add rewrite_for_vta() pass if target is VTA + if cfg.target_vta: + quant_passes = [rewrite_for_vta()] + quant_passes + quantize_seq = _transform.Sequential(quant_passes) + with _transform.PassContext(opt_level=3, + required_pass=["QuantizeAnnotate", + "QuantizeCalibrate", + "QuantizeRealize"]): + mod = optimize(mod) + mod = quantize_seq(mod) + return mod[mod.entry_func.name_hint] diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index e89de92af531..5469b6b40b6e 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -125,7 +125,10 @@ def compile_network(opt, env, target): dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index e2b536b798ad..7b5e6b2e730e 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -158,7 +158,10 @@ dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From 655c0a55b4a68d132cd1b9bc2afac31fa819f9bc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 15:31:28 -0700 Subject: [PATCH 091/126] ci fix for nnvm task extraction --- nnvm/python/nnvm/top/nn.py | 2 +- python/tvm/autotvm/task/nnvm_integration.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 128f985bd6d2..521b7f4b1da0 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -78,7 +78,7 @@ def schedule_log_softmax(_, outs, target): def compute_dense(attrs, inputs, _): """Compute definition of dense""" if attrs.get_bool("use_bias"): - return topi.nn.dense(inputs[0], inputs[1], bias=inputs[2]) + return topi.nn.dense(inputs[0], inputs[1], inputs[2]) return topi.nn.dense(inputs[0], inputs[1]) @reg.register_schedule("dense") diff --git 
a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index d945abb054e2..e785394a7da3 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -30,7 +30,7 @@ logger = logging.getLogger('autotvm') -def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host=None): +def extract_from_graph(graph, shape, dtype, target, symbols, params=None, target_host=None): """ Extract tuning tasks from a nnvm graph. This function collects tuning tasks by building the graph diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index f41d7ee934c5..c816e67f6deb 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -209,7 +209,11 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) - data, weight, bias, _ = args + if len(args) > 2: + data, weight, bias = args[:2] + else: + data, weight = args + bias = None C = topi.nn.dense(*args, **kwargs) s = topi.generic.schedule_dense([C]) if bias is not None: From 32bb0d4e0684a325403dd09d1d7caea001862c11 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 15:48:06 -0700 Subject: [PATCH 092/126] bug fix --- python/tvm/autotvm/task/topi_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index c816e67f6deb..7ff8ec73e16e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -210,7 +210,7 @@ def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) if len(args) > 2: - data, weight, bias = args[:2] + data, weight, bias = args[:3] else: data, weight = args bias = None From a444f03b998340d91d490f10f71222589e25a51a Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:09:19 -0700 Subject: [PATCH 093/126] default case in operator override to prevent sphynx gallery issues --- vta/python/vta/top/op.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index da3d7eb900ef..063a8acdb303 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -80,9 +80,11 @@ def compute_conv2d(attrs, inputs, output_type, target): dilation, groups, out_dtype)] + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, output_type, target) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_conv2d(attrs, inputs, output_type, target) + return _nn.compute_conv2d(attrs, inputs, output_type, target) @reg.register_schedule("nn.conv2d", level=15) @@ -97,12 +99,14 @@ def schedule_conv2d(attrs, outs, target): if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) return topi.generic.schedule_group_conv2d_nchw(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) + # elif str(target).startswith("llvm"): + # return tvm.create_schedule([x.op for x in outs]) raise RuntimeError("Target %s is not supported" % target) + elif target.device_name == "vta": + with 
tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + return _nn.schedule_conv2d(attrs, outs, target) @reg.register_compute("nn.dense", level=15) @@ -112,10 +116,13 @@ def compute_dense(attrs, inputs, out_type, target): out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype if inputs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_dense(attrs, inputs, out_type, target) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_dense(attrs, inputs, out_type, target) + return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) @@ -126,10 +133,11 @@ def schedule_dense(attrs, outs, target): target = tvm.target.create(target) if target.device_name == "vta": return topi.generic.schedule_dense(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("Target %s is not supported" % target) + # elif str(target).startswith("llvm"): + # return tvm.create_schedule([x.op for x in outs]) + raise RuntimeError("Target %s is not supported" % target) + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) + return _nn.schedule_dense(attrs, outs, target) From 31beec6239f42fdbfa8922af8195ac2451167885 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:17:45 -0700 Subject: [PATCH 094/126] deprecating nnvm for VTA --- vta/python/vta/top/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index ee2b5ec21ef8..3b5132ebf0ef 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -2,9 +2,11 @@ from . import bitpack from .graphpack import graph_pack -from . import nnvm_bitpack -from .nnvm_graphpack import nnvm_graph_pack -from . import nnvm_op from . import op from . import vta_conv2d from . import vta_dense + +# NNVM is deprecated for VTA +# from . import nnvm_bitpack +# from .nnvm_graphpack import nnvm_graph_pack +# from . 
import nnvm_op From fa73537352dd7fb5dc278532b0caef3da59896a1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:18:30 -0700 Subject: [PATCH 095/126] refactoring --- vta/python/vta/top/op.py | 87 ++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 063a8acdb303..96eaa8fb9905 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -57,33 +57,35 @@ def compute_conv2d(attrs, inputs, output_type, target): layout = attrs.data_layout out_dtype = attrs.out_dtype - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - if groups == 1: - assert groups == 1 - env = get_env() - assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" - inputs = list(inputs) - assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], - inputs[1], - strides, - padding, - dilation, - layout, - out_dtype)] - return [topi.nn.group_conv2d_nchw(inputs[0], - inputs[1], - strides, - padding, - dilation, - groups, - out_dtype)] - elif target.device_name == "vta": + if target.device_name == "vta": + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" + inputs = list(inputs) + assert inputs[1].dtype == "int8" + return [topi.nn.conv2d(inputs[0], + inputs[1], + strides, + padding, + dilation, + layout, + out_dtype)] + return [topi.nn.group_conv2d_nchw(inputs[0], + inputs[1], + strides, + padding, + dilation, + groups, + out_dtype)] + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) + # If VTA is not the target, default to _nn def return _nn.compute_conv2d(attrs, inputs, output_type, target) @@ -93,19 +95,18 @@ def schedule_conv2d(attrs, outs, target): groups = attrs.groups layout = attrs.data_layout - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": + if target.device_name == "vta": + if is_packed_layout(layout): + target = tvm.target.create(target) + assert target.device_name == "vta" if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) return topi.generic.schedule_group_conv2d_nchw(outs) - # elif str(target).startswith("llvm"): - # return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("Target %s is not supported" % target) - elif target.device_name == "vta": + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + # If VTA is not the target, default to _nn def return _nn.schedule_conv2d(attrs, outs, target) @@ -115,29 +116,29 @@ def compute_dense(attrs, inputs, out_type, target): out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - if inputs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - elif target.device_name == "vta": + if target.device_name == "vta": + if inputs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + return [topi.nn.dense(inputs[0], 
inputs[1], None, out_dtype)] + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_dense(attrs, inputs, out_type, target) + # If VTA is not the target, default to _nn def return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) def schedule_dense(attrs, outs, target): """Schedule definition of dense""" - - if outs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - if target.device_name == "vta": + if target.device_name == "vta": + if outs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + assert target.device_name == "vta" return topi.generic.schedule_dense(outs) - # elif str(target).startswith("llvm"): - # return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("Target %s is not supported" % target) - elif target.device_name == "vta": + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) + # If VTA is not the target, default to _nn def return _nn.schedule_dense(attrs, outs, target) From 401baa75db97b65743a0964a307b174a8cc6f60f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:03:13 -0700 Subject: [PATCH 096/126] fix naming --- python/tvm/relay/quantize/_annotate.py | 12 ++++++------ src/relay/pass/quantize.cc | 14 +++++++------- src/relay/pass/quantize.h | 12 ++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index e98f45ef96b0..c625dc26bcb7 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -370,12 +370,12 @@ def concatenate_rewrite(ref_call, new_args, ctx): # register for vta stop fusion def register_vta_rewrite(op_name, frewrite=None, level=10): def _register(func): - return _op.op._Register(op_name, "FQVtaRewrite", func, level) + return _op.op._Register(op_name, "FQVTARewrite", func, level) return _register(frewrite) if frewrite is not None else _register @register_relay_node -class QVtaExpr(_expr.TempExpr): +class QVTAExpr(_expr.TempExpr): def __init__(self, expr): self.__init_handle_by_constructor__( _quantize.make_vta_expr, expr) @@ -385,7 +385,7 @@ def realize(self): def vta_expr_check(expr): - if isinstance(expr, QVtaExpr): + if isinstance(expr, QVTAExpr): return True, expr.expr return False, expr @@ -406,13 +406,13 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): if data_cond: data = new_args[0].realize() ret = _forward_op(ref_call, [data, kernel]) - return QVtaExpr(ret) + return QVTAExpr(ret) def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: - return QVtaExpr(_forward_op(ref_call, [expr])) + return QVTAExpr(_forward_op(ref_call, [expr])) return None register_vta_rewrite("nn.relu", identity_vta_rewrite) @@ -429,5 +429,5 @@ def add_vta_rewrite(ref_call, new_args, ctx): rhs = new_args[1].realize() return _forward_op(ref_call, [lhs, rhs]) elif lhs_cond and not rhs_cond: - return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) + return QVTAExpr(_forward_op(ref_call, [lhs, rhs])) return None diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 3d5802307af3..ebc127639287 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -704,7 +704,7 @@ Pass QuantizeRewriteForVTAPass() { runtime::TypedPackedFunc pass_func 
= [=](Function f, Module m, PassContext pc) { return Downcast( - ForwardRewrite(f, "FQVtaRewrite", nullptr, nullptr)); + ForwardRewrite(f, "FQVTARewrite", nullptr, nullptr)); }; return CreateFunctionPass(pass_func, 1, "QuantizeRewriteForVTA", {}); } @@ -716,20 +716,20 @@ TVM_REGISTER_API("relay._quantize.QuantizeRewriteForVTA") // Insert stop_fusion for vta. -Expr QVtaExprNode::Realize() const { +Expr QVTAExprNode::Realize() const { Expr ret = ForceCast(this->expr); return StopFusion(ret); } -QVtaExpr QVtaExprNode::make(Expr expr) { - auto rnode = make_node(); +QVTAExpr QVTAExprNode::make(Expr expr) { + auto rnode = make_node(); rnode->expr = expr; - return QVtaExpr(rnode); + return QVTAExpr(rnode); } TVM_REGISTER_API("relay._quantize.make_vta_expr") .set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = QVtaExprNode::make(args[0]); + *ret = QVTAExprNode::make(args[0]); }); TVM_REGISTER_API("relay._quantize.make_stop_fusion") @@ -739,7 +739,7 @@ TVM_REGISTER_API("relay._quantize.make_stop_fusion") TVM_REGISTER_API("relay._quantize.temp_expr_realize") .set_body_typed([] (const Expr& expr) { - const QVtaExprNode* n = expr.as(); + const QVTAExprNode* n = expr.as(); CHECK(n); return n->Realize(); }); diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index 318ebe57e2af..2699ccd09e57 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,11 +72,11 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); -class QVtaExpr; +class QVTAExpr; /*! * \brief TempExprNode used during annotate forward rewrite. */ -class QVtaExprNode : public TempExprNode { +class QVTAExprNode : public TempExprNode { public: /*! \brief The original expression */ Expr expr; @@ -85,15 +85,15 @@ class QVtaExprNode : public TempExprNode { v->Visit("expr", &expr); } - TVM_DLL static QVtaExpr make(Expr expr); + TVM_DLL static QVTAExpr make(Expr expr); Expr Realize() const final; - static constexpr const char* _type_key = "relay.QVtaExpr"; - TVM_DECLARE_NODE_TYPE_INFO(QVtaExprNode, TempExprNode); + static constexpr const char* _type_key = "relay.QVTAExpr"; + TVM_DECLARE_NODE_TYPE_INFO(QVTAExprNode, TempExprNode); }; -RELAY_DEFINE_NODE_REF(QVtaExpr, QVtaExprNode, TempExpr); +RELAY_DEFINE_NODE_REF(QVTAExpr, QVTAExprNode, TempExpr); /*! \brief TempExpr used during realize forward rewrite. 
*/ From f1b810eaf54a54a419adf50095342b60f82aba30 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:19:03 -0700 Subject: [PATCH 097/126] annotation ops --- python/tvm/relay/quantize/_annotate.py | 4 ++-- src/relay/op/annotation/annotation.cc | 14 +++++++------- src/relay/pass/fuse_ops.cc | 2 +- src/relay/pass/quantize.cc | 4 ++-- vta/python/vta/top/graphpack.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index c625dc26bcb7..0a549f568b85 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -309,7 +309,7 @@ def identity_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) -register_annotate_function("stop_fusion", identity_rewrite) +register_annotate_function("annotation.stop_fusion", identity_rewrite) def pool2d_rewrite(ref_call, new_args, ctx): @@ -330,7 +330,7 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) -@register_annotate_function("force_cast") +@register_annotate_function("annotation.force_cast") def force_cast_rewrite(ref_call, new_args, ctx): """Rewrite function to force cast""" if _conv_counter() <= current_qconfig().skip_k_conv: diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index e6d41073e473..4e37aab11357 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -58,16 +58,16 @@ RELAY_REGISTER_OP("on_device") ElemwiseArbitraryLayout); Expr StopFusion(Expr data) { - static const Op& op = Op::Get("stop_fusion"); + static const Op& op = Op::Get("annotation.stop_fusion"); return CallNode::make(op, {data}, Attrs{}, {}); } -TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") +TVM_REGISTER_API("relay.op.annotation._make.annotation.") .set_body_typed([](Expr data) { return StopFusion(data); }); -RELAY_REGISTER_OP("stop_fusion") +RELAY_REGISTER_OP("annotation.stop_fusion") .describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" TVM_ADD_FILELINE) .set_num_inputs(1) @@ -84,11 +84,11 @@ TVM_ADD_FILELINE) }); Expr ForceCast(Expr data) { - static const Op& op = Op::Get("force_cast"); + static const Op& op = Op::Get("annotation.force_cast"); return CallNode::make(op, {data}, Attrs{}, {}); } -RELAY_REGISTER_OP("force_cast") +RELAY_REGISTER_OP("annotation.force_cast") .describe(R"code(Annotate an expression to force a cast.)code" TVM_ADD_FILELINE) .set_num_inputs(1) @@ -105,7 +105,7 @@ TVM_ADD_FILELINE) }); -RELAY_REGISTER_OP("bitpack_start") +RELAY_REGISTER_OP("annotation.bitpack_start") .describe(R"code( Mark the start of bitpacking. )code" TVM_ADD_FILELINE) @@ -122,7 +122,7 @@ Mark the start of bitpacking. return {topi::identity(inputs[0])}; }); -RELAY_REGISTER_OP("bitpack_end") +RELAY_REGISTER_OP("annotation.bitpack_end") .describe(R"code( Mark the end of bitpacking. )code" TVM_ADD_FILELINE) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 9cd73171bfea..9f940e54953b 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -821,7 +821,7 @@ class FuseMutator : private ExprMutator { // Transform calls. 
Expr VisitExpr_(const CallNode* call) { - static const Op& stop_fusion = Op::Get("stop_fusion"); + static const Op& stop_fusion = Op::Get("annotation.stop_fusion"); if (call->op.as()) { // If it is a primitive op call // then we must have a group assignment for it already. diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index ebc127639287..2f23c7659b02 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -530,7 +530,7 @@ RELAY_REGISTER_OP("nn.relu") RELAY_REGISTER_OP("strided_slice") .set_attr("FQRealizeRewrite", IdentityRealize); -RELAY_REGISTER_OP("stop_fusion") +RELAY_REGISTER_OP("annotation.stop_fusion") .set_attr("FQRealizeRewrite", IdentityRealize); /* \brief for unary operators which requantize its input to dtype_nbit */ @@ -585,7 +585,7 @@ Expr ForceCastRealize(const Call& ref_call, return Expr(nullptr); } -RELAY_REGISTER_OP("force_cast") +RELAY_REGISTER_OP("annotation.force_cast") .set_attr("FQRealizeRewrite", ForceCastRealize); TVM_REGISTER_API("relay._quantize.realize") diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index c6cc49748bac..6f901833ea15 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -111,8 +111,8 @@ def __init__(self, bfactor, cfactor, weight_bits): self.weight_bits = weight_bits self.start_pack = False # Cache Operator the algorithm matches against. - self.bitpack_start = op.op.get('bitpack_start') - self.bitpack_end = op.op.get('bitpack_end') + self.bitpack_start = op.op.get('annotation.bitpack_start') + self.bitpack_end = op.op.get('annotation.bitpack_end') self.conv2d = op.op.get("nn.conv2d") self.conv2d_transpose = op.op.get("nn.conv2d_transpose") self.add = op.op.get("add") @@ -229,8 +229,8 @@ def get_subgraph(expr, start_name, stop_name): This constraint will be lifted in the future. bitpack_start and bitpack_end are both inclusive. """ - bitpack_start = op.op.get('bitpack_start') - bitpack_end = op.op.get('bitpack_end') + bitpack_start = op.op.get('annotation.bitpack_start') + bitpack_end = op.op.get('annotation.bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) def _recursion(anf, start_found, stop_found): """ Helper to obtain the subgraph. 
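Since this patch moves the VTA-related annotation operators under the "annotation." namespace, lookups elsewhere (as in the graphpack.py hunk above) change spelling accordingly; a minimal sketch of the new names, with the old ones in comments:

from tvm.relay import op

bitpack_start = op.op.get('annotation.bitpack_start')  # was 'bitpack_start'
bitpack_end = op.op.get('annotation.bitpack_end')      # was 'bitpack_end'
stop_fusion = op.op.get('annotation.stop_fusion')      # was 'stop_fusion'
force_cast = op.op.get('annotation.force_cast')        # was 'force_cast'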
From 51acba8d608ab84d282adacdf52a461d737e9245 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:20:48 -0700 Subject: [PATCH 098/126] typo fix --- src/relay/op/annotation/annotation.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 4e37aab11357..a5ade5bde304 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -62,7 +62,7 @@ Expr StopFusion(Expr data) { return CallNode::make(op, {data}, Attrs{}, {}); } -TVM_REGISTER_API("relay.op.annotation._make.annotation.") +TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") .set_body_typed([](Expr data) { return StopFusion(data); }); From 819e2d9b9599081cda5d0f1dc1a12c37ae7de551 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 20 Jun 2019 00:02:19 -0700 Subject: [PATCH 099/126] autoTVM tutorial for VTA --- docs/conf.py | 3 +- vta/scripts/tune_resnet.py | 1 - vta/tutorials/autotvm/README.txt | 3 + vta/tutorials/autotvm/tune_relay_vta.py | 458 ++++++++++++++++++++++++ 4 files changed, 463 insertions(+), 2 deletions(-) create mode 100644 vta/tutorials/autotvm/README.txt create mode 100644 vta/tutorials/autotvm/tune_relay_vta.py diff --git a/docs/conf.py b/docs/conf.py index d9eea045a97b..c4410e5864f9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -217,7 +217,8 @@ def run_doxygen(folder): '../tutorials/topi', '../tutorials/deployment', '../vta/tutorials/frontend', - '../vta/tutorials/optimize']) + '../vta/tutorials/optimize', + '../vta/tutorials/autotvm']) def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 5469b6b40b6e..1a7c74bee3f7 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -271,7 +271,6 @@ def tune_tasks(tasks, # Compile network print("Compiling network with best tuning parameters...") - # relay_prog, params = compile_network(opt, env, target) with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( diff --git a/vta/tutorials/autotvm/README.txt b/vta/tutorials/autotvm/README.txt new file mode 100644 index 000000000000..c511381dd57d --- /dev/null +++ b/vta/tutorials/autotvm/README.txt @@ -0,0 +1,3 @@ +Auto tuning +------------- + diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py new file mode 100644 index 000000000000..891c23d6d105 --- /dev/null +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -0,0 +1,458 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-tuning a convolutional network on VTA +========================================== +**Author**: `Lianmin Zheng `_, `Thierry Moreau `_ + +Auto-tuning for a specific accelerator design is critical for getting the best +performance for any given operator. This is a tutorial showcases how to tune a +whole convolutional network on VTA. + +The operator implementation for VTA in TVM is written in template form. +The template has many tunable knobs (tile factor, virtual threads, etc). +We will tune all convolution operators in the neural network. After tuning, +we produce a log file which stores the best schedule parameters for all tuned +operators. When the TVM compiler compiles these operators, it will query this +log file to get the best knob parameters. + +""" + +###################################################################### +# Install dependencies +# -------------------- +# To use the autotvm package in tvm, we need to install some extra dependencies. +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost tornado mxnet requests pillow +# +# To make TVM run faster during tuning, it is recommended to use cython +# as FFI of TVM. In the root directory of TVM, execute +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Import packages. + +import os +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner + +import vta +from vta.testing import simulator +from vta.top import graph_pack + +################################################################# +# Compile network +# --------------- +# Perform vta-specific compilation with Relay from a Gluon model + +def compile_network(env, target, model, start_pack, stop_pack): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + + +################################################################# +# Start RPC Tracker +# ----------------- +# TVM uses an RPC session to communicate with Pynq boards. +# During tuning, the tuner will send the generated code to the board and +# measure the speed of code on the board. +# +# To scale up tuning, TVM uses an RPC Tracker to manage multiple devices. 
+# The RPC Tracker is a centralized master node. We can register all devices to +# the tracker. For example, if we have 10 Pynq boards, we can register all of them +# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. +# +# To start an RPC tracker, run this command on the host machine. The tracker is +# required during the whole tuning process, so we need to open a new terminal for +# this command: +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 +# +# The expected output is: +# +# .. code-block:: bash +# +# INFO:RPCTracker:bind to 0.0.0.0:9190 + +################################################################# +# Register devices to RPC Tracker +# ----------------------------------- +# Now we can register our devices to the tracker. The first step is to +# build the TVM runtime for the Pynq devices. +# +# Follow `this section `_ +# to build the TVM runtime on the device. Then register the device to the tracker with: +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=pynq +# +# (replace :code:`[HOST_IP]` with the IP address of your host machine) +# +# After registering devices, we can confirm it by querying the rpc_tracker: +# +# .. code-block:: bash +# +# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 +# +# For example, if we have 6 Pynq boards and 11 Raspberry Pi 3B, +# the output can be +# +# .. code-block:: bash +# +# Queue Status +# ---------------------------------- +# key total free pending +# ---------------------------------- +# pynq 6 6 0 +# rpi3b 11 11 0 +# ---------------------------------- +# +# You can register multiple devices to the tracker to accelerate tuning. + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. +# Here we use an Pynq-Z1 board as an example. + +# Tracker host and port can be set by your environment +tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0') +tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +# Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +network = "resnet18_v1" +start_pack="nn.max_pool2d" +stop_pack="nn.global_avg_pool2d" + +# Tuning option +log_file = "%s.%s.log" % (device, network) +tuning_option = { + 'log_filename': log_file, + + 'tuner': 'random', + 'n_trial': 1000, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner( + env.TARGET, host=tracker_host, port=tracker_port, + number=5, + timeout=60, + check_correctness=True + ), + ), +} + +#################################################################### +# +# .. note:: How to set tuning options +# +# In general, the default values provided here work well. +# If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` +# to larger values, makes the tuning run for longer. 
+# If your device is under-powered or your conv2d operators are large, consider +# setting a longer timeout. +# + +################################################################### +# Begin Tuning +# ------------ +# Now we can extract tuning tasks from the network and begin tuning. +# Here, we provide a simple utility function to tune a list of tasks. +# This function is just an initial implementation which tunes them in sequential order. +# We will introduce a more sophisticated tuning scheduler in the future. +# +# Given that the tuning will be done on Pynq FPGA boards, make sure that +# the ```TARGET`` entry in the ``vta_config.json`` file is set to ``pynq``. + +# You can skip the implementation of this function for this tutorial. +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True): + + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'xgb_knob': + tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)), + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + + + +######################################################################## +# Register VTA-specific tuning tasks + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
+ +def tune_and_evaluate(tuning_opt): + + if env.TARGET != "sim": + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + # Reconfigure the JIT runtime and FPGA. + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + else: + # In simulation mode, host the RPC server locally. + remote = rpc.LocalSession() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Perform task extraction on Relay program + print("Extract tasks...") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) + tasks = autotvm.task.extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # We should have extracted 10 convolution tasks + assert len(tasks) == 10 + print("Extracted {} conv2d tasks:".format(len(tasks))) + for tsk in tasks: + print("\t{}".format(tsk)) + + # We do not run the tuning in our webpage server since it takes too long. + # Comment the following line to run it by yourself. + return + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[log_file]): + # Compile network + print("Compile...") + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + + # Export library + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Generate the graph runtime + m = graph_runtime.create(graph, lib, ctx) + + # upload parameters to device + ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + image = tvm.nd.array( + (np.random.uniform(size=(1, 3, 224, 224))).astype('float32')) + m.set_input(**params) + m.set_input('data', image) + + # evaluate + timer = m.module.time_evaluator("run", ctx, number=1, repeat=10) + tcost = timer() + prof_res = np.array(tcost.results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + +# Run the tuning and evaluate the results +tune_and_evaluate(tuning_option) + +###################################################################### +# Sample Output +# ------------- +# The tuning needs to compile many programs and extract feature from them. +# So a high performance CPU is recommended. +# One sample output is listed below. +# It takes about 2 hours on a 16T CPU, and 6 Pynq boards. +# +# .. code-block:: bash +# +# Extract tasks... +# Tuning... +# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done. +# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done. +# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done. +# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done. +# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done. +# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done. +# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done. 
+#   [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#   [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#   [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#   [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#   [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
+#   Compile...
+#   Upload...
+#   Evaluate inference time cost...
+#   Mean inference time (std dev): 162.59 ms (0.06 ms)
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto-tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then something has gone wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines at the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai

From ff20dc5e6de772fcccf2d0641f7b1fa3c6156d53 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Thu, 20 Jun 2019 09:54:04 -0700
Subject: [PATCH 100/126] bug fix and tweaking output

---
 vta/tutorials/autotvm/tune_relay_vta.py | 40 ++++++++++++++++---------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index 891c23d6d105..1f52bb4f62c1 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -384,22 +384,24 @@ def tune_and_evaluate(tuning_opt):
                         params=params, target_host=env.target_host)

         # Export library
+        print("Upload...")
         temp = util.tempdir()
         lib.save(temp.relpath("graphlib.o"))
         remote.upload(temp.relpath("graphlib.o"))
         lib = remote.load_module("graphlib.o")

         # Generate the graph runtime
+        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
         m = graph_runtime.create(graph, lib, ctx)

         # upload parameters to device
-        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
         image = tvm.nd.array(
             (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
         m.set_input(**params)
         m.set_input('data', image)

         # evaluate
+        print("Evaluate inference time cost...")
         timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
         tcost = timer()
         prof_res = np.array(tcost.results) * 1000  # convert to millisecond
@@ -420,23 +422,33 @@ def tune_and_evaluate(tuning_opt):
 # .. code-block:: bash
 #
 #   Extract tasks...
+# [Warning] Invalid shape during AutoTVM task creation +# Extracted 10 conv2d tasks: +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (4, 4, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (4, 4, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (8, 8, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (8, 8, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (16, 16, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (16, 16, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 7, 7, 1, 16), 'int8'), ('TENSOR', (32, 32, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 32, 7, 7, 1, 16, 'int8'), (32, 32, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) # Tuning... -# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done. -# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done. -# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done. 
-# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done. -# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done. -# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done. -# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done. -# [Task 8/12] Current/Best: 1.13/ 17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done. -# [Task 9/12] Current/Best: 14.45/ 22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done. -# [Task 10/12] Current/Best: 3.22/ 15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done. -# [Task 11/12] Current/Best: 11.03/ 32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done. -# [Task 12/12] Current/Best: 8.00/ 21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done. +# [Task 1/10] Current/Best: 0.72/ 23.24 GFLOPS | Progress: (480/1000) | 640.31 s Done. +# [Task 2/10] Current/Best: 0.00/ 27.69 GFLOPS | Progress: (576/1000) | 810.09 s Done. +# [Task 3/10] Current/Best: 0.00/ 22.97 GFLOPS | Progress: (1000/1000) | 1125.37 s Done. +# [Task 4/10] Current/Best: 0.00/ 31.26 GFLOPS | Progress: (1000/1000) | 1025.52 s Done. +# [Task 5/10] Current/Best: 0.00/ 15.15 GFLOPS | Progress: (1000/1000) | 1236.58 s Done. +# [Task 6/10] Current/Best: 0.00/ 22.74 GFLOPS | Progress: (1000/1000) | 906.60 s Done. +# [Task 7/10] Current/Best: 0.00/ 15.27 GFLOPS | Progress: (1000/1000) | 1056.25 s Done. +# [Task 8/10] Current/Best: 0.00/ 2.18 GFLOPS | Progress: (1000/1000) | 2275.29 s Done. +# [Task 9/10] Current/Best: 2.23/ 3.99 GFLOPS | Progress: (1000/1000) | 2527.25 s Done. +# [Task 10/10] Current/Best: 1.56/ 6.32 GFLOPS | Progress: (480/1000) | 1304.84 s Done. # Compile... # Upload... # Evaluate inference time cost... -# Mean inference time (std dev): 162.59 ms (0.06 ms) +# Mean inference time (std dev): 621.79 ms (0.14 ms) ###################################################################### # From 772a83745d81ff7693db25746e71ef6887efe6db Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 21 Jun 2019 02:12:20 -0700 Subject: [PATCH 101/126] addressing reviews --- python/tvm/autotvm/task/relay_integration.py | 1 + python/tvm/autotvm/task/topi_integration.py | 2 +- python/tvm/relay/quantize/quantize.py | 14 ++++++++------ vta/python/vta/build_module.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index e71c076e26d0..e5359c2f5d75 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -31,6 +31,7 @@ logger = logging.getLogger('autotvm') +# TODO(moreau89) find a more elegant way to build for VTAs def _build(func, target, target_host, diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 7ff8ec73e16e..bc434719c36a 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -107,7 +107,7 @@ def __init__(self, allow_duplicate=False): topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } - # support reflection for tracing + # function reflection for tracing self.func_to_reflection = { topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 4f3ff60a8c06..b08c8650f917 100644 --- a/python/tvm/relay/quantize/quantize.py +++ 
b/python/tvm/relay/quantize/quantize.py @@ -124,7 +124,9 @@ def current_qconfig(): """Get the current quantization configuration.""" return _quantize._GetCurrentQConfig() - +# TODO(tmoreau89, ZihengJiang) the skip parameters are +# hacky - we should explore a more future-proof way to +# skip operators based on pattern matching def qconfig(**kwargs): """Configure the quantization behavior by setting config variables. @@ -200,18 +202,18 @@ def annotate_context(): return AnnotateContext.Current -DENSE_COUNTER = 0 +_DENSE_COUNTER = 0 -def _dense_counter(): +def _dense_counter_(): """Get the global counter for dense.""" - return DENSE_COUNTER + return _DENSE_COUNTER def _set_dense_counter(n): """Set the value of the global dense counter.""" - global DENSE_COUNTER - DENSE_COUNTER = n + global _DENSE_COUNTER + _DENSE_COUNTER = n def calibrate(graph, mod=None, ctx=None): diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 854dd4daf14a..183a2f4a500d 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -119,7 +119,7 @@ def build(*args, **kwargs): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs) - +# TODO(tmoreau89) unify the build with the rest of the build modules def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): """Custom build func for VTA. Used for autotvm""" From a692d2f73508fc42892ec64050d23d7fe669b114 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 21 Jun 2019 02:16:57 -0700 Subject: [PATCH 102/126] fix --- python/tvm/relay/quantize/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index b08c8650f917..12b1a1be728b 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -205,7 +205,7 @@ def annotate_context(): _DENSE_COUNTER = 0 -def _dense_counter_(): +def _dense_counter(): """Get the global counter for dense.""" return _DENSE_COUNTER From b2d060a22e6857a64e4f53a5f49c7604d0da603d Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Sat, 22 Jun 2019 20:29:21 -0700 Subject: [PATCH 103/126] Update. --- python/tvm/relay/quantize/quantize.py | 5 +---- src/relay/pass/quantize.h | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 12b1a1be728b..3c2a8a10026f 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -360,8 +360,6 @@ def quantize(graph, params=None, dataset=None): if params: graph = _bind_params(graph, params) - cfg = current_qconfig() - mod = _module.Module.from_expr(graph) # Perform "SimplifyInference", "FoldScaleAxis", "FoldConstant", and # "CanonicalizeOps" optimization before quantization. @@ -378,8 +376,7 @@ def quantize(graph, params=None, dataset=None): calibrate_pass, realize(), _transform.FoldConstant()] - # Add rewrite_for_vta() pass if target is VTA - if cfg.target_vta: + if current_qconfig().store_lowbit_output: quant_passes = [rewrite_for_vta()] + quant_passes quantize_seq = _transform.Sequential(quant_passes) with _transform.PassContext(opt_level=3, diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index 2699ccd09e57..c20f0b606dea 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,9 +72,12 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); +/*! 
+ * \brief TempExpr used to insert `force_cast` for VTA. + */ class QVTAExpr; /*! - * \brief TempExprNode used during annotate forward rewrite. + * \brief TempExprNode used to insert `force_cast` for VTA. */ class QVTAExprNode : public TempExprNode { public: From 5215d62db1a3f4691f9e81edb9a622f1ae7af6b3 Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Sat, 22 Jun 2019 20:36:23 -0700 Subject: [PATCH 104/126] Update. --- vta/tutorials/frontend/deploy_resnet_on_vta.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index 7b5e6b2e730e..271630e69558 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -159,9 +159,7 @@ # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target @@ -261,4 +259,4 @@ for k in top_categories[-5:]: if "cat" in synset[k]: cat_detected = True -assert(cat_detected) \ No newline at end of file +assert(cat_detected) From 3cb83a615c599c013c82560fcbcb0bda080edb10 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 00:37:37 -0700 Subject: [PATCH 105/126] addressing comments --- python/tvm/relay/quantize/_annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 0a549f568b85..657fce54b42d 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -367,7 +367,7 @@ def concatenate_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) -# register for vta stop fusion +# Graph rewrite function registration for VTA target def register_vta_rewrite(op_name, frewrite=None, level=10): def _register(func): return _op.op._Register(op_name, "FQVTARewrite", func, level) From 42a447cb2fee260afc8cea7daa9e164ef664cb36 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 00:40:54 -0700 Subject: [PATCH 106/126] addressing more comments --- python/tvm/autotvm/task/relay_integration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index e5359c2f5d75..29190fa43324 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,12 +41,11 @@ def _build(func, from tvm import relay - if "vta" in str(target): + if target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name == "vta": - import vta - with vta.build_config(): - return relay.build(func, target, target_host, params) + import vta + with vta.build_config(): + return relay.build(func, target, target_host, params) # default case return relay.build(func, target, target_host, params) @@ -125,7 +124,7 @@ def extract_from_program(func, params, ops, target, target_host=None): template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid shape during AutoTVM task creation") + warnings.warn("Invalid shape during AutoTVM task creation") return tasks From 1c52ed18eb7469dc48cfd7fe162ef938ac227b87 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 
00:45:32 -0700 Subject: [PATCH 107/126] clean up --- python/tvm/autotvm/task/nnvm_integration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index e785394a7da3..9161822d173c 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -105,8 +105,6 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params=None, target tasks = [] for task_name, args in env.get_tasks(): try: - print(task_name) - print(args) tsk = create(task_name, args, target=target, target_host=target_host, template_key='direct') From 3e7aed38f0ed000b5ce6d4abc344d8e0207106cc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 01:15:14 -0700 Subject: [PATCH 108/126] comment --- python/tvm/autotvm/measure/measure_methods.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7ddc6cd9ea5f..fd8cbe0e2a0d 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,9 +86,7 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) - else: - self.build_func = build_func + self.build_func = _wrap_build_func(build_func) self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From 6f9037f1853451d99a8c43bc9528391b07ceb237 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 01:39:50 -0700 Subject: [PATCH 109/126] adding comment --- python/tvm/autotvm/measure/measure_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index fd8cbe0e2a0d..65f0c515bff2 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,7 +86,10 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) + self.build_func = _wrap_build_func(build_func) + else: + # If build_func is callable, bypass wrapper + self.build_func = build_func self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From b0d09c1c0f78cac588a544c3972e927e9d97d32b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 02:25:51 -0700 Subject: [PATCH 110/126] unify the AutoTVM builder --- python/tvm/autotvm/measure/measure_methods.py | 14 +++-- vta/python/vta/__init__.py | 2 +- vta/python/vta/build_module.py | 57 ------------------- vta/tutorials/autotvm/tune_relay_vta.py | 2 +- 4 files changed, 10 insertions(+), 65 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 65f0c515bff2..b2cf73f7dee1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,10 +86,7 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) - else: - # If build_func is callable, bypass wrapper - self.build_func = build_func + 
self.build_func = _wrap_build_func(build_func) self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() @@ -362,8 +359,13 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti if cuda_arch: set_cuda_target_arch(cuda_arch) - with build_config(**opts): - func = build(s, args, target_host=task.target_host) + if measure_input.target.device_name == 'vta': + # if target is vta, we need to use vta build + import vta + func = vta.build(s, args, target_host=task.target_host) + else: + with build_config(**opts): + func = build(s, args, target_host=task.target_host) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py index 75ecdbad4bc7..926d73649b31 100644 --- a/vta/python/vta/__init__.py +++ b/vta/python/vta/__init__.py @@ -18,5 +18,5 @@ # to maintain minimum dependency on the board if sys.argv[0] not in ("-c", "-m"): from . import top - from .build_module import build_config, lower, build, vta_autotvm_build_func + from .build_module import build_config, lower, build from . import graph diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 183a2f4a500d..a291c42e592b 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -118,60 +118,3 @@ def build(*args, **kwargs): with build_config(): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs) - -# TODO(tmoreau89) unify the build with the rest of the build modules -def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): - """Custom build func for VTA. Used for autotvm""" - - import time - import os - from random import getrandbits - from tvm.autotvm.util import get_const_tuple - from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError - - tic = time.time() - try: - filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - target, task, config = measure_input - - with target: - s, args = task.instantiate(config) - if not config.valid(): - raise InstantiationError(config.errors) - - func = build(s, args, target_host=task.target_host) - sim = build(s, args) - - arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) - func.export_library(filename) - - # When targeting VTA test the schedule on simulator first - # in order to catch runtime errors - if measure_input.target.device_name == 'vta': - from vta import reconfig_runtime - # Note: if you're not running the RPC locally, you cannot benefit - # from rumtime recompilation... 
- local_rpc_port = int(os.environ.get("VTA_LOCAL_SIM_RPC_PORT", "0")) - if local_rpc_port: - remote = rpc.connect("localhost", local_rpc_port) - reconfig_runtime(remote) - else: - remote = rpc.LocalSession() - sim_path = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - sim.export_library(sim_path) - remote.upload(sim_path) - f = remote.load_module(os.path.split(sim_path)[1]) - ctx = remote.context(str(measure_input.target), 0) - args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - # Skip execution just to verify correctness - simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) - f(*args) - - # check by local simulator - ctx = tvm.context(str(target)) - args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - sim(*args) - - except Exception as exc: # pylint: disable=broad-except - return BuildResult(None, None, exc, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 1f52bb4f62c1..9b91bdb13ba2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -209,7 +209,7 @@ def compile_network(env, target, model, start_pack, stop_pack): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + builder=autotvm.LocalBuilder(), runner=autotvm.RPCRunner( env.TARGET, host=tracker_host, port=tracker_port, number=5, From aa028590db3b3bfdf050ba7910eb413112bba958 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 02:29:06 -0700 Subject: [PATCH 111/126] lint fix --- vta/python/vta/build_module.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index a291c42e592b..dbd2e4b45fd6 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs import tvm -from tvm import rpc from . 
import ir_pass from .environment import get_env -from .testing import simulator def lift_coproc_scope(x): From 7afb87e8da4dade8503f8e4ee99c6beab3df6b62 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 12:23:04 -0700 Subject: [PATCH 112/126] bug fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 29190fa43324..79e521a3a9e5 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name == "vta": + if "vta" in str(target): with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From aee8f05bb35fb3f429958e5d25b5eaa593df7177 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 21:54:24 -0700 Subject: [PATCH 113/126] reflecting update on qconfig --- vta/scripts/tune_resnet.py | 4 +--- vta/tutorials/autotvm/tune_relay_vta.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 1a7c74bee3f7..21aa96cd350f 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -126,9 +126,7 @@ def compile_network(opt, env, target): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 9b91bdb13ba2..bdeb6c5d03e2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -90,9 +90,7 @@ def compile_network(env, target, model, start_pack, stop_pack): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From a25bcbf66a797215ba9767cb1b372b3ff18d47c1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:07:20 -0700 Subject: [PATCH 114/126] fixing incorrect target initialization --- tests/python/unittest/test_graph_tuner_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 240da7f88628..6e06b44b3ea9 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -117,7 +117,7 @@ def _create_data(target, dshape, dtype, layout): def test_graph_tuner_layout_transform(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = "llvm" + target = tvm.target.arm_cpu() dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" @@ -152,7 +152,7 @@ def test_graph_tuner_layout_transform(): def test_DPTuner_run(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = "llvm" + target = tvm.target.arm_cpu() dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) @@ -201,7 +201,7 @@ def test_DPTuner_run(): def test_PBQPTuner_run(): - target = "llvm" + target = tvm.target.arm_cpu() dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) From 
a69250af6b4b2021aff078188118ef2275c88d0e Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:07:39 -0700 Subject: [PATCH 115/126] proper checking --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 79e521a3a9e5..29190fa43324 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if "vta" in str(target): + if target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From d5ba66ef2de3755c46f4ed97e4fc4be9cc33fe51 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:33:21 -0700 Subject: [PATCH 116/126] unused arg --- python/tvm/relay/quantize/quantize.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 3c2a8a10026f..f209590f7ef7 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -158,9 +158,6 @@ def qconfig(**kwargs): is None, which means will try to call all operartors' annotate rewrite function. - target_vta: boolean - Whether we are performing quantization for VTA. - Returns ------- config: QConfig From 39a9d6212648c0e83b8209e30c42fab8c78e3110 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:39:10 -0700 Subject: [PATCH 117/126] adding a TODO to address later, bug fix --- python/tvm/relay/quantize/_annotate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 657fce54b42d..4bb345ee47dd 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -191,7 +191,8 @@ def check_to_skip(): return False -@register_annotate_function("nn.dense") +# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant +# @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of dense will be quantized to weight field. 
Output would be in activation field.""" @@ -203,13 +204,14 @@ def dense_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT: + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) assert rhs_kind is None rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) From 3f60022a9086c4644a3b225c833b8b1a4a5803c4 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:06:33 -0700 Subject: [PATCH 118/126] merge fix --- src/relay/pass/quantize.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 2f23c7659b02..1d08e7b7915c 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,6 +388,7 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { + static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; From 8df123a6af8d4c07e9716abe55b1eeb113db1697 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:10:53 -0700 Subject: [PATCH 119/126] merge fix --- src/relay/pass/quantize.cc | 2 -- src/relay/pass/quantize.h | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 1d08e7b7915c..75897b75831d 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -648,8 +648,6 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "nbit_activation=" << op->nbit_activation << ", "; p->stream << "global_scale=" << op->global_scale << ", "; p->stream << "skip_conv_layers==" << op->skip_conv_layers << ", "; - p->stream << "skip_k_dense==" << op->skip_k_dense << ", "; - p->stream << "skip_dense_layers==" << op->skip_dense_layers << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index c20f0b606dea..262d420acf97 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -153,8 +153,6 @@ class QConfigNode : public Node { DataType dtype_activation = Int(32); double global_scale = 8.0; Array skip_conv_layers = Array(NodePtr(nullptr)); - int skip_k_dense = 0; - Array skip_dense_layers = Array(NodePtr(nullptr)); bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); @@ -168,8 +166,6 @@ class QConfigNode : public Node { v->Visit("dtype_activation", &dtype_activation); v->Visit("global_scale", &global_scale); v->Visit("skip_conv_layers", &skip_conv_layers); - v->Visit("skip_k_dense", &skip_k_dense); - v->Visit("skip_dense_layers", &skip_dense_layers); v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); From 6c2e142b05e41ebcaf01d346491bd07099436d93 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:17:03 -0700 Subject: [PATCH 120/126] merge fixes --- python/tvm/relay/quantize/_annotate.py | 20 ++++++++++++-------- python/tvm/relay/quantize/quantize.py | 14 
-------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 4bb345ee47dd..90bb2d08a8ed 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -199,8 +199,6 @@ def dense_rewrite(ref_call, new_args, ctx): if check_to_skip(): return None - _set_dense_counter(cnt + 1) - lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -226,6 +224,7 @@ def multiply_rewrite(ref_call, new_args, ctx): if lhs_kind is None and rhs_kind is None: return None + if lhs_kind in [QAnnotateKind.ACTIVATION, QAnnotateKind.INPUT] and rhs_kind is None: # quantize lhs to INPUT field if lhs_kind == QAnnotateKind.ACTIVATION: @@ -234,6 +233,7 @@ def multiply_rewrite(ref_call, new_args, ctx): rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + raise ValueError @@ -325,6 +325,7 @@ def pool2d_rewrite(ref_call, new_args, ctx): return None if x_kind == QAnnotateKind.ACTIVATION: expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT) + expr = _forward_op(ref_call, [expr]) return QAnnotateExpr(expr, QAnnotateKind.INPUT) @@ -335,8 +336,9 @@ def pool2d_rewrite(ref_call, new_args, ctx): @register_annotate_function("annotation.force_cast") def force_cast_rewrite(ref_call, new_args, ctx): """Rewrite function to force cast""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None + expr, x_kind = _get_expr_kind(new_args[0]) if x_kind is None: @@ -395,11 +397,13 @@ def vta_expr_check(expr): @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d for VTA target""" - cnt = _conv_counter() - if cnt < current_qconfig().skip_k_conv: - _set_conv_counter(cnt + 1) - return None - _set_conv_counter(cnt + 1) + actx = annotate_context() + if current_qconfig().skip_conv_layers is not None: + skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] + if actx.conv2d_counter() in skipped_indices: + actx.count_conv2d() + return None + actx.count_conv2d() data_cond, data = vta_expr_check(new_args[0]) kernel_cond, kernel = vta_expr_check(new_args[1]) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index f209590f7ef7..c24cb153fba5 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -199,20 +199,6 @@ def annotate_context(): return AnnotateContext.Current -_DENSE_COUNTER = 0 - - -def _dense_counter(): - """Get the global counter for dense.""" - return _DENSE_COUNTER - - -def _set_dense_counter(n): - """Set the value of the global dense counter.""" - global _DENSE_COUNTER - _DENSE_COUNTER = n - - def calibrate(graph, mod=None, ctx=None): """The calibrate procedure will try to calculate the content of dom_scale, nbit, clip_min, clip_max for every `simulated_quantize` From 288883af5a16f0477b96462faec8d6ff10f4b6d0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:21:32 -0700 Subject: [PATCH 121/126] merge fix --- src/relay/pass/quantize.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 75897b75831d..1503d67feaf1 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,7 +388,6 @@ Array UnifyDTypeScale(const Array& ref_args, const 
Array& args, DataType* dtype_ptr, Expr* scale_ptr) { - static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; @@ -413,20 +412,6 @@ Array UnifyDTypeScale(const Array& ref_args, LOG(FATAL) << "should not touch here."; } - for (size_t i = 0; i < ret.size(); ++i) { - auto ref_arg = ref_args[i].as(); - if (nptrs[i]->dtype != dtype) { - ret.Set(i, Cast(ret[i], dtype)); - } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && - ref_arg->attrs.as()->kind == kQInput) { - auto new_arg = Cast(ret[i], cfg->dtype_input); - if (cfg->store_lowbit_output) { - new_arg = StopFusion(new_arg); - } - ret.Set(i, Cast(new_arg, dtype)); - } - } - // unify the dom_scale float s = ChooseDomScale(nptrs); Expr dom_scale = MakeConstantScalar(Float(32), s); From 4a61b1feecd7ecc3f09de265e7fe2e10c9fe3337 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:47:31 -0700 Subject: [PATCH 122/126] guard to avoid errors when target is set as string --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 29190fa43324..62d4b27fa303 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name == "vta": + if target.device_name and target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From bf6df6927fb7ab2077d0efbb59bef330d2a22077 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:48:44 -0700 Subject: [PATCH 123/126] reverting fix --- tests/python/unittest/test_graph_tuner_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 6e06b44b3ea9..240da7f88628 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -117,7 +117,7 @@ def _create_data(target, dshape, dtype, layout): def test_graph_tuner_layout_transform(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = tvm.target.arm_cpu() + target = "llvm" dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" @@ -152,7 +152,7 @@ def test_graph_tuner_layout_transform(): def test_DPTuner_run(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = tvm.target.arm_cpu() + target = "llvm" dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) @@ -201,7 +201,7 @@ def test_DPTuner_run(): def test_PBQPTuner_run(): - target = tvm.target.arm_cpu() + target = "llvm" dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) From 0a5b599d8a1299cf6aaffe123979e871e0ebd692 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:50:16 -0700 Subject: [PATCH 124/126] fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 62d4b27fa303..d80443a208d6 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name and target.device_name == "vta": + if hasattr(target, 'device_name') and target.device_name == "vta": 
with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From f8e629f7a40c32f169bb0fa890cbe9a9ac2b4bbe Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 21:35:46 -0700 Subject: [PATCH 125/126] removing unused comment --- python/tvm/relay/quantize/quantize.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index c24cb153fba5..fa70e1954467 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -142,10 +142,6 @@ def qconfig(**kwargs): Specifying which layers to be skipped. Provide a list of indices that indicate which conv2d layers to leave untouched. - skip_dense_layers: list - Different way of specifying which dense layers to avoid. - Provide a list of indices that indicate which conv2d layers to leave untouched. - round_for_shift: boolean Whether to add bias for rounding during shift. From 867ebdf002d0d9eab7489630028c08dd6f200b23 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 26 Jun 2019 19:51:56 -0700 Subject: [PATCH 126/126] guarding against improperly initialized TVM targets --- python/tvm/autotvm/measure/measure_methods.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b2cf73f7dee1..36efc881958e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -359,8 +359,9 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti if cuda_arch: set_cuda_target_arch(cuda_arch) - if measure_input.target.device_name == 'vta': - # if target is vta, we need to use vta build + # if target is vta, we need to use vta build + if hasattr(measure_input.target, 'device_name') and \ + measure_input.target.device_name == 'vta': import vta func = vta.build(s, args, target_host=task.target_host) else: @@ -457,7 +458,8 @@ def run_through_rpc(measure_input, build_result, # upload built module remote = request_remote(*remote_args) # Program the FPGA every single time when targeting VTA - if measure_input.target.device_name == 'vta': + if hasattr(measure_input.target, 'device_name') and \ + measure_input.target.device_name == 'vta': from vta import program_fpga, reconfig_runtime program_fpga(remote, None) reconfig_runtime(remote)
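
The guards added in the final patches of this series exist because AutoTVM can be handed a target that was
supplied as a plain string rather than as a ``tvm.target.Target`` object, and only the latter carries a
``device_name`` attribute. A minimal sketch of the check, assuming the ``tvm.target.vta()`` helper used by the
conv2d tuning template earlier in this section::

    import tvm

    def needs_vta_build(target):
        # `target` may be a tvm.target.Target or a raw string such as "llvm".
        return hasattr(target, 'device_name') and target.device_name == 'vta'

    assert not needs_vta_build("llvm")                      # plain string: no device_name attribute
    assert not needs_vta_build(tvm.target.create("llvm"))   # generic LLVM target
    assert needs_vta_build(tvm.target.vta())                # VTA target dispatches to vta.build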