From 72ec86ca5ceba5f4b595b16a3b1b3b6d8b78063c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 2 May 2019 18:53:32 -0700 Subject: [PATCH 001/126] autotvm support for conv2d operator --- python/tvm/autotvm/tophub.py | 2 +- python/tvm/target.py | 9 +- vta/python/vta/environment.py | 7 + vta/python/vta/testing/util.py | 2 +- vta/python/vta/top/__init__.py | 2 - vta/python/vta/top/arm_conv2d.py | 37 -- vta/python/vta/top/vta_conv2d.py | 403 +++------------- .../integration/test_benchmark_topi_conv2d.py | 429 ++++++++---------- 8 files changed, 277 insertions(+), 614 deletions(-) delete mode 100644 vta/python/vta/top/arm_conv2d.py diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 850f501cb1fc..37a95d6f774d 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -44,7 +44,7 @@ 'opencl': "v0.02", 'mali': "v0.05", - 'vta': "v0.04", + 'vta': "v0.05", } logger = logging.getLogger('autotvm') diff --git a/python/tvm/target.py b/python/tvm/target.py index 828fff8e228c..4548ffac4c88 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -344,7 +344,7 @@ def register(key, func=None, override=False): The function to be registered. override : bool - Whether override existing registeration. + Whether override existing registration. Returns ------- @@ -489,6 +489,13 @@ def rasp(options=None): return arm_cpu('rasp3b', options) +def vta(model='unknown', options=None): + opts = ["-device=vta", '-keys=cpu', '-model=%s' % model] + opts = _merge_opts(opts, options) + ret = _api_internal._TargetCreate("ext_dev", *opts) + return ret + + def create(target_str): """Get a target given target string. diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 4c2200d04727..093b0ec5c386 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -234,6 +234,10 @@ def gemm(self): """GEMM intrinsic""" return self.dev.gemm + @property + def target(self): + return tvm.target.vta(model=self.TARGET) + @property def target_host(self): """The target host""" @@ -243,6 +247,9 @@ def target_host(self): return "llvm" raise ValueError("Unknown target %s" % self.TARGET) + @property + def target_vta_cpu(self): + return tvm.target.arm_cpu(model=self.TARGET) def get_env(): """Get the current VTA Environment. diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index 06c700cd7119..f99541691082 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -42,7 +42,7 @@ def run(run_func): # the port it's listening to, e.g. 9090 local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: - remote = rpc.connect("localhost", local_rpc) + remote = rpc.connect("127.0.0.1", local_rpc) run_func(env, remote) else: # Make sure simulation library exists diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 614ed2347181..7346c35506a2 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,5 +1,3 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" -from .vta_conv2d import packed_conv2d, schedule_packed_conv2d from . import vta_conv2d -from . import arm_conv2d diff --git a/vta/python/vta/top/arm_conv2d.py b/vta/python/vta/top/arm_conv2d.py deleted file mode 100644 index 6e34917c0b71..000000000000 --- a/vta/python/vta/top/arm_conv2d.py +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Reuse conv2d schedule from ARM CPU""" - -import tvm - -from topi.nn import conv2d, conv2d_alter_layout -from topi import generic - -@conv2d.register(["vtacpu", "vta"]) -def compute(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return conv2d(*args, **kwargs) - -@generic.schedule_conv2d_nchw.register(["vtacpu", "vta"]) -def schedule(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return generic.schedule_conv2d_nchw(*args, **kwargs) - -@conv2d_alter_layout.register(["vtacpu", "vta"]) -def alter(*args, **kwargs): - with tvm.target.arm_cpu("vtacpu"): - return conv2d_alter_layout(*args, **kwargs) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index ef4f2017381a..681418d6ecb1 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -15,181 +15,49 @@ # specific language governing permissions and limitations # under the License. """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" -from __future__ import absolute_import as _abs -from collections import namedtuple - -import logging import tvm +from tvm import autotvm import topi -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn -from ..environment import get_env - - -Workload = namedtuple("Conv2DWorkload", - ['batch', 'height', 'width', 'in_filter', 'out_filter', - 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) - -def find_schedules(layer, vt_only=False, best_only=False): - """ Returns a schedule for a given a layer. - - Parameters - ---------- - layer : Workload - Convolutional layer description. - vt_only : Boolean - Produce a schedule plan with virtual threading. - best_only : Boolean - Return the "best" schedule plan. - - Returns - ------- - fil_sched : list - List of valid schedules. 
- - """ - # pylint: disable=too-many-nested-blocks - env = get_env() - - # Helper function to get factors - def _find_factors(n): - factors = [] - for f in range(1, n + 1): - if n % f == 0: - factors.append(f) - return factors - - def _get_data_movement_byte(schedule, layer): - """ Estimate data movement in bytes for the schedule plan - """ - env = get_env() - b_f = schedule.b_factor - h_f = schedule.h_factor - w_f = schedule.w_factor - ci_f = schedule.ic_factor - co_f = schedule.oc_factor - # Derive data movement - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - input_tile_elems = b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f - output_tile_elems = b_f * h_f * w_f * co_f - # Derive tiling factors - b_factor = layer.batch // (b_f * env.BATCH) - h_factor = (layer.height // layer.hstride) // h_f - w_factor = (layer.width // layer.wstride) // w_f - ci_factor = layer.in_filter // (ci_f * env.BLOCK_IN) - co_factor = layer.out_filter // (co_f * env.BLOCK_OUT) - # Compute input transaction count - input_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - weight_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - output_xfers = b_factor * h_factor * w_factor * co_factor - # Compute total transfer sizes - input_xfer_byte = input_tile_elems * input_xfers * inp_elem_sizeb // 8 - weight_xfer_byte = weight_tile_elems * weight_xfers * wgt_elem_sizeb // 8 - output_xfer_byte = output_tile_elems * output_xfers * out_elem_sizeb // 8 - total_xfer_byte = input_xfer_byte + weight_xfer_byte + output_xfer_byte - return total_xfer_byte - - # Scheduling exploration - batch_factors = _find_factors(layer.batch // env.BATCH) - height_factors = _find_factors(layer.height // layer.hstride) - width_factors = _find_factors(layer.width // layer.wstride) - cin_factors = _find_factors(layer.in_filter // env.BLOCK_IN) - cout_factors = _find_factors(layer.out_filter // env.BLOCK_OUT) - ht_factors = [1, 2] - cot_factors = [1, 2] - - # Explore schedules - schedules = [] - for b_f in batch_factors: - for h_f in height_factors: - for w_f in width_factors: - for ci_f in cin_factors: - for co_f in cout_factors: - # FIXME: 2D load pattern matching imposes restrictions on schedule - valid = (w_f == layer.width // layer.wstride) or \ - (w_f != layer.width // layer.wstride and co_f == 1) and \ - ci_f == 1 - if valid: - schedules.append([b_f, h_f, w_f, ci_f, co_f]) +import numpy as np - # Filter the schedules that wouldn't work in the available BRAM sizes - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - inp_brams_sizeb = env.INP_BUFF_SIZE * 8 - wgt_brams_sizeb = env.WGT_BUFF_SIZE * 8 - out_brams_sizeb = env.OUT_BUFF_SIZE * 8 - fil_sched = [] - xfer_size = [] - for sched in schedules: - b_f, h_f, w_f, ci_f, co_f = sched - for h_t in ht_factors: - for co_t in cot_factors: - # Make sure to filter cases where we apply threading on two axes - # or cases where the threading factors for h and co are not - # factors of h and co - if (h_t == 2 and co_t == 2) or (h_f % h_t != 0) or (co_f % co_t != 0): - continue - # Adjust tile sizes if threading is applied - h_f //= h_t - co_f //= co_t - # Derive tile sizes - input_tile_elems = 
b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f * co_f - output_tile_elems = b_f * h_f * w_f * co_f - - # Derive valid schedule filter - valid = True - # If in vitrual-threaded mode, only allow for threaded plans - valid &= (vt_only and (h_t == 2 or co_t == 2)) or not vt_only - # Check that we don't exceed input/weight/output capacity - valid &= input_tile_elems * inp_elem_sizeb <= inp_brams_sizeb // (co_t * h_t) - valid &= weight_tile_elems * wgt_elem_sizeb <= wgt_brams_sizeb - valid &= output_tile_elems * out_elem_sizeb <= out_brams_sizeb // (co_t * h_t) - # Make sure that we don't write to the same acc location within 2 consecutive cycles - valid &= h_f > 2 and w_f > 2 - # TODO: check that we don't exceed instruction or micro-op count - - if valid: - schedule = Schedule(b_factor=b_f, oc_factor=co_f, ic_factor=ci_f, h_factor=h_f, - w_factor=w_f, oc_nthread=co_t, h_nthread=h_t) - fil_sched.append(schedule) - xfer_size.append(_get_data_movement_byte(schedule, layer)) +from ..environment import get_env - if best_only and xfer_size: - return [fil_sched[xfer_size.index(min(xfer_size))]] - return fil_sched +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False -def packed_conv2d(data, +@autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') +def packed_conv2d(cfg, + data, kernel, - padding, strides, - out_dtype="int32"): - """ Packed conv2d function. - """ + padding, + dilation, + layout, + out_dtype): + """ Packed conv2d function.""" + if not is_packed_layout(layout): + raise topi.InvalidShapeError() + assert dilation == (1, 1) + if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: pad_data = data assert len(data.shape) == 6 assert len(kernel.shape) == 6 - oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) + oheight = topi.util.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) + owidth = topi.util.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) ishape = topi.util.get_const_tuple(data.shape) kshape = topi.util.get_const_tuple(kernel.shape) - assert data.dtype == "int8", data.dtype - assert kernel.dtype == "int8", kernel.dtype d_i = tvm.reduce_axis((0, kshape[2]), name='d_i') d_j = tvm.reduce_axis((0, kshape[3]), name='d_j') k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') @@ -202,154 +70,28 @@ def packed_conv2d(data, kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), name="res", tag="packed_conv2d") - return res - -@tvm.register_func("nnvm.compiler.build_target", override=True) -def _build(funcs, target, target_host): - tvm_t = tvm.target.create(target) - if tvm_t.device_name == "vta": - return tvm.build(funcs, target="ext_dev", target_host=target_host) - if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": - return tvm.build(funcs, target=target_host) - return tvm.build(funcs, target=target) - - -@tvm.register_func("nnvm.compiler.lower", override=True) -def _lower(sch, inputs, func_name, graph): - import traceback - # pylint: disable=broad-except - try: - f = tvm.lower(sch, inputs, name=func_name) 
- if "quantized_conv2d" in func_name: - logging.info(graph.ir(join_entry_attrs=["shape"])) - except Exception: - msg = traceback.format_exc() - msg += "Error during compile graph\n" - msg += "--------------------------\n" - msg += graph.ir(join_entry_attrs=["shape"]) - raise RuntimeError(msg) - return f if isinstance( - f, (tvm.container.Array, tuple, list)) else [f] - - -@reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, _): - """ Clip operator. - """ - x = inputs[0] - a_min = attrs.get_float("a_min") - a_max = attrs.get_float("a_max") - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - with tvm.tag_scope(topi.tag.ELEMWISE): - x = tvm.compute( - x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute( - x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -# override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - -@reg.register_alter_op_layout("conv2d", level=15) -def alter_conv2d_layout(attrs, inputs, out): - layout = attrs['layout'] - if is_packed_layout(layout): - return None - return _nn.alter_conv2d_layout(attrs, inputs, out) - - -@reg.register_compute("conv2d", level=15) -def compute_conv2d(attrs, inputs, out): - """ 2D convolution algorithm. - """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] - assert dilation == (1, 1), "not support dilate now" - if is_packed_layout(layout): - assert groups == 1 - return packed_conv2d(inputs[0], inputs[1], - padding, strides, out_dtype=out_dtype) - return _nn.compute_conv2d(attrs, inputs, out) - - -@reg.register_schedule("conv2d", level=15) -def schedule_conv2d(attrs, outs, target): - """ 2D convolution schedule. - """ - layout = attrs["layout"] - - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": - return schedule_packed_conv2d(outs) - if str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("not support target %s" % target) - return _nn.schedule_conv2d(attrs, outs, target) - - -def _get_workload(data, pad_data, kernel, output): - """ Get the workload structure. 
- """ - o_shape = topi.util.get_const_tuple(output.shape) - d_shape = topi.util.get_const_tuple(data.shape) - k_shape = topi.util.get_const_tuple(kernel.shape) - o_b, o_c, o_h, o_w, ob_blk, o_blk = o_shape - i_b, i_c, i_h, i_w, ib_blk, i_blk = d_shape - k_o, k_i, k_h, k_w, ko_blk, ki_blk = k_shape - # For now we need to assume that input channel blocking is the same - # as the output channel blocking - assert o_blk == i_blk - assert ob_blk == ib_blk - # Make sure that dimensions match - assert o_b == i_b - assert o_blk == ko_blk - assert i_blk == ki_blk - assert k_o == o_c - assert k_i == i_c - # Scale the channel size - i_c *= i_blk - o_c *= o_blk - if pad_data is not None: - p_shape = topi.util.get_const_tuple(pad_data.shape) - h_pad = (p_shape[2] - d_shape[2]) // 2 - w_pad = (p_shape[3] - d_shape[3]) // 2 - else: - h_pad, w_pad = 0, 0 - h_str = (i_h + h_pad*2 - k_h) // (o_h - 1) - w_str = (i_w + w_pad*2 - k_w) // (o_w - 1) - return Workload(i_b, i_h, i_w, i_c, o_c, k_h, k_w, h_pad, w_pad, h_str, w_str) - -_WL2PLAN = {} + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * + kshape[2] * kshape[3] * ishape[1] * ishape[-1]) + return res -def schedule_packed_conv2d(outs): - """ Schedule the packed conv2d. - """ +@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') +def schedule_packed_conv2d(cfg, outs): assert len(outs) == 1 output = outs[0] + const_ops = [] ewise_inputs = [] ewise_ops = [] conv2d_res = [] - assert output.dtype == "int8" - assert output.op.input_tensors[0].dtype == "int32" + assert "int" in output.op.input_tensors[0].dtype def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - ewise_ops.append(op) + if len(op.axis) == 0: + const_ops.append(op) + else: + ewise_ops.append(op) for tensor in op.input_tensors: if isinstance(tensor.op, tvm.tensor.PlaceholderOp): ewise_inputs.append((op, tensor)) @@ -362,6 +104,19 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) + s = tvm.create_schedule(output.op) + + ##### space definition begin ##### + b, co, h, w, bi, ci = s[conv2d_stage].op.axis + ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis + cfg.define_split('tile_b', b, num_outputs=2) + cfg.define_split('tile_h', h, num_outputs=2) + cfg.define_split('tile_w', w, num_outputs=2) + cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_knob('oc_nthread', [1, 2]) + cfg.define_knob('h_nthread', [1, 2]) + ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: @@ -370,21 +125,8 @@ def _traverse(op): data = temp else: pad_data = None - wrkld = _get_workload(data, pad_data, kernel, output) - if wrkld in _WL2PLAN: - plan = _WL2PLAN[wrkld] - else: - plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] - logging.info("Trying to find plan for %s", wrkld) - env = get_env() - - load_inp = load_wgt = load_out = store_out = env.dma_copy - alu = env.alu - gemm = env.gemm - # schedule1 - oshape = topi.util.get_const_tuple(output.shape) - s = tvm.create_schedule(output.op) + env = get_env() # setup pad if pad_data is not None: @@ -394,27 +136,26 @@ def _traverse(op): cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) s[conv2d_stage].set_scope(env.acc_scope) + # cache read input cache_read_ewise = [] - for consumer, tensor in ewise_inputs: 
cache_read_ewise.append( s.cache_read(tensor, env.acc_scope, [consumer])) + # set ewise scope for op in ewise_ops: s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], alu) + s[op].pragma(s[op].op.axis[0], env.alu) - # tile - oc_factor = (plan.oc_factor if plan.oc_factor - else plan.out_filter // env.BLOCK_OUT) - h_factor = (plan.h_factor if plan.h_factor else oshape[2]) - w_factor = (plan.w_factor if plan.w_factor else oshape[3]) + for op in const_ops: + s[op].compute_inline() + # tile x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = s[output].split(x_co, factor=oc_factor) - x_i0, x_i1 = s[output].split(x_i, factor=h_factor) - x_j0, x_j1 = s[output].split(x_j, factor=w_factor) + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 @@ -425,17 +166,17 @@ def _traverse(op): for tensor in cache_read_ewise: s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], load_out) + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) # virtual threading along output channel axes - if plan.oc_nthread > 1: - _, v_t = s[output].split(x_co0, factor=plan.oc_nthread) + if cfg['oc_nthread'].val > 1: + _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) # virtual threading along spatial rows - if plan.h_nthread > 1: - _, v_t = s[output].split(x_i0, factor=plan.h_nthread) + if cfg['h_nthread'].val > 1: + _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) @@ -443,17 +184,17 @@ def _traverse(op): k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - if plan.ic_factor: - k_o, _ = s[conv2d_stage].split(k_o, factor=plan.ic_factor) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) + k_o, _ = cfg['tile_ci'].apply(s, conv2d_stage, k_o) + s[cdata].compute_at(s[conv2d_stage], k_o) + s[ckernel].compute_at(s[conv2d_stage], k_o) # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], load_inp) - s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt) - s[conv2d_stage].tensorize(x_bi, gemm) - s[output].pragma(x_co1, store_out) + s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) + s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[conv2d_stage].tensorize(x_bi, env.gemm) + s[output].pragma(x_co1, env.dma_copy) return s +<<<<<<< HEAD class Conv2DSchedule(object): """ 2D convolution schedule object. @@ -508,3 +249,5 @@ def __str__(self): else: logging.warning("No valid schedule was found for the workload on current vta configuration") break +======= +>>>>>>> autotvm support for conv2d operator diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 8a03cb020260..56b66bdb0101 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -16,6 +16,12 @@ # under the License. 
"""Testing if we can generate code in topi style""" +import os +import json +from collections import namedtuple + +import numpy as np + import tvm from tvm import autotvm from tvm.contrib import util @@ -24,10 +30,30 @@ import topi.testing import vta import vta.testing -import numpy as np - -Workload = vta.top.vta_conv2d.Workload - +from vta.testing import simulator + +Workload = namedtuple("Conv2DWorkload", + ['batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +# ResNet18 workloads +resnet_wkls = [ + # Workloads of resnet18 on imagenet + # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), + ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), + ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C7', Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C8', Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C9', Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C10', Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C11', Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C12', Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), +] + +# FIXME: we need a custom clip operator to circumvent a pattern detection limitation @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" @@ -37,249 +63,168 @@ def my_clip(x, a_min, a_max): x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return x -def test_cpu_conv2d(): - def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True): - data_shape = (batch_size, wl.in_filter, wl.height, wl.width) - kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - res_conv = topi.nn.conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), - strides=(wl.hstride, wl.wstride), - dilation=(1, 1), - out_dtype="int32") - res = topi.right_shift(res_conv, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - - a_shape = (batch_size, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype - assert wl.hpad == wl.wpad - padding = wl.hpad - - @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") - def get_ref_data(): - a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) - w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) - a_np = np.abs(a_np) - w_np = np.abs(w_np) - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - - def verify(s, check_correctness): - mod = tvm.build(s, [data, kernel, res], - 
target_host=env.target_host, - name="conv2d") - temp = util.tempdir() - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.cpu(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - res_shape = topi.util.get_const_tuple(res.shape) - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_orig, ctx) - kernel_arr = tvm.nd.array(kernel_orig, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=5) - cost = time_f(data_arr, kernel_arr, res_arr) - res_unpack = res_arr.asnumpy() - if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref = np.clip(res_ref, 0, 127).astype("int8") - tvm.testing.assert_allclose(res_unpack, res_ref) - return cost - - def conv_normal(print_ir): - print("----- CONV2D CPU End-to-End Test-------") - s = topi.generic.schedule_conv2d_nchw([res]) - if print_ir: - print(tvm.lower(s, [data, kernel, res], simple_mode=True)) - cost = verify(s, True) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - conv_normal(False) - - def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - batch_size = 1 - for i in range(1, len(resnet)): - wl = resnet[i] - key = "resnet-cfg[%d]" % i - print("key=%s" % key) - print(wl) - with tvm.target.create("llvm -device=vtacpu"): - run_cpu_conv2d(env, remote, key, batch_size, wl) - - # load pre-tuned operator parameters for ARM CPU - autotvm.tophub.check_backend('vta') - with autotvm.tophub.context('llvm -device=vtacpu'): - vta.testing.run(_run) - - -def test_vta_conv2d(): - def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): - data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN, - wl.height, wl.width, env.BATCH, env.BLOCK_IN) +def run_conv2d(env, remote, wl, target, + check_correctness=True, print_ir=False, + samples=4, profileOnly=False): + + # Workload assertions + assert wl.hpad == wl.wpad + + # Perform packing only if we are targeting the accelerator + if "arm_cpu" in target.keys: + data_pack = False + layout = "NCHW" + elif "vta" in target.keys: + data_pack = True + layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + + # Derive shapes depending upon packing + a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + b_shape = (wl.batch, wl.out_filter, 1, 1) + if data_pack: + data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, + wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) - bias_shape = (1, wl.out_filter//env.BLOCK_OUT, 1, 
1, env.BATCH, env.BLOCK_OUT) - - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) - - res_conv = vta.top.packed_conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride)) - res = topi.right_shift(res_conv, 8) + bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + else: + data_shape = a_shape + kernel_shape = w_shape + bias_shape = b_shape + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + + # Define base computation schedule + with target: + res = topi.nn.conv2d( + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + layout, env.acc_dtype) + res = topi.right_shift(res, 8) res = topi.add(res, bias) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - - a_shape = (batch_size, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype - assert wl.hpad == wl.wpad - padding = wl.hpad - - @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") - def get_ref_data(): - a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) - w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) - a_np = np.abs(a_np) - w_np = np.abs(w_np) - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - def verify(s, check_correctness): - mod = vta.build(s, [data, kernel, bias, res], "ext_dev", - env.target_host, name="conv2d") - temp = util.tempdir() - - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.ext_dev(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") - bias_orig = np.abs(bias_orig) - - data_packed = data_orig.reshape( - batch_size//env.BATCH, env.BATCH, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, - wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) - kernel_packed = kernel_orig.reshape( - wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, - wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) - bias_packed = bias_orig.reshape( - 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - res_shape = topi.util.get_const_tuple(res.shape) - - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_packed, ctx) - kernel_arr = tvm.nd.array(kernel_packed, ctx) - bias_arr = tvm.nd.array(bias_packed, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=5) + res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) + res = topi.cast(res, env.out_dtype) + # Derive base schedule + s = 
topi.generic.schedule_conv2d_nchw([res]) + if print_ir: + print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) + + # Derive number of ops + fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 + fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 + num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter + + # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") + def get_ref_data(): + # derive min max for act, wgt, and bias types (max non inclusive) + a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) + w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) + b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2) + a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) + w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) + b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) + r_np = topi.testing.conv2d_nchw_python( + a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype) + return a_np, w_np, b_np, r_np + + # Data in original format + data_np, kernel_np, bias_np, res_ref = get_ref_data() + if data_pack: + data_np = data_np.reshape( + wl.batch//env.BATCH, env.BATCH, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) + kernel_np = kernel_np.reshape( + wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) + bias_np = bias_np.reshape( + wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + + # Build + if "vta" in target.keys: + mod = vta.build(s, [data, kernel, bias, res], + target=target, + target_host=env.target_host, + name="conv2d") + else: + mod = tvm.build(s, [data, kernel, bias, res], + target=target, + target_host=env.target_host, + name="conv2d") + temp = util.tempdir() + mod.save(temp.relpath("conv2d.o")) + remote.upload(temp.relpath("conv2d.o")) + f = remote.load_module("conv2d.o") + ctx = remote.context(str(target)) + + res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + data_arr = tvm.nd.array(data_np, ctx) + kernel_arr = tvm.nd.array(kernel_np, ctx) + bias_arr = tvm.nd.array(bias_np, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=samples) + + # In vta sim mode, collect simulator runtime statistics + stats = {} + cost = None + if env.TARGET == "sim": + # Check if we're in local RPC mode (allows us to rebuild the + # runtime on the fly when varying the VTA designs) + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote.get_function("vta.simulator.profiler_clear")() + if profileOnly: + remote.get_function("vta.simulator.profiler_debug_mode")(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - res_unpack = res_arr.asnumpy().transpose( - (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) - if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref += bias_orig.reshape(wl.out_filter, 1, 1) - res_ref = np.clip(res_ref, 0, 127).astype("int8") - tvm.testing.assert_allclose(res_unpack, res_ref) - return cost - - def conv_normal(print_ir): - print("----- CONV2D End-to-End Test-------") - with 
vta.build_config(): - s = vta.top.schedule_packed_conv2d([res]) - if print_ir: - print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) - cost = verify(s, True) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - conv_normal(False) - + stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) + else: + simulator.clear_stats() + if profileOnly: + simulator.debug_mode(1) + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + stats = simulator.stats() + else: + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + + # Check correctness + correct = False + if check_correctness: + res_orig = res_arr.asnumpy() + if data_pack: + res_orig = res_orig.transpose( + (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) + res_ref = res_ref >> 8 + res_ref += bias_np.reshape(wl.out_filter, 1, 1) + res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) + res_ref = res_ref.astype(env.out_dtype) + correct = np.allclose(res_orig, res_ref) + + gops = (num_ops / cost.mean) / float(10 ** 9) + status = "PASSED" if correct else "FAILED" + if "arm_cpu" in target.keys: + device = "CPU" + elif "vta" in target.keys: + device = "VTA" + print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + + return correct, cost, stats + +def test_conv2d(device="vta"): def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - - batch_size = 1 - for i in range(0, len(resnet)): - wl = resnet[i] - key = "resnet-cfg[%d]" % i - print("key=%s" % key) - print(wl) - run_vta_conv2d(env, remote, key, batch_size, wl) - + if device == "vta": + target = env.target + elif device == "arm_cpu": + target = env.target_vta_cpu + with autotvm.tophub.context(target): # load pre-tuned schedule parameters + for _, wl in resnet_wkls: + print(wl) + run_conv2d(env, remote, wl, target) vta.testing.run(_run) - if __name__ == "__main__": - test_cpu_conv2d() - test_vta_conv2d() + test_conv2d(device="arm_cpu") + test_conv2d(device="vta") From e9c995bb98fae8c31fd680e92f7fb124822e7551 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 21:43:49 -0700 Subject: [PATCH 002/126] removing progileOnly option --- vta/tests/python/integration/test_benchmark_topi_conv2d.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 56b66bdb0101..9ae39bca63d2 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -65,7 +65,7 @@ def my_clip(x, a_min, a_max): def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, - samples=4, profileOnly=False): + samples=4): # Workload assertions 
assert wl.hpad == wl.wpad @@ -177,14 +177,10 @@ def get_ref_data(): local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: remote.get_function("vta.simulator.profiler_clear")() - if profileOnly: - remote.get_function("vta.simulator.profiler_debug_mode")(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) else: simulator.clear_stats() - if profileOnly: - simulator.debug_mode(1) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) stats = simulator.stats() else: From 25713863bc3da4a6acdd7bc9ed8226f93a59b7d8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 21:54:13 -0700 Subject: [PATCH 003/126] removing unsupported layer --- vta/tests/python/integration/test_benchmark_topi_conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 9ae39bca63d2..28c8af4283ce 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -41,7 +41,7 @@ # Workloads of resnet18 on imagenet # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), - ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), + # ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), # this layer does not appear in ResNet ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), From 77e9191fac344d70dbb77cf898bf3dc8b86d7c0f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 6 May 2019 22:00:02 -0700 Subject: [PATCH 004/126] fixing bare metal test build --- vta/python/vta/pkg_config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 2c30414ace1a..3977d5aa2e43 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -77,8 +77,6 @@ def __init__(self, cfg, proj_root): if self.target == "pynq": self.ldflags = [ "-L/usr/lib", - "-L/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/", - "-L/opt/python3.6/lib/python3.6/site-packages/pynq/lib/", "-l:libcma.so"] else: self.ldflags = [] From f87417ac35aa9a6c0bb7ec52c27cfdf862d538ee Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 8 May 2019 23:26:49 -0700 Subject: [PATCH 005/126] refactoring resnet WIP --- vta/tutorials/resnet.py | 256 +++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 132 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index df3bb0607284..7930bfe750c6 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -37,7 +37,7 @@ import numpy as np import requests -from matplotlib import pyplot as plt +#from matplotlib import pyplot as plt from PIL import Image import tvm @@ -82,67 +82,6 @@ def classify(m, image): tcost = "t={0:.2f}s".format(tcost.mean) return tcost + " {}".format(synset[top]) -# Helper function to compile the NNVM graph -# Takes in a path to a graph file, params file, and device target -# Returns the NNVM graph object, a compiled library object, and the params dict -def generate_graph(graph_fn, params_fn, device="vta"): - # Measure build start time - build_start = time.time() - - # Derive the TVM target - target = 
tvm.target.create("llvm -device={}".format(device)) - - # Derive the LLVM compiler flags - # When targetting the Pynq, cross-compile to ARMv7 ISA - if env.TARGET == "sim": - target_host = "llvm" - elif env.TARGET == "pynq": - target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon" - - # Load the ResNet-18 graph and parameters - sym = nnvm.graph.load_json(open(graph_fn).read()) - params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) - - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): - if target.device_name != "vta": - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - else: - with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) - - # Send the inference library over to the remote RPC server - remote.upload(temp.relpath("graphlib.o")) - lib = remote.load_module("graphlib.o") - - # Measure build time - build_time = time.time() - build_start - print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) - - return graph, lib, params - ###################################################################### # Download ResNet Model @@ -169,7 +108,7 @@ def generate_graph(graph_fn, params_fn, device="vta"): synset = eval(open(os.path.join(data_dir, categ_fn)).read()) # Download pre-tuned op parameters of conv2d for ARM CPU used in VTA -autotvm.tophub.check_backend('vta') +# autotvm.tophub.check_backend('vta') ###################################################################### @@ -213,21 +152,74 @@ def generate_graph(graph_fn, params_fn, device="vta"): # ------------------------ # Build the ResNet graph runtime, and configure the parameters. -# Set ``device=vtacpu`` to run inference on the CPU +# Set ``device=arm_cpu`` to run inference on the CPU # or ``device=vta`` to run inference on the FPGA. 
device = "vta" -# Device context -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +# Derive the TVM target +if device == "vta": + target = env.target +elif device == "arm_cpu": + target = env.target_vta_cpu +ctx = remote.context(str(target)) + +# TVM module +m = None + +with autotvm.tophub.context(target): + + graph_fn = os.path.join(data_dir, graph_fn) + params_fn= os.path.join(data_dir, params_fn) + + # Measure build start time + build_start = time.time() + + # Load the ResNet-18 graph and parameters + sym = nnvm.graph.load_json(open(graph_fn).read()) + params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) + + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + if target.device_name != "vta": + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=env.target_host) + + # Save the compiled inference graph library + assert tvm.module.enabled("rpc") + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + + # Send the inference library over to the remote RPC server + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Measure build time + build_time = time.time() - build_start + print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) -# Build the graph runtime -graph, lib, params = generate_graph(os.path.join(data_dir, graph_fn), - os.path.join(data_dir, params_fn), - device) -m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, ctx) -# Set the parameters -m.set_input(**params) + # Set the parameters + m.set_input(**params) ###################################################################### # Run ResNet-18 inference on a sample image @@ -241,8 +233,8 @@ def generate_graph(graph_fn, params_fn, device="vta"): response = requests.get(image_url) image = Image.open(BytesIO(response.content)).resize((224, 224)) # Show Image -plt.imshow(image) -plt.show() +# plt.imshow(image) +# plt.show() # Set the input image = process_image(image) m.set_input('data', image) @@ -271,60 +263,60 @@ def generate_graph(graph_fn, params_fn, device="vta"): # Comment the `if False:` out to run the demo # Early exit - remove for Demo -if False: - - import cv2 - import pafy - from IPython.display import clear_output - - # Helper to crop an image to a square (224, 224) - # Takes in an Image object, returns an Image object - def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - - # 16:16 inches - plt.rcParams['figure.figsize'] = [16, 16] - - # Stream the video in - url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" - video = pafy.new(url) - best = video.getbest(preftype="mp4") - cap = cv2.VideoCapture(best.url) - - # Process one frame 
out of every 48 for variety - count = 0 - guess = "" - while(count<2400): - - # Capture frame-by-frame - ret, frame = cap.read() - - # Process one every 48 frames - if count % 48 == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = Image.fromarray(frame) - # Crop and resize - thumb = np.array(thumbnailify(frame)) - image = process_image(thumb) - guess = classify(m, image) - - # Insert guess in frame - frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) - cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - - plt.imshow(thumb) - plt.axis('off') - plt.show() - if cv2.waitKey(1) & 0xFF == ord('q'): - break - clear_output(wait=True) - - count += 1 - - # When everything done, release the capture - cap.release() - cv2.destroyAllWindows() +# if False: + +# import cv2 +# import pafy +# from IPython.display import clear_output + +# # Helper to crop an image to a square (224, 224) +# # Takes in an Image object, returns an Image object +# def thumbnailify(image, pad=15): +# w, h = image.size +# crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) +# image = image.crop(crop) +# image = image.resize((224, 224)) +# return image + +# # 16:16 inches +# plt.rcParams['figure.figsize'] = [16, 16] + +# # Stream the video in +# url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" +# video = pafy.new(url) +# best = video.getbest(preftype="mp4") +# cap = cv2.VideoCapture(best.url) + +# # Process one frame out of every 48 for variety +# count = 0 +# guess = "" +# while(count<2400): + +# # Capture frame-by-frame +# ret, frame = cap.read() + +# # Process one every 48 frames +# if count % 48 == 1: +# frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) +# frame = Image.fromarray(frame) +# # Crop and resize +# thumb = np.array(thumbnailify(frame)) +# image = process_image(thumb) +# guess = classify(m, image) + +# # Insert guess in frame +# frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) +# cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) + +# plt.imshow(thumb) +# plt.axis('off') +# plt.show() +# if cv2.waitKey(1) & 0xFF == ord('q'): +# break +# clear_output(wait=True) + +# count += 1 + +# # When everything done, release the capture +# cap.release() +# cv2.destroyAllWindows() From 72f7c40da852db08aca45396a2a7596fcb2c7d2f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 9 May 2019 18:47:11 -0700 Subject: [PATCH 006/126] VTA topi support fix for NNVM --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/op.py | 132 +++++++++++++++++++++++++++++++ vta/python/vta/top/vta_conv2d.py | 12 +-- 3 files changed, 135 insertions(+), 10 deletions(-) create mode 100644 vta/python/vta/top/op.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 7346c35506a2..d1f6ec4f3ec1 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,3 +1,4 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" +from . import op from . 
import vta_conv2d diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py new file mode 100644 index 000000000000..c288263e993a --- /dev/null +++ b/vta/python/vta/top/op.py @@ -0,0 +1,132 @@ +"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +from __future__ import absolute_import as _abs + +from collections import namedtuple +import logging + +import tvm +from tvm import autotvm +import topi + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn + +from ..environment import get_env + +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + +@tvm.register_func("nnvm.compiler.build_target", override=True) +def _build(funcs, target, target_host): + tvm_t = tvm.target.create(target) + if tvm_t.device_name == "vta": + return tvm.build(funcs, target="ext_dev", target_host=target_host) + if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": + return tvm.build(funcs, target=target_host) + return tvm.build(funcs, target=target) + +@tvm.register_func("nnvm.compiler.lower", override=True) +def _lower(sch, inputs, func_name, graph): + import traceback + # pylint: disable=broad-except + try: + f = tvm.lower(sch, inputs, name=func_name) + if "quantized_conv2d" in func_name: + logging.info(graph.ir(join_entry_attrs=["shape"])) + except Exception: + msg = traceback.format_exc() + msg += "Error during compile graph\n" + msg += "--------------------------\n" + msg += graph.ir(join_entry_attrs=["shape"]) + raise RuntimeError(msg) + return f if isinstance( + f, (tvm.container.Array, tuple, list)) else [f] + +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + +@reg.register_compute("clip", level=15) +def compute_clip(attrs, inputs, _): + """ Clip operator. """ + x = inputs[0] + a_min = attrs.get_float("a_min") + a_max = attrs.get_float("a_max") + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + with tvm.tag_scope(topi.tag.ELEMWISE): + x = tvm.compute( + x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute( + x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +@reg.register_compute("conv2d", level=15) +def compute_conv2d(attrs, inputs, out): + """ 2D convolution algorithm. 
+ """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + inputs = list(inputs) + w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) + assert inputs[1].dtype == "int8" + + # Apply bit packing if necessary + if w_pack_factor != 1: + kshape = list(topi.util.get_const_tuple(inputs[1].shape)) + kshape[-1] *= w_pack_factor + inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) + + return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + else: + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, out) + +@reg.register_schedule("conv2d", level=15) +def schedule_conv2d(attrs, outs, target): + """ 2D convolution schedule. + """ + layout = attrs["layout"] + groups = attrs.get_int('groups') + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + if groups == 1: + return topi.generic.schedule_conv2d_nchw(outs) + else: + return topi.generic.schedule_group_conv2d_nchw(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + +@reg.register_alter_op_layout("conv2d", level=15) +def alter_conv2d_layout(attrs, inputs, out): + layout = attrs['layout'] + if is_packed_layout(layout): + return None + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 681418d6ecb1..78db543d4774 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -16,22 +16,14 @@ # under the License. 
"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +import numpy as np import tvm from tvm import autotvm import topi -import numpy as np - +from .op import is_packed_layout from ..environment import get_env -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') def packed_conv2d(cfg, data, From bb8093d631cb72260ff9dbfd7108db9fe4fbbd95 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 10 May 2019 13:44:08 -0700 Subject: [PATCH 007/126] fixing resnet18 tutorial to work with TOPI --- vta/tutorials/resnet.py | 135 +++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 71 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 7930bfe750c6..13161586480e 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -37,7 +37,7 @@ import numpy as np import requests -#from matplotlib import pyplot as plt +from matplotlib import pyplot as plt from PIL import Image import tvm @@ -107,10 +107,6 @@ def classify(m, image): # Read in ImageNet Categories synset = eval(open(os.path.join(data_dir, categ_fn)).read()) -# Download pre-tuned op parameters of conv2d for ARM CPU used in VTA -# autotvm.tophub.check_backend('vta') - - ###################################################################### # Setup the Pynq Board's RPC Server # --------------------------------- @@ -152,16 +148,13 @@ def classify(m, image): # ------------------------ # Build the ResNet graph runtime, and configure the parameters. -# Set ``device=arm_cpu`` to run inference on the CPU +# Set ``device=vtacpu`` to run inference on the CPU # or ``device=vta`` to run inference on the FPGA. 
device = "vta" -# Derive the TVM target -if device == "vta": - target = env.target -elif device == "arm_cpu": - target = env.target_vta_cpu -ctx = remote.context(str(target)) +# TVM target and context +target = tvm.target.create("llvm -device={}".format(device)) +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) # TVM module m = None @@ -233,8 +226,8 @@ def classify(m, image): response = requests.get(image_url) image = Image.open(BytesIO(response.content)).resize((224, 224)) # Show Image -# plt.imshow(image) -# plt.show() +plt.imshow(image) +plt.show() # Set the input image = process_image(image) m.set_input('data', image) @@ -263,60 +256,60 @@ def classify(m, image): # Comment the `if False:` out to run the demo # Early exit - remove for Demo -# if False: - -# import cv2 -# import pafy -# from IPython.display import clear_output - -# # Helper to crop an image to a square (224, 224) -# # Takes in an Image object, returns an Image object -# def thumbnailify(image, pad=15): -# w, h = image.size -# crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) -# image = image.crop(crop) -# image = image.resize((224, 224)) -# return image - -# # 16:16 inches -# plt.rcParams['figure.figsize'] = [16, 16] - -# # Stream the video in -# url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" -# video = pafy.new(url) -# best = video.getbest(preftype="mp4") -# cap = cv2.VideoCapture(best.url) - -# # Process one frame out of every 48 for variety -# count = 0 -# guess = "" -# while(count<2400): - -# # Capture frame-by-frame -# ret, frame = cap.read() - -# # Process one every 48 frames -# if count % 48 == 1: -# frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) -# frame = Image.fromarray(frame) -# # Crop and resize -# thumb = np.array(thumbnailify(frame)) -# image = process_image(thumb) -# guess = classify(m, image) - -# # Insert guess in frame -# frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) -# cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - -# plt.imshow(thumb) -# plt.axis('off') -# plt.show() -# if cv2.waitKey(1) & 0xFF == ord('q'): -# break -# clear_output(wait=True) - -# count += 1 - -# # When everything done, release the capture -# cap.release() -# cv2.destroyAllWindows() +if False: + + import cv2 + import pafy + from IPython.display import clear_output + + # Helper to crop an image to a square (224, 224) + # Takes in an Image object, returns an Image object + def thumbnailify(image, pad=15): + w, h = image.size + crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) + image = image.crop(crop) + image = image.resize((224, 224)) + return image + + # 16:16 inches + plt.rcParams['figure.figsize'] = [16, 16] + + # Stream the video in + url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" + video = pafy.new(url) + best = video.getbest(preftype="mp4") + cap = cv2.VideoCapture(best.url) + + # Process one frame out of every 48 for variety + count = 0 + guess = "" + while(count<2400): + + # Capture frame-by-frame + ret, frame = cap.read() + + # Process one every 48 frames + if count % 48 == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = Image.fromarray(frame) + # Crop and resize + thumb = np.array(thumbnailify(frame)) + image = process_image(thumb) + guess = classify(m, image) + + # Insert guess in frame + frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) + cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) + + plt.imshow(thumb) + plt.axis('off') + plt.show() + if cv2.waitKey(1) & 0xFF == 
ord('q'): + break + clear_output(wait=True) + + count += 1 + + # When everything done, release the capture + cap.release() + cv2.destroyAllWindows() From 5f783556e0f60dcf54e9aff7b67e09db6be06d0e Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 10:51:34 -0700 Subject: [PATCH 008/126] adding bitpacking support by Marissa --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/bitpack.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 vta/python/vta/top/bitpack.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index d1f6ec4f3ec1..5d212cc313ba 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,4 +1,5 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" +from . import bitpack from . import op from . import vta_conv2d diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py new file mode 100644 index 000000000000..7b09ffbf43c0 --- /dev/null +++ b/vta/python/vta/top/bitpack.py @@ -0,0 +1,70 @@ +"""Bit packing operators""" +from __future__ import absolute_import as _abs + +import tvm +from topi import util + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn +from nnvm.top.tensor import _fschedule_broadcast + +def bitpack(data, bits, pack_type="int8", name="bitpack"): + """Packs lowest dimension into format needed by VTA + Parameters + ---------- + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data + Returns + ------- + packed : Tensor + The packed tensor. + """ + shape_vec = list(data.shape) + if pack_type == 'int8': + data_width = 8 + elif pack_type == 'int16': + data_width = 16 + elif pack_type == 'int32': + data_width = 32 + else: + raise RuntimeError("Unknown pack type %s" % pack_type) + assert data_width % bits == 0 + lanes = data_width // bits + + # Data must be in multiples of the data_width + assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" + shape_vec[-1] = shape_vec[-1] // lanes + oshape = tuple(shape_vec) + + def _bitpack(*indices): + ret = None + mask = tvm.const((1 << bits) - 1, pack_type) + for k in range(lanes): + idx = list(indices) + idx[-1] = idx[-1] * lanes + k + elem = data(*idx).astype(pack_type) + if k == 0: + ret = elem & mask + else: + val = (elem & mask) << tvm.const(k * bits, pack_type) + ret = ret | val + return ret + + return tvm.compute( + oshape, _bitpack, name=name, tag='bitpack') + + +@reg.register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, out): + lanes = attrs.get_int("lanes") + dtype = inputs[0].dtype + assert dtype == "int8" + width = 8 + assert width % lanes == 0 + bits = 8 // lanes + return bitpack(inputs[0], bits, dtype) + +reg.register_schedule("bitpack", _fschedule_broadcast) +reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file From 25c88978bdd1cf36db2e672f8653de6d436826c3 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:00:45 -0700 Subject: [PATCH 009/126] no support for bitpacking below 8bits for now --- vta/python/vta/top/op.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index c288263e993a..3fe9a5ed8e70 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -84,15 +84,7 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" assert 
env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) assert inputs[1].dtype == "int8" - - # Apply bit packing if necessary - if w_pack_factor != 1: - kshape = list(topi.util.get_const_tuple(inputs[1].shape)) - kshape[-1] *= w_pack_factor - inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) else: return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) From d15c97febe34f12ad35ccd1843b1c3f2ae1b16c4 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:16:48 -0700 Subject: [PATCH 010/126] bitpacking annotations --- src/relay/op/annotation/annotation.cc | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 428df2fb1115..e0c2b20dfa1f 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -83,5 +83,39 @@ TVM_ADD_FILELINE) return {topi::identity(inputs[0])}; }); +RELAY_REGISTER_OP("bitpack_start") +.describe(R"code( +Mark the start of bitpacking. +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + +RELAY_REGISTER_OP("bitpack_end") +.describe(R"code( +Mark the end of bitpacking. +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + } // namespace relay } // namespace tvm From 51463ffb72360ce7c6c93020ce321d66c50ee99b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 11:20:10 -0700 Subject: [PATCH 011/126] fix --- src/relay/op/annotation/annotation.cc | 4 ++-- src/relay/pass/fuse_ops.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index e0c2b20dfa1f..f09a3a22e3ab 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -58,7 +58,7 @@ RELAY_REGISTER_OP("on_device") ElemwiseArbitraryLayout); Expr StopFusion(Expr data) { - static const Op& op = Op::Get("annotation.stop_fusion"); + static const Op& op = Op::Get("stop_fusion"); return CallNode::make(op, {data}, Attrs{}, {}); } @@ -67,7 +67,7 @@ TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") return StopFusion(data); }); -RELAY_REGISTER_OP("annotation.stop_fusion") +RELAY_REGISTER_OP("stop_fusion") .describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" TVM_ADD_FILELINE) .set_num_inputs(1) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 9f940e54953b..9cd73171bfea 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -821,7 +821,7 @@ class FuseMutator : private ExprMutator { // Transform calls. 
Expr VisitExpr_(const CallNode* call) { - static const Op& stop_fusion = Op::Get("annotation.stop_fusion"); + static const Op& stop_fusion = Op::Get("stop_fusion"); if (call->op.as()) { // If it is a primitive op call // then we must have a group assignment for it already. From 8bea36836f1a8efc60324ec443e76a72e925b326 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:02:30 -0700 Subject: [PATCH 012/126] relay topi integtation for vta --- vta/python/vta/top/__init__.py | 2 + vta/python/vta/top/bitpack.py | 18 +++-- vta/python/vta/top/nnvm_bitpack.py | 70 ++++++++++++++++++ vta/python/vta/top/nnvm_op.py | 113 +++++++++++++++++++++++++++++ vta/python/vta/top/op.py | 95 +++++++----------------- vta/python/vta/top/vta_conv2d.py | 9 ++- 6 files changed, 228 insertions(+), 79 deletions(-) create mode 100644 vta/python/vta/top/nnvm_bitpack.py create mode 100644 vta/python/vta/top/nnvm_op.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 5d212cc313ba..f7b48c0bde2d 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -2,4 +2,6 @@ from . import bitpack from . import op +from . import nnvm_bitpack +from . import nnvm_op from . import vta_conv2d diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 7b09ffbf43c0..2265af4518b4 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -4,18 +4,20 @@ import tvm from topi import util -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn -from nnvm.top.tensor import _fschedule_broadcast +from tvm.relay.op.op import register_compute, register_schedule +from tvm.relay.op.op import register_pattern, OpPattern +from tvm.relay.op.op import schedule_injective def bitpack(data, bits, pack_type="int8", name="bitpack"): """Packs lowest dimension into format needed by VTA + Parameters ---------- pack_axis : int index of the axis to pack in data bit_axis : int index of axis to place bit axis in resulting packed data + Returns ------- packed : Tensor @@ -56,9 +58,9 @@ def _bitpack(*indices): oshape, _bitpack, name=name, tag='bitpack') -@reg.register_compute("bitpack", level=15) -def compute_bitpack(attrs, inputs, out): - lanes = attrs.get_int("lanes") +@register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, output_type, target): + lanes = attrs.lanes dtype = inputs[0].dtype assert dtype == "int8" width = 8 @@ -66,5 +68,5 @@ def compute_bitpack(attrs, inputs, out): bits = 8 // lanes return bitpack(inputs[0], bits, dtype) -reg.register_schedule("bitpack", _fschedule_broadcast) -reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file +register_schedule("bitpack", schedule_injective) +register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py new file mode 100644 index 000000000000..7b09ffbf43c0 --- /dev/null +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -0,0 +1,70 @@ +"""Bit packing operators""" +from __future__ import absolute_import as _abs + +import tvm +from topi import util + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn +from nnvm.top.tensor import _fschedule_broadcast + +def bitpack(data, bits, pack_type="int8", name="bitpack"): + """Packs lowest dimension into format needed by VTA + Parameters + ---------- + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data + Returns + 
------- + packed : Tensor + The packed tensor. + """ + shape_vec = list(data.shape) + if pack_type == 'int8': + data_width = 8 + elif pack_type == 'int16': + data_width = 16 + elif pack_type == 'int32': + data_width = 32 + else: + raise RuntimeError("Unknown pack type %s" % pack_type) + assert data_width % bits == 0 + lanes = data_width // bits + + # Data must be in multiples of the data_width + assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" + shape_vec[-1] = shape_vec[-1] // lanes + oshape = tuple(shape_vec) + + def _bitpack(*indices): + ret = None + mask = tvm.const((1 << bits) - 1, pack_type) + for k in range(lanes): + idx = list(indices) + idx[-1] = idx[-1] * lanes + k + elem = data(*idx).astype(pack_type) + if k == 0: + ret = elem & mask + else: + val = (elem & mask) << tvm.const(k * bits, pack_type) + ret = ret | val + return ret + + return tvm.compute( + oshape, _bitpack, name=name, tag='bitpack') + + +@reg.register_compute("bitpack", level=15) +def compute_bitpack(attrs, inputs, out): + lanes = attrs.get_int("lanes") + dtype = inputs[0].dtype + assert dtype == "int8" + width = 8 + assert width % lanes == 0 + bits = 8 // lanes + return bitpack(inputs[0], bits, dtype) + +reg.register_schedule("bitpack", _fschedule_broadcast) +reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py new file mode 100644 index 000000000000..ce69b2b438d1 --- /dev/null +++ b/vta/python/vta/top/nnvm_op.py @@ -0,0 +1,113 @@ +"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +from __future__ import absolute_import as _abs + +import logging + +import tvm +import topi + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn + +from .vta_conv2d import is_packed_layout +from ..environment import get_env + +@tvm.register_func("nnvm.compiler.build_target", override=True) +def _build(funcs, target, target_host): + tvm_t = tvm.target.create(target) + if tvm_t.device_name == "vta": + return tvm.build(funcs, target="ext_dev", target_host=target_host) + if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": + return tvm.build(funcs, target=target_host) + return tvm.build(funcs, target=target) + +@tvm.register_func("nnvm.compiler.lower", override=True) +def _lower(sch, inputs, func_name, graph): + import traceback + # pylint: disable=broad-except + try: + f = tvm.lower(sch, inputs, name=func_name) + if "quantized_conv2d" in func_name: + logging.info(graph.ir(join_entry_attrs=["shape"])) + except Exception: + msg = traceback.format_exc() + msg += "Error during compile graph\n" + msg += "--------------------------\n" + msg += graph.ir(join_entry_attrs=["shape"]) + raise RuntimeError(msg) + return f if isinstance( + f, (tvm.container.Array, tuple, list)) else [f] + +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + +@reg.register_compute("clip", level=15) +def compute_clip(attrs, inputs, _): + """ Clip operator. 
""" + x = inputs[0] + a_min = attrs.get_float("a_min") + a_max = attrs.get_float("a_max") + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + with tvm.tag_scope(topi.tag.ELEMWISE): + x = tvm.compute( + x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute( + x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +@reg.register_compute("conv2d", level=15) +def compute_conv2d(attrs, inputs, out): + """ Compute definition of conv2d """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + inputs = list(inputs) + assert inputs[1].dtype == "int8" + return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + else: + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, out) + +@reg.register_schedule("conv2d", level=15) +def schedule_conv2d(attrs, outs, target): + """ Schedule definition of conv2d """ + layout = attrs["layout"] + groups = attrs.get_int('groups') + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + if groups == 1: + return topi.generic.schedule_conv2d_nchw(outs) + else: + return topi.generic.schedule_group_conv2d_nchw(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + +@reg.register_alter_op_layout("conv2d", level=15) +def alter_conv2d_layout(attrs, inputs, out): + layout = attrs['layout'] + if is_packed_layout(layout): + return None + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 3fe9a5ed8e70..7f3c58a46116 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -1,61 +1,27 @@ """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs -from collections import namedtuple import logging import tvm -from tvm import autotvm import topi -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn +from tvm.relay.op import op as reg +from tvm.relay.op.op import OpPattern +from tvm.relay.op.nn import _nn +from .vta_conv2d import is_packed_layout from ..environment import get_env -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - -@tvm.register_func("nnvm.compiler.build_target", override=True) -def _build(funcs, target, target_host): - tvm_t = tvm.target.create(target) - if tvm_t.device_name == "vta": - return tvm.build(funcs, target="ext_dev", target_host=target_host) - if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": - return 
tvm.build(funcs, target=target_host) - return tvm.build(funcs, target=target) - -@tvm.register_func("nnvm.compiler.lower", override=True) -def _lower(sch, inputs, func_name, graph): - import traceback - # pylint: disable=broad-except - try: - f = tvm.lower(sch, inputs, name=func_name) - if "quantized_conv2d" in func_name: - logging.info(graph.ir(join_entry_attrs=["shape"])) - except Exception: - msg = traceback.format_exc() - msg += "Error during compile graph\n" - msg += "--------------------------\n" - msg += graph.ir(join_entry_attrs=["shape"]) - raise RuntimeError(msg) - return f if isinstance( - f, (tvm.container.Array, tuple, list)) else [f] - # override to force partition at copy reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) @reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, _): +def compute_clip(attrs, inputs, output_type, target): """ Clip operator. """ x = inputs[0] - a_min = attrs.get_float("a_min") - a_max = attrs.get_float("a_max") + a_min = attrs.a_min + a_max = attrs.a_max const_min = tvm.const(a_min, x.dtype) const_max = tvm.const(a_max, x.dtype) with tvm.tag_scope(topi.tag.ELEMWISE): @@ -63,18 +29,17 @@ def compute_clip(attrs, inputs, _): x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") x = tvm.compute( x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -@reg.register_compute("conv2d", level=15) -def compute_conv2d(attrs, inputs, out): - """ 2D convolution algorithm. - """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] + return [x] + +@reg.register_compute("nn.conv2d", level=15) +def compute_conv2d(attrs, inputs, output_type, target): + """ Compute definition of conv2d """ + padding = topi.util.get_const_tuple(attrs.padding) + strides = topi.util.get_const_tuple(attrs.strides) + dilation = tuple([int(d) for d in attrs.dilation]) + groups = attrs.groups + layout = attrs.data_layout + out_dtype = attrs.out_dtype assert dilation == (1, 1), "not support dilate now" if is_packed_layout(layout): @@ -85,19 +50,18 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) + return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] else: - return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + return [topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype)] with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_conv2d(attrs, inputs, out) + return _nn.compute_conv2d(attrs, inputs, output_type, target) -@reg.register_schedule("conv2d", level=15) +@reg.register_schedule("nn.conv2d", level=15) def schedule_conv2d(attrs, outs, target): - """ 2D convolution schedule. 
- """ - layout = attrs["layout"] - groups = attrs.get_int('groups') + """ Schedule definition of conv2d """ + groups = attrs.groups + layout = attrs.data_layout if is_packed_layout(layout): target = tvm.target.create(target) @@ -113,12 +77,3 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) - -@reg.register_alter_op_layout("conv2d", level=15) -def alter_conv2d_layout(attrs, inputs, out): - layout = attrs['layout'] - if is_packed_layout(layout): - return None - - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.alter_conv2d_layout(attrs, inputs, out) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 78db543d4774..1672af47ca0c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -21,9 +21,16 @@ from tvm import autotvm import topi -from .op import is_packed_layout from ..environment import get_env +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') def packed_conv2d(cfg, data, From 3f31c6c394bc5daf91ce60d54a8fe2b69a87a380 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:03:02 -0700 Subject: [PATCH 013/126] operator tagging for broadcast --- src/relay/backend/compile_engine.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index f11dd2875b80..7ae1befcfe89 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -170,7 +171,7 @@ class ScheduleGetter : LOG(FATAL) << "not handled"; return tvm::Expr(); } - }); + }, "compile_engine_const", topi::kBroadcast); scalars_.push_back(value->op); return {value}; } From e7f104991004e3201666e00afea8eb2f3e19d2d8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:03:23 -0700 Subject: [PATCH 014/126] invalid shape error --- topi/python/topi/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index d4e23be47e58..edf8ee11e884 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -23,6 +23,10 @@ from tvm.api import layout, bijective_layout from . import tag +class InvalidShapeError(ValueError): + """Invalid shape for a topi function. i.e. call winograd template for non-3x3 kernel)""" + pass + def traverse_inline(s, final_op, callback): """Traverse computation graph and do auto inline From 52c19f415d4cc82039f7b2d290a77eacb3b7d376 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:26:37 -0700 Subject: [PATCH 015/126] relay graph pack pass --- vta/python/vta/top/__init__.py | 4 +- vta/python/vta/top/graphpack.py | 277 ++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 vta/python/vta/top/graphpack.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index f7b48c0bde2d..5111035decd3 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,7 +1,9 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" from . import bitpack -from . import op +from .graphpack import graph_pack from . 
import nnvm_bitpack +from .nnvm_graphpack import nnvm_graph_pack from . import nnvm_op +from . import op from . import vta_conv2d diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py new file mode 100644 index 000000000000..3ce50d06dbda --- /dev/null +++ b/vta/python/vta/top/graphpack.py @@ -0,0 +1,277 @@ +"""A Relay implementation of graph packing.""" + +from tvm import relay +from tvm.relay import op +from tvm.relay import ExprMutator + +def _to_shape(shape): + return tuple(int(sh) for sh in shape) + +def _pack_batch_channel(data, dshape, bfactor, cfactor): + """Pack the data channel dimension. + """ + assert int(dshape[0]) % bfactor == 0 + assert int(dshape[1]) % cfactor == 0 + data = op.reshape(data, + newshape=(int(dshape[0]) // bfactor, bfactor, + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) + data = op.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _unpack_batch_channel(data, old_shape): + """Unpack the data channel dimension. + """ + data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) + data = op.reshape(data, newshape=old_shape) + return data + + +def _pack_weight(data, dshape, cfactor): + """Pack the weight into packed format. + """ + assert len(dshape) == 4 + assert int(dshape[0]) % cfactor == 0 + assert int(dshape[1]) % cfactor == 0 + data = op.reshape(data, + newshape=(int(dshape[0]) // cfactor, cfactor, + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) + data = op.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _pack_weight_conv2d_transpose(data, dshape, cfactor): + """Pack the weight into packed format. + """ + dshape = _to_shape(dshape) + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = op.reshape(data, + newshape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = op.transpose( + data, axes=(2, 0, 4, 5, 3, 1)) + return data + + +def _pack_bias(data, dshape, dtype, bfactor, cfactor): + """Pack the bias parameter. + """ + dshape = _to_shape(dshape) + assert len(dshape) == 3 + assert dshape[0] % cfactor == 0 + data = op.reshape(data, + newshape=(dshape[0] // cfactor, + cfactor, dshape[1], + dshape[2], 1)) + data = op.transpose( + data, axes=(0, 2, 3, 4, 1)) + + # broadcast batch dimension to bfactor + data = op.broadcast_to( + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + return data + + +def _get_shape(node): + """Get the shape of a node. + """ + return _to_shape(node.checked_type.shape) + +class ExprPack(ExprMutator): + def __init__(self, bfactor, cfactor, weight_bits): + self.bfactor = bfactor + self.cfactor = cfactor + self.weight_bits = weight_bits + self.start_pack = False + # Cache Operator the algorithm matches against. + self.bitpack_start = op.op.get('bitpack_start') + self.bitpack_end = op.op.get('bitpack_end') + self.conv2d = op.op.get("nn.conv2d") + self.conv2d_transpose = op.op.get("nn.conv2d_transpose") + self.add = op.op.get("add") + self.bias_add = op.op.get("nn.bias_add") + self.number_of_conv2d = 0 + super().__init__() + + def visit_call(self, call): + # First visit the children. + oshape = _get_shape(call) + odtype = call.checked_type.dtype + input_types = [arg.checked_type for arg in call.args] + args = [self.visit(arg) for arg in call.args] + + # Start and stop cases. 
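+        # Shape sketch (assuming bfactor=1, cfactor=16): at bitpack_start an
+        # NCHW tensor of shape (1, 64, 56, 56) is packed into
+        # (1, 4, 56, 56, 1, 16); bitpack_end later undoes the transpose and
+        # reshape to restore the original layout.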
+ if call.op == self.bitpack_start: + assert not self.start_pack + self.start_pack = True + return _pack_batch_channel(args[0], oshape, self.bfactor, self.cfactor) + elif call.op == self.bitpack_end: + if self.start_pack: + self.start_pack = False + data = args[0] + data_shape = _get_shape(call.args[0]) + return _unpack_batch_channel(data, data_shape) + else: + pass + if self.start_pack: + # Operator cases + if call.op == self.conv2d and odtype == 'int32': + self.number_of_conv2d += 1 + assert 8 % self.weight_bits == 0 + w_lanes = 8 // self.weight_bits + data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) + kernel_layout = "OIHW%do%di" % (self.cfactor, self.cfactor) + data, weight = args + data_shape = _to_shape(input_types[0].shape) + kernel_shape = _to_shape(input_types[1].shape) + kernel = _pack_weight(weight, kernel_shape, self.cfactor) + # insert bit packing when necessary + if w_lanes != 1: + assert 8 % w_lanes == 0 + kernel = op.bitpack(kernel, lanes=w_lanes) + conv2d = op.nn.conv2d( + data, + kernel, + strides=call.attrs.strides, + padding=call.attrs.padding, + dilation=call.attrs.dilation, + groups=call.attrs.groups, + channels=call.attrs.channels, + kernel_size=call.attrs.kernel_size, + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype=call.attrs.out_dtype) + return conv2d + elif call.op == self.conv2d_transpose and odtype == 'int32': + self.number_of_conv2d += 1 + assert 8 % self.weight_bits == 0 + w_lanes = 8 // self.weight_bits + if self.start_pack: + data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) + kernel_layout = "IOHW%di%do" % (self.cfactor, self.cfactor) + data, weight = args + data_shape = _to_shape(input_types[0].shape) + kernel_shape = _to_shape(input_types[1].shape) + kernel = _pack_weight_conv2d_transpose(weight, kernel_shape, self.cfactor) + conv2d = op.nn.conv2d_transpose( + data, + kernel, + strides=call.attrs.strides, + padding=call.attrs.padding, + dilation=call.attrs.dilation, + groups=call.attrs.groups, + channels=call.attrs.channels, + kernel_size=call.attrs.kernel_size, + data_layout=data_layout, + kernel_layout=kernel_layout, + output_padding=call.attrs.output_padding, + out_dtype=call.attrs.out_dtype) + return conv2d + elif call.op == self.add and tuple(input_types[0].shape) == tuple(input_types[1].shape): + pass + elif call.op == self.add and len(input_types[1].shape) == 3: + data, bias = args + bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + return relay.Call(self.add, [data, bias]) + elif self.start_pack and call.op == self.bias_add: + data, bias = args + bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + return relay.Call(self.add, [data, bias]) + elif self.start_pack and call.op == op.op.get('cast') and input_types[0].dtype == 'int32': + cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) + return relay.Call(op.op.get('copy'), [cast]) + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + +class BT(Exception): + pass +def get_subgraph(expr, start_name, stop_name): + "we assume stop_name only appear once for simplicity." + "this constraint will be lifted in the future." 
+ "bitpack_start and bitpack_end is both inclusive" + bitpack_start = op.op.get('bitpack_start') + bitpack_end = op.op.get('bitpack_end') + anf = relay.ir_pass.to_a_normal_form(expr) + def recursion(anf, start_found, stop_found): + if isinstance(anf, relay.expr.Function): + return relay.expr.Function(anf.params, recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) + elif isinstance(anf, relay.expr.Let): + value = anf.value + if isinstance(value, relay.expr.Call): + if isinstance(value.op, relay.op.Op): + if value.op.name == start_name and not start_found: + value = relay.expr.Call(bitpack_start, [value]) + start_found = True + elif value.op.name == stop_name: + raise BT() + try: + return relay.expr.Let(anf.var, value, recursion(anf.body, start_found, stop_found)) + except BT: + assert start_found + assert not stop_found + stop_found = True + value = relay.expr.Call(bitpack_end, [value]) + return relay.expr.Let(anf.var, value, anf.body) # todo: check anf.body has no more stop_name beside that one + else: + assert start_found + assert stop_found + return anf + annotated = recursion(anf, False, False) + return relay.ir_pass.infer_type(relay.ir_pass.to_graph_normal_form(annotated)) + +def graph_pack(expr, + bfactor, + cfactor, + weight_bits, + start_name="nn.max_pool2d", + stop_name="nn.global_avg_pool2d"): + """Pack the graph into batch&channel packed format. + + Parameters + ---------- + expr : relay.Expr + The input program. + + bfactor : int + The packing factor in batch + + cfactor : int + The packing factor in channel + + weight_bits: int + The bit-width of the weights. + + start_name: str, optional + Start packing from certain known node. + + stop_name: str, optional + Stop packing from certain known node. + + Returns + ------- + expr : Expr + The transformed expression. 
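+
+    Example
+    -------
+    A minimal sketch, assuming a quantized Relay function ``func`` and the
+    default VTA configuration (``env.BATCH == 1``, ``env.BLOCK_OUT == 16``,
+    ``env.WGT_WIDTH == 8``)::
+
+        func = graph_pack(func, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH)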
+ """ + assert isinstance(expr, relay.Function) + expr = get_subgraph(expr, start_name, stop_name) + print("Before", expr.astext(show_meta_data=False)) + expr = relay.ir_pass.infer_type(expr) + packer = ExprPack( + bfactor, cfactor, + weight_bits) + expr = packer.visit(expr) + print("After", expr.astext(show_meta_data=False)) + assert not packer.start_pack + return relay.ir_pass.infer_type(expr) + From ab01f07d197f413dde3342b863e5cafa9064a113 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 14 May 2019 16:49:17 -0700 Subject: [PATCH 016/126] test script for relay to vta compilation --- vta/scripts/relay_to_vta.py | 284 ++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 vta/scripts/relay_to_vta.py diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py new file mode 100644 index 000000000000..11cddef8b976 --- /dev/null +++ b/vta/scripts/relay_to_vta.py @@ -0,0 +1,284 @@ +"""Perform inference on VTA using Relay.""" + +import argparse, json, requests, time +from io import BytesIO +from mxnet.gluon.model_zoo import vision +import numpy as np +from os.path import join, isfile +from PIL import Image + +import tvm +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +import vta +from vta.testing import simulator +from vta.top import graph_pack + +parser = argparse.ArgumentParser(description='Train a model for image classification.') +parser.add_argument('--model', type=str, required=True, + help='Input model name.') +parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') +parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') +parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') +parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') +parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + +opt = parser.parse_args() + +if 'mobilenet' in opt.model: + opt.start_name = 'nn.relu' +elif 'gan' in opt.model: + opt.start_name = 'reshape0' + opt.stop_name = 'copy2' +elif 'rnn' in opt.model: + opt.start_name = 'reshape0' + opt.stop_name = 'reshape1' + +# Helper function to read in image +# Takes in Image object, returns an ND array +def process_image(image): + # Convert to neural network input format + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + + return tvm.nd.array(image.astype("float32")) + +def mark_nop(graph, + conv_layer=-1, + skip_conv_layer=(), + reverse=False, + conv2d_only=False): + """Helper function to mark certain op as nop + + Useful to debug performance issues. 
+ """ + jgraph = json.loads(graph.json()) + counter = 0 + for _, node in enumerate(jgraph["nodes"]): + op_name = node["op"] + if op_name != "tvm_op": + continue + attrs = node["attrs"] + func_name = attrs["func_name"] + + if func_name.find("conv2d") != -1: + if conv_layer >= 0: + if counter != conv_layer: + attrs["func_name"] = "__nop" + if counter in skip_conv_layer: + attrs["func_name"] = "__nop" + counter += 1 + else: + if conv_layer >= 0: + attrs["func_name"] = "__nop" + attrs["func_name"] = "__nop" + + if reverse: + if attrs["func_name"] != "__nop": + attrs["func_name"] = "__nop" + else: + attrs["func_name"] = func_name + + if conv2d_only: + if attrs["func_name"].find("conv2d") == -1: + attrs["func_name"] = "__nop" + + graph = nnvm.graph.load_json(json.dumps(jgraph)) + return graph + + +def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): + # Read in ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + # Read in test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + # Read in test image + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)).resize((224, 224)) + # Set the input + image = process_image(image) + if "gan" in opt.model or "rnn" in opt.model: + # non-classification networks require custom input shapes and out shapes + m.set_input('data', tvm.nd.array( + 10 * np.random.uniform(size=shape_dict['data']).astype(dtype_dict['data']))) + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("Performed inference in %.2fms/samlple (std = %.2f)" % (mean, std)) + else: + image = np.repeat(image.asnumpy(), env.BATCH, axis=0) + m.set_input('data', image) + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + + if opt.debug_profile: + m.run() + + # Get classification results + tvm_output = m.get_output(0, + tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) + top_categories = np.argsort(tvm_output.asnumpy()[0]) + + # Report top-5 classification results + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("%s Prediction" % opt.model) + print(" #1:", synset[top_categories[-1]]) + print(" #2:", synset[top_categories[-2]]) + print(" #3:", synset[top_categories[-3]]) + print(" #4:", synset[top_categories[-4]]) + print(" #5:", synset[top_categories[-5]]) + print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + +###################################################################### +# Setup the Pynq Board's RPC Server +# --------------------------------- +# Build the RPC server's VTA runtime and program the Pynq FPGA. + +def run(device = "vta"): + env = vta.get_env() + # Measure build start time + reconfig_start = time.time() + + # We configure both the bitstream and the runtime system on the Pynq + # to match the VTA configuration specified by the vta_config.json file. 
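+    # (When env.TARGET == "sim" this step is skipped and a local RPC session
+    # is created further below instead.)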
+ if env.TARGET != "sim": + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, '10.77.1.109', 9190, timeout=10000) + + # Reconfigure the JIT runtime + vta.reconfig_runtime(remote) + + # Program the FPGA with a pre-compiled VTA bitstream. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. + elif env.TARGET == "sim": + remote = rpc.LocalSession() + + # TVM target and context + target = tvm.target.create("llvm -device={}".format(device)) + ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + + with autotvm.tophub.context(target): + + # Measure build start time + build_start = time.time() + + # Derive the LLVM compiler flags + # When targetting the Pynq/Ultra-96, cross-compile to ARM ISA + target_host = env.target_host + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + if "gan" in opt.model: + shape_dict = {"data": (env.BATCH, 100)} + elif 'rnn' in opt.model: + batch_size, seq_len, hidden_dim = 4, 1, 640 + begin_state_shape = (batch_size, hidden_dim, 1, 1) + shape_dict = {"data": (seq_len, batch_size), + "cell_l0_begin_state_0": begin_state_shape, + "cell_l1_begin_state_0": begin_state_shape} + dtype_dict = {"data": "int32", + "cell_l0_begin_state_0": 'float32', + "cell_l1_begin_state_0": 'float32'} + else: + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_graph, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_graph = relay.quantize.quantize(relay_graph, params=params) + + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_graph = graph_pack( + relay_graph, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + + relay_graph = relay.ir_pass.fold_constant(relay_graph) + + # Compile Relay program. 
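+    # For the VTA target the build below is additionally wrapped in
+    # vta.build_config() so the VTA-specific lowering is applied; the CPU
+    # path uses a plain relay.build().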
+ with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}): + if target.device_name != "vta": + # import pdb; pdb.set_trace() + graph, lib, params = relay.build( + relay_graph, target=target, + params=params, target_host=target_host) + else: + # import pdb; pdb.set_trace() + with vta.build_config(): + graph, lib, params = relay.build( + relay_graph, target=target, + params=params, target_host=target_host) + + + # Save the compiled inference graph library + assert tvm.module.enabled("rpc") + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + + # Send the inference library over to the remote RPC server + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Measure build time + build_time = time.time() - build_start + print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) + + cpu_skip_layer = (0,) if "gan" in opt.model else (3,) + # profile script, set this to False to run end to end + if opt.debug_fpga_only: + graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer) + elif opt.debug_cpu_only: + graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer, reverse=True) + elif opt.run_conv_layer: + conv_set = tuple(int(x) for x in opt.run_conv_layer.split(",")) + graph = mark_nop(graph, + skip_conv_layer=conv_set, + reverse=True, + conv2d_only=True) + + if opt.debug_profile: + m = debug_runtime.create(graph, lib, ctx) + else: + m = graph_runtime.create(graph, lib, ctx) + + # Set the parameters + m.set_input(**params) + demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict) + +run(opt.device) From 32773ad730d07011828b255fe0e36aa558e02b22 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 15 May 2019 14:37:13 -0700 Subject: [PATCH 017/126] adding nnvm graphpack: --- vta/python/vta/top/nnvm_graphpack.py | 206 +++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 vta/python/vta/top/nnvm_graphpack.py diff --git a/vta/python/vta/top/nnvm_graphpack.py b/vta/python/vta/top/nnvm_graphpack.py new file mode 100644 index 000000000000..1f713acd3e27 --- /dev/null +++ b/vta/python/vta/top/nnvm_graphpack.py @@ -0,0 +1,206 @@ +"""An NNVM implementation of graph packing.""" + +import nnvm +from nnvm.compiler import graph_attr, graph_util + +def _pack_batch_channel(data, dshape, bfactor, cfactor): + """Pack the data channel dimension. + """ + assert dshape[0] % bfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // bfactor, bfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _unpack_batch_channel(data, old_shape): + """Unpack the data channel dimension. + """ + data = nnvm.sym.transpose(data, axes=(0, 4, 1, 5, 2, 3)) + data = nnvm.sym.reshape(data, shape=old_shape) + return data + + +def _pack_weight(data, dshape, cfactor): + """Pack the weight into packed format. + """ + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(0, 2, 4, 5, 1, 3)) + return data + + +def _pack_weight_conv2d_transpose(data, dshape, cfactor): + """Pack the weight into packed format. 
+ """ + assert len(dshape) == 4 + assert dshape[0] % cfactor == 0 + assert dshape[1] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, cfactor, + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) + data = nnvm.sym.transpose( + data, axes=(2, 0, 4, 5, 3, 1)) + return data + + +def _pack_bias(data, dshape, bfactor, cfactor): + """Pack the bias parameter. + """ + assert len(dshape) == 3 + assert dshape[0] % cfactor == 0 + data = nnvm.sym.reshape(data, + shape=(dshape[0] // cfactor, + cfactor, dshape[1], + dshape[2], 1)) + data = nnvm.sym.transpose( + data, axes=(0, 2, 3, 4, 1)) + # broadcast batch dimension to bfactor + data = nnvm.sym.broadcast_to( + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + return data + + +def _get_shape(sym, shape_dict): + """Get the shape of a node. + """ + return graph_util.infer_shape( + nnvm.graph.create(sym), **shape_dict)[1][0] + + +def nnvm_graph_pack(graph, + shape_dict, + bfactor, + cfactor, + weight_bits, + start_name="max_pool2d0", + stop_name="global_avg_pool2d0"): + """Pack the graph into batch&channel packed format. + + Parameters + ---------- + graph : Graph + The input graph. + + shape_dict : dict of str to shape + The input shape. + + bfactor : int + The packing factor in batch + + cfactor : int + The packing factor in channel + + start_name: str, optional + Start packing from certain known node. + + start_name: str, optional + Stop packing from certain known node. + + Returns + ------- + graph : Graph + The transformed graph. + """ + graph = graph_attr.set_shape_inputs(graph, shape_dict) + graph = graph.apply("InferShape") + shape = graph.json_attr("shape") + gidx = graph.index + node_map = {} + dset = set() + start_pack = False + + for nid, node in enumerate(gidx.nodes): + children = [node_map[e[0]] for e in node["inputs"]] + ishape = [shape[gidx.entry_id(e)] for e in node["inputs"]] + oshape = shape[gidx.entry_id(nid, 0)] + attrs = node.get("attrs", {}) + node_name = node["name"] + op_name = node["op"] + get_clone = lambda c, o_n, n_n, a: getattr(nnvm.symbol, o_n)( + *c, name=n_n, **a) + if op_name == "null": + new_node = nnvm.symbol.Variable(node_name) + if start_name and node_name == start_name: + start_pack = True + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + if start_pack and "_begin_state_" in node_name: # RNN -> CNN, pack + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + elif node_name == start_name: + assert not start_pack + start_pack = True + new_node = get_clone(children, op_name, node_name, attrs) + new_node = _pack_batch_channel(new_node, oshape, bfactor, cfactor) + elif node_name == stop_name: + if start_pack: + start_pack = False + children[0] = _unpack_batch_channel(children[0], ishape[0]) + new_node = getattr(nnvm.symbol, op_name)( + *children, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name == "conv2d" and attrs.get("out_dtype", None) == "int32": + assert 8 % weight_bits == 0 + w_lanes = 8 // weight_bits + if start_pack: + attrs["layout"] = "NCHW%dn%dc" % (bfactor, cfactor) + attrs["kernel_layout"] = "OIHW%do%di%dp" % (cfactor, cfactor, w_lanes) + data, weight = children + weight = _pack_weight(weight, ishape[1], cfactor) + # insert bit packing when necessary + if w_lanes != 1: + assert 8 % w_lanes == 0 + weight = nnvm.sym.bitpack(weight, lanes=w_lanes) + new_node = nnvm.sym.conv2d( + data, weight, name=node_name, **attrs) + else: + new_node = 
get_clone(children, op_name, node_name, attrs) + elif op_name == "conv2d_transpose" and attrs.get("out_dtype", None) == "int32": + assert 8 % weight_bits == 0 + w_lanes = 8 // weight_bits + if start_pack: + attrs["layout"] = "NCHW%dn%dc" % (bfactor, cfactor) + attrs["kernel_layout"] = "IOHW%di%do%dp" % (cfactor, cfactor, w_lanes) + data, weight = children + weight = _pack_weight_conv2d_transpose(weight, ishape[1], cfactor) + new_node = nnvm.sym.conv2d_transpose( + data, weight, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("broadcast_") and tuple(ishape[0]) == tuple(ishape[1]): + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("broadcast") and len(ishape[1]) == 3: + if start_pack: + children[1] = _pack_bias(children[1], ishape[1], bfactor, cfactor) + new_node = getattr(nnvm.symbol, op_name)( + *children, name=node_name, **attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + elif op_name.startswith("elementwise_add"): + new_node = get_clone(children, op_name, node_name, attrs) + else: + new_node = get_clone(children, op_name, node_name, attrs) + dset.add(op_name) + node_map[nid] = new_node + + assert len(graph.index.output_entries) == 1 + ret = node_map[graph.index.output_entries[0][0]] + if start_pack: + oshape = shape[graph.index.output_entries[0][0]] + ret = _unpack_batch_channel(ret, oshape) + graph = nnvm.graph.create(ret) + graph = graph_attr.set_shape_inputs(graph, shape_dict) + graph = graph.apply("InferShape") + return graph From 73036e3ce25a0ec504ee3291f809cec1abfd9557 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 15 May 2019 17:45:12 -0700 Subject: [PATCH 018/126] clean up of script --- vta/scripts/relay_to_vta.py | 59 ------------------------------------- 1 file changed, 59 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 11cddef8b976..66af34f659e4 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -51,50 +51,6 @@ def process_image(image): return tvm.nd.array(image.astype("float32")) -def mark_nop(graph, - conv_layer=-1, - skip_conv_layer=(), - reverse=False, - conv2d_only=False): - """Helper function to mark certain op as nop - - Useful to debug performance issues. - """ - jgraph = json.loads(graph.json()) - counter = 0 - for _, node in enumerate(jgraph["nodes"]): - op_name = node["op"] - if op_name != "tvm_op": - continue - attrs = node["attrs"] - func_name = attrs["func_name"] - - if func_name.find("conv2d") != -1: - if conv_layer >= 0: - if counter != conv_layer: - attrs["func_name"] = "__nop" - if counter in skip_conv_layer: - attrs["func_name"] = "__nop" - counter += 1 - else: - if conv_layer >= 0: - attrs["func_name"] = "__nop" - attrs["func_name"] = "__nop" - - if reverse: - if attrs["func_name"] != "__nop": - attrs["func_name"] = "__nop" - else: - attrs["func_name"] = func_name - - if conv2d_only: - if attrs["func_name"].find("conv2d") == -1: - attrs["func_name"] = "__nop" - - graph = nnvm.graph.load_json(json.dumps(jgraph)) - return graph - - def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): # Read in ImageNet Categories url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" @@ -234,12 +190,10 @@ def run(device = "vta"): # Compile Relay program. 
with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}): if target.device_name != "vta": - # import pdb; pdb.set_trace() graph, lib, params = relay.build( relay_graph, target=target, params=params, target_host=target_host) else: - # import pdb; pdb.set_trace() with vta.build_config(): graph, lib, params = relay.build( relay_graph, target=target, @@ -258,19 +212,6 @@ def run(device = "vta"): # Measure build time build_time = time.time() - build_start print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - - cpu_skip_layer = (0,) if "gan" in opt.model else (3,) - # profile script, set this to False to run end to end - if opt.debug_fpga_only: - graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer) - elif opt.debug_cpu_only: - graph = mark_nop(graph, skip_conv_layer=cpu_skip_layer, reverse=True) - elif opt.run_conv_layer: - conv_set = tuple(int(x) for x in opt.run_conv_layer.split(",")) - graph = mark_nop(graph, - skip_conv_layer=conv_set, - reverse=True, - conv2d_only=True) if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) From 44b3e5023d149354d9d08e60255376ff3e0163d2 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 17 May 2019 02:09:52 +0000 Subject: [PATCH 019/126] adding rpc server with fleet server registration --- apps/pynq_rpc/start_rpc_server_to_tracker.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 apps/pynq_rpc/start_rpc_server_to_tracker.sh diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/pynq_rpc/start_rpc_server_to_tracker.sh new file mode 100755 index 000000000000..0299ce55c89e --- /dev/null +++ b/apps/pynq_rpc/start_rpc_server_to_tracker.sh @@ -0,0 +1,5 @@ +#!/bin/bash +PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" + +export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python +python3.6 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq From 049118ceb2a2b13703dddbf3db8b5663b598a7bb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 17 May 2019 02:14:16 +0000 Subject: [PATCH 020/126] adding license --- apps/pynq_rpc/start_rpc_server_to_tracker.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/pynq_rpc/start_rpc_server_to_tracker.sh b/apps/pynq_rpc/start_rpc_server_to_tracker.sh index 0299ce55c89e..f1b906327add 100755 --- a/apps/pynq_rpc/start_rpc_server_to_tracker.sh +++ b/apps/pynq_rpc/start_rpc_server_to_tracker.sh @@ -1,5 +1,23 @@ #!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
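+#
+# Run this script on the Pynq board to register its RPC server with the RPC
+# tracker (assumed reachable as "fleet" on port 9190) under device key "pynq".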
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" + export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python -python3.6 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq +export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq +python3 -m vta.exec.rpc_server --tracker fleet:9190 --key pynq From d304f641465a55a58af1e6ce7057378b56cff7f8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 20 May 2019 16:42:44 -0700 Subject: [PATCH 021/126] increasing allocatable buffer size --- vta/include/vta/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h index eca9e4da9799..2d8e9c2c3d84 100644 --- a/vta/include/vta/driver.h +++ b/vta/include/vta/driver.h @@ -42,7 +42,7 @@ extern "C" { /*! \brief Physically contiguous buffer size limit */ #ifndef VTA_MAX_XFER -#define VTA_MAX_XFER (1<<22) +#define VTA_MAX_XFER (1<<25) #endif /*! PAGE SIZE */ From b22b96ce9c7855429275b87b963a7ef70bf935c1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 20 May 2019 17:55:05 -0700 Subject: [PATCH 022/126] adding bitstream programming in conv2d test; support for getting remote from tracker --- vta/python/vta/testing/util.py | 23 ++++++++++++------- .../integration/test_benchmark_topi_conv2d.py | 5 ++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index f99541691082..b009b7f27fd3 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -18,7 +18,7 @@ from __future__ import absolute_import as _abs import os -from tvm import rpc +from tvm import rpc, autotvm from ..environment import get_env from . import simulator @@ -54,12 +54,19 @@ def run(run_func): elif env.TARGET == "pynq": - # Run on PYNQ if env variable exists - host = os.environ.get("VTA_PYNQ_RPC_HOST", None) - port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) - if host and port: - remote = rpc.connect(host, port) + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + pynq_host = os.environ.get("VTA_PYNQ_RPC_HOST", None) + pynq_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) + # Run device from fleet node if env variables are defined + if tracket_host and tracket_port: + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) run_func(env, remote) else: - raise RuntimeError( - "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") + # Next, run on PYNQ if env variables are defined + if pynq_host and pynq_port: + remote = rpc.connect(pynq_host, pynq_port) + run_func(env, remote) + else: + raise RuntimeError( + "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 28c8af4283ce..dc7b5d710c29 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -29,6 +29,7 @@ import topi import topi.testing import vta +from vta import program_fpga, reconfig_runtime import vta.testing from vta.testing import simulator @@ -213,6 +214,10 @@ def test_conv2d(device="vta"): def _run(env, remote): if device == "vta": target = env.target + if env.TARGET != "sim": + assert tvm.module.enabled("rpc") + program_fpga(remote, bitstream=None) + reconfig_runtime(remote) elif device == 
"arm_cpu": target = env.target_vta_cpu with autotvm.tophub.context(target): # load pre-tuned schedule parameters From 2aed8e62ee3a9c1f5d23b97b9e8a7bec8de63ff2 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 10:35:52 -0700 Subject: [PATCH 023/126] removing printfs --- vta/src/runtime.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index 06b34743955f..f44e3cab8a82 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -908,12 +908,10 @@ class CommandQueue { insn_queue_.InitSpace(); device_ = VTADeviceAlloc(); CHECK(device_ != nullptr); - printf("Initialize VTACommandHandle...\n"); } ~CommandQueue() { VTADeviceFree(device_); - printf("Close VTACommandhandle...\n"); } uint32_t GetElemBytes(uint32_t memory_id) { From 897c08ec0eb4f1d9aeeef1212520669eac530cc1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 13:55:42 -0700 Subject: [PATCH 024/126] adding option to skip execution in simulator --- vta/python/vta/testing/simulator.py | 13 +++++++++ vta/src/sim/sim_driver.cc | 42 +++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index dbeba84f6d4a..2d6cfe305756 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -84,4 +84,17 @@ def tsim_cycles(): """ return tvm.get_global_func("tvm.vta.tsim.cycles")() +# debug flag to skip execution. +DEBUG_SKIP_EXEC = 1 + +def debug_mode(flag): + """Set debug mode + Paramaters + ---------- + flag : int + The debug flag, 0 means clear all flags. + """ + tvm.get_global_func("vta.simulator.profiler_debug_mode")(flag) + + LIBS = _load_lib() diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index 5f9f6b637599..0691195f140e 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -35,6 +35,11 @@ namespace vta { namespace sim { +/*! \brief debug flag for skipping computation */ +enum DebugFlagMask { + kSkipExec = 1 +}; + /*! * \brief Helper class to pack and unpack bits * Applies truncation when pack to low level bits. @@ -253,8 +258,12 @@ class SRAM { return &(data_[index]); } // Execute the load instruction on this SRAM - void Load(const VTAMemInsn* op, DRAM* dram, uint64_t* load_counter) { + void Load(const VTAMemInsn* op, + DRAM* dram, + uint64_t* load_counter, + bool skip_exec) { load_counter[0] += (op->x_size * op->y_size) * kElemBytes; + if (skip_exec) return; DType* sram_ptr = data_ + op->sram_base; uint8_t* dram_ptr = static_cast(dram->GetAddr( op->dram_base * kElemBytes)); @@ -325,6 +334,8 @@ class Profiler { uint64_t gemm_counter{0}; /*! \brief instr counter for ALU ops */ uint64_t alu_counter{0}; + /*! \brief set debug mode */ + int64_t debug_flag{0}; /*! \brief clear the profiler */ void Clear() { inp_load_nbytes = 0; @@ -335,6 +346,10 @@ class Profiler { gemm_counter = 0; alu_counter = 0; } + /*! \return Whether we should skip execution. 
*/ + bool SkipExec() const { + return (debug_flag & DebugFlagMask::kSkipExec) != 0; + } std::string AsJSON() { std::ostringstream os; @@ -398,13 +413,15 @@ class Device { void RunLoad(const VTAMemInsn* op) { if (op->x_size == 0) return; if (op->memory_type == VTA_MEM_ID_INP) { - inp_.Load(op, dram_, &(prof_->inp_load_nbytes)); + inp_.Load(op, dram_, &(prof_->inp_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_WGT) { - wgt_.Load(op, dram_, &(prof_->wgt_load_nbytes)); + wgt_.Load(op, dram_, &(prof_->wgt_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_ACC) { - acc_.Load(op, dram_, &(prof_->acc_load_nbytes)); + acc_.Load(op, dram_, &(prof_->acc_load_nbytes), prof_->SkipExec()); } else if (op->memory_type == VTA_MEM_ID_UOP) { - uop_.Load(op, dram_, &(prof_->uop_load_nbytes)); + // always load in uop, since uop is stateful + // subsequent non-debug mode exec can depend on it. + uop_.Load(op, dram_, &(prof_->uop_load_nbytes), false); } else { LOG(FATAL) << "Unknown memory_type=" << op->memory_type; } @@ -416,7 +433,9 @@ class Device { op->memory_type == VTA_MEM_ID_UOP) { prof_->out_store_nbytes += ( op->x_size * op->y_size * VTA_BATCH * VTA_BLOCK_OUT * VTA_OUT_WIDTH / 8); - acc_.TruncStore(op, dram_); + if (!prof_->SkipExec()) { + acc_.TruncStore(op, dram_); + } } else { LOG(FATAL) << "Store do not support memory_type=" << op->memory_type; @@ -425,7 +444,8 @@ class Device { void RunGEMM(const VTAGemInsn* op) { if (!op->reset_reg) { - prof_->gemm_counter += op->iter_out * op->iter_in; + prof_->gemm_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); + if (prof_->SkipExec()) return; for (uint32_t y = 0; y < op->iter_out; ++y) { for (uint32_t x = 0; x < op->iter_in; ++x) { for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) { @@ -459,6 +479,7 @@ class Device { } } } else { + if (prof_->SkipExec()) return; // reset for (uint32_t y = 0; y < op->iter_out; ++y) { for (uint32_t x = 0; x < op->iter_in; ++x) { @@ -477,7 +498,6 @@ class Device { } void RunALU(const VTAAluInsn* op) { - prof_->alu_counter += op->iter_out * op->iter_in; if (op->use_imm) { RunALU_(op); } else { @@ -520,6 +540,8 @@ class Device { template void RunALULoop(const VTAAluInsn* op, F func) { + prof_->alu_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); + if (prof_->SkipExec()) return; for (int y = 0; y < op->iter_out; ++y) { for (int x = 0; x < op->iter_in; ++x) { for (int k = op->uop_bgn; k < op->uop_end; ++k) { @@ -566,6 +588,10 @@ TVM_REGISTER_GLOBAL("vta.simulator.profiler_status") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = Profiler::ThreadLocal()->AsJSON(); }); +TVM_REGISTER_GLOBAL("vta.simulator.profiler_debug_mode") +.set_body([](TVMArgs args, TVMRetValue* rv) { + Profiler::ThreadLocal()->debug_flag = args[0]; + }); } // namespace sim } // namespace vta From f956f1580053872a848412e98f27cc3f863fa672 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 13:59:17 -0700 Subject: [PATCH 025/126] InvalidShapeError reporting --- topi/python/topi/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index a9984148d5d3..ac855d144aad 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -35,6 +35,8 @@ from . import image from . import sparse from . 
import hls +# error reporting +from .util import InvalidShapeError # not import testing by default # because testing can have extra deps that are not necessary # we can import them from test cases explicitly From 48ad24bf7ad0bc916f043ec6a40cb3f8e2db4d82 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:01:27 -0700 Subject: [PATCH 026/126] reset the xlnk driver before every FPGA program --- vta/python/vta/exec/rpc_server.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py index 8caa48a56104..0ac97a2ab07e 100644 --- a/vta/python/vta/exec/rpc_server.py +++ b/vta/python/vta/exec/rpc_server.py @@ -66,6 +66,9 @@ def ext_dev_callback(): @tvm.register_func("tvm.contrib.vta.init", override=True) def program_fpga(file_name): + from pynq import xlnk + # Reset xilinx driver + xlnk.Xlnk().xlnk_reset() path = tvm.get_global_func("tvm.rpc.server.workpath")(file_name) env = get_env() program_bitstream.bitstream_program(env.TARGET, path) From c6935227c35b1001a586d8e029a8724f5c83e4a9 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:17:51 -0700 Subject: [PATCH 027/126] key flag used when building VTA target --- src/codegen/build_module.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 0a488f38457b..04a2fd6d4db9 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -58,6 +58,7 @@ Target CreateTarget(const std::string& target_name, std::string libs_flag = "-libs="; std::string device_flag = "-device="; + std::string keys_flag = "-keys="; for (auto& item : options) { t->options_array.push_back(ir::StringImm::make(item)); @@ -69,12 +70,19 @@ Target CreateTarget(const std::string& target_name, } } else if (item.find(device_flag) == 0) { t->device_name = item.substr(device_flag.length()); + t->keys_array.push_back(ir::StringImm::make(t->device_name)); + } else if (item.find(keys_flag) == 0) { + std::stringstream ss(item.substr(keys_flag.length())); + std::string key_item; + while (std::getline(ss, key_item, ',')) { + t->keys_array.push_back(ir::StringImm::make(key_item)); + } } } - if (t->device_name.length() > 0) { - t->keys_array.push_back(ir::StringImm::make(t->device_name)); - } + // if (t->device_name.length() > 0) { + // t->keys_array.push_back(ir::StringImm::make(t->device_name)); + // } t->device_type = kDLCPU; t->thread_warp_size = 1; if (target_name == "c" || target_name == "llvm") { From 04d1788c5958768af40276c98a251e25734f8607 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:22:07 -0700 Subject: [PATCH 028/126] initial conv2d autotuning support --- python/tvm/autotvm/measure/measure_methods.py | 22 +++++- vta/python/vta/__init__.py | 2 +- vta/python/vta/build_module.py | 59 +++++++++++++++ vta/scripts/tune_conv2d.py | 74 +++++++++++++++++++ 4 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 vta/scripts/tune_conv2d.py diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 31d688483294..dcdd46728e3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -87,7 +87,9 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) + # FIXME: right now we're circumventing the wrap_build_func + # 
self.build_func = _wrap_build_func(build_func)
+        self.build_func = build_func
         self.executor = LocalExecutor(timeout=timeout)
         self.tmp_dir = tempfile.mkdtemp()

@@ -223,7 +225,18 @@ def set_task(self, task):
                      for x in arg_bufs]
             func = build(s, arg_bufs, "llvm")
             tvm_buf = [nd.array(x) for x in self.ref_input]
-            func(*tvm_buf)
+
+            def _run_func():
+                """Run tvm function in a thread.
+                Because there are some issues with python multiprocessing and the thread pool in tvm
+                """
+                func(*tvm_buf)
+
+            thread = threading.Thread(target=_run_func)
+            thread.start()
+            thread.join()
+            del thread
+
             self.ref_output = [x.asnumpy() for x in tvm_buf]
 
     def get_build_kwargs(self):
@@ -452,6 +465,11 @@ def run_through_rpc(measure_input, build_result,
     try:
         # upload built module
         remote = request_remote(*remote_args)
+        # Program the FPGA every single time when targeting VTA
+        if measure_input.target.device_name == 'vta':
+            from vta import program_fpga, reconfig_runtime
+            program_fpga(remote, None)
+            reconfig_runtime(remote)
         remote.upload(build_result.filename)
         func = remote.load_module(os.path.split(build_result.filename)[1])
         ctx = remote.context(str(measure_input.target), 0)
diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py
index 926d73649b31..75ecdbad4bc7 100644
--- a/vta/python/vta/__init__.py
+++ b/vta/python/vta/__init__.py
@@ -18,5 +18,5 @@
 # to maintain minimum dependency on the board
 if sys.argv[0] not in ("-c", "-m"):
     from . import top
-    from .build_module import build_config, lower, build
+    from .build_module import build_config, lower, build, vta_autotvm_build_func
     from . import graph
diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py
index 471dc90746de..f723c99dbe45 100644
--- a/vta/python/vta/build_module.py
+++ b/vta/python/vta/build_module.py
@@ -18,8 +18,10 @@
 from __future__ import absolute_import as _abs
 
 import tvm
+from tvm import rpc
 from . import ir_pass
 from .environment import get_env
+from .testing import simulator
 
 
 def lift_coproc_scope(x):
@@ -115,3 +117,60 @@ def build(*args, **kwargs):
         with build_config():
             return tvm.build(*args, **kwargs)
     return tvm.build(*args, **kwargs)
+
+
+def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs):
+    """Custom build func for VTA. Used for autotvm"""
+
+    import time
+    import os
+    from random import getrandbits
+    from tvm.autotvm.util import get_const_tuple
+    from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError
+
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64))
+        target, task, config = measure_input
+
+        with target:
+            s, args = task.instantiate(config)
+            if not config.valid():
+                raise InstantiationError(config.errors)
+
+            func = build(s, args, target_host=task.target_host)
+            sim = build(s, args)
+
+        arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args)
+        func.export_library(filename)
+
+        # When targeting VTA test the schedule on simulator first
+        # in order to catch runtime errors
+        if measure_input.target.device_name == 'vta':
+            from vta import reconfig_runtime
+            # Note: if you're not running the RPC locally, you cannot benefit
+            # from runtime recompilation... 
+ local_rpc_port = int(os.environ.get("VTA_LOCAL_SIM_RPC_PORT", "0")) + if local_rpc_port: + remote = rpc.connect("localhost", local_rpc_port) + reconfig_runtime(remote) + else: + remote = rpc.LocalSession() + sim_path = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) + sim.export_library(sim_path) + remote.upload(sim_path) + f = remote.load_module(os.path.split(sim_path)[1]) + ctx = remote.context(str(measure_input.target), 0) + args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] + simulator.clear_stats() + simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) + f(*args) + + # check by local simulator + ctx = tvm.context(str(target)) + args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] + sim(*args) + + except Exception as e: # pylint: disable=broad-except + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py new file mode 100644 index 000000000000..432008661c58 --- /dev/null +++ b/vta/scripts/tune_conv2d.py @@ -0,0 +1,74 @@ +"""Tuning a single conv2d operator""" +import logging +import os + +import tvm +from tvm import autotvm +from tvm.contrib.util import get_lower_ir +import topi +import vta +import vta.testing + +env = vta.get_env() + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype): + data_shape = (N//env.BATCH, CI//env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) + kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) + bias_shape = (N//env.BATCH, CO//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with tvm.target.vta(): + res = topi.nn.conv2d(data, kernel, padding=padding, strides=strides, dilation=dilation, + layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN), out_dtype='int32') + res = topi.add(res, bias) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [data, kernel, bias, res] + +if __name__ == '__main__': + N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype = \ + 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), (1, 1), 'int8', 'int32' + + task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + # Logging config (for printing tuning log to the screen) + logging.basicConfig() + logging.getLogger('autotvm').setLevel(logging.DEBUG) + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + + measure_option = autotvm.measure_option( + 
builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) + + tuner = autotvm.tuner.RandomTuner(task) + n_trial = len(task.config_space) + tuner.tune(n_trial=n_trial, + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + + print(tuner.best_config) From 29ebd8080cca85fbd1710bdfb4cdaa2cbed2d509 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 14:26:33 -0700 Subject: [PATCH 029/126] edits to tune_conv2d.py --- vta/scripts/tune_conv2d.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 432008661c58..2cd8c6ea5a2f 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -59,6 +59,9 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt # Get tracker info from env tracket_host = os.environ.get("TVM_TRACKER_HOST", None) tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), @@ -67,8 +70,9 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt tuner = autotvm.tuner.RandomTuner(task) n_trial = len(task.config_space) - tuner.tune(n_trial=n_trial, + tuner.tune(n_trial=30, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + print("\nBest tuner config:") print(tuner.best_config) From 7e633cd320b91ba1757a3c4dd1bae285b91607a1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 16:19:10 -0700 Subject: [PATCH 030/126] exhaustive search --- vta/scripts/tune_conv2d.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 2cd8c6ea5a2f..e896e917b921 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -69,8 +69,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt check_correctness=True)) tuner = autotvm.tuner.RandomTuner(task) - n_trial = len(task.config_space) - tuner.tune(n_trial=30, + tuner.tune(n_trial=len(task.config_space), measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('conv2d.log')]) From 67b49c787fa5480fa11df7d7e204f03458abcc7c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 17:04:14 -0700 Subject: [PATCH 031/126] logging simulator stats in autoTVM --- python/tvm/autotvm/measure/measure.py | 4 +++- python/tvm/autotvm/measure/measure_methods.py | 20 ++++++++++--------- python/tvm/autotvm/record.py | 4 +++- vta/python/vta/build_module.py | 7 +++++-- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 0836fb741bd2..c4dec35d593f 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -34,7 +34,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])): """ -class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])): +class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp", "sim_stats"])): """ Stores all the results of a measurement @@ -49,6 
+49,8 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost" All cost of this measure, including rpc, compilation, test runs timestamp: float The absolute time stamp when we finish measurement. + sim_stats: Dictionary + Dictionary of VTA simulator statistics (only used when target is VTA) """ diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index dcdd46728e3e..1b32eaced711 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -46,7 +46,7 @@ logger = logging.getLogger('autotvm') -class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): +class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost', 'sim_stats'))): """ Stores all the necessary inputs for a measurement. @@ -60,6 +60,8 @@ class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 't The error happens during compilation. time_cost : float The time cost of building + sim_stats : Dictionary + Dictionary of VTA simulator statistics (only used when target is VTA) """ class LocalBuilder(Builder): @@ -114,13 +116,13 @@ def build(self, measure_inputs): if isinstance(res, Exception): # timeout or fleet error, return MeasureResult directly results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT, - self.timeout, time.time())) + self.timeout, time.time(), {})) elif res.error is not None: # instantiation error if isinstance(res.error, InstantiationError): results.append(MeasureResult((res.error,), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: if "InstantiationError" in str(res.error): msg = str(res.error) @@ -130,11 +132,11 @@ def build(self, measure_inputs): pass results.append(MeasureResult((InstantiationError(msg),), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: # tvm error results.append(MeasureResult((res.error,), MeasureErrorNo.COMPILE_HOST, - res.time_cost, time.time())) + res.time_cost, time.time(), {})) else: # return BuildResult results.append(res) @@ -282,7 +284,7 @@ def run(self, measure_inputs, build_results): res = future.get() if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, - self.timeout, time.time())) + self.timeout, time.time(), {})) else: results.append(res) @@ -416,8 +418,8 @@ def _wrapped(measure_input, tmp_dir, **kwargs): func, arg_info = _build_func_common(measure_input, **kwargs) func.export_library(filename, build_func) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) + return BuildResult(None, None, e, time.time() - tic, {}) + return BuildResult(filename, arg_info, None, time.time() - tic, {}) return _wrapped @@ -514,7 +516,7 @@ def run_through_rpc(measure_input, build_result, errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) - return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) + return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp, build_result.sim_stats) def request_remote(device_key, host=None, port=None, priority=1, timeout=60): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 14efb7bd9239..9d3747352f79 100644 --- 
a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -98,7 +98,9 @@ def encode(inp, result, protocol='json'): result.all_cost, result.timestamp), - "v": AUTOTVM_LOG_VERSION + "v": AUTOTVM_LOG_VERSION, + + "s": result.sim_stats } return json.dumps(json_dict) if protocol == 'pickle': diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index f723c99dbe45..7ab0834f0368 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -129,6 +129,8 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError tic = time.time() + # simulator stats + stats = {} try: filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) target, task, config = measure_input @@ -165,6 +167,7 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): simulator.clear_stats() simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) f(*args) + stats = simulator.stats() # check by local simulator ctx = tvm.context(str(target)) @@ -172,5 +175,5 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): sim(*args) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) + return BuildResult(None, None, e, time.time() - tic, stats) + return BuildResult(filename, arg_info, None, time.time() - tic, stats) From 378b3a51404515565522510a422d3e98e6331481 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 21 May 2019 23:09:08 -0700 Subject: [PATCH 032/126] tuning over all resnet layers --- vta/scripts/tune_conv2d.py | 67 +++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index e896e917b921..0113060a77da 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -1,4 +1,5 @@ """Tuning a single conv2d operator""" +from collections import namedtuple import logging import os @@ -11,6 +12,26 @@ env = vta.get_env() +Workload = namedtuple("Conv2DWorkload", + ['batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +resnet_wkls = [ + # Workloads of resnet18 on imagenet + # ('resnet-18.C1', Workload(1, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), + ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), + # ('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), # this layer does not appear in ResNet + ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C7', Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C8', Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C9', Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C10', Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C11', Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C12', Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), +] + @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" @@ -45,12 +66,6 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt return s, [data, kernel, bias, res] if __name__ == '__main__': - N, CI, H, W, CO, KH, KW, strides, padding, dilation, 
in_dtype, out_dtype = \ - 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), (1, 1), 'int8', 'int32' - - task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), - target=tvm.target.vta(), target_host=env.target_host, template_key='direct') - print(task.config_space) # Logging config (for printing tuning log to the screen) logging.basicConfig() @@ -63,15 +78,35 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dt print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, - check_correctness=True)) + for wl_name, wl in resnet_wkls: + + # Workload parameters + N = wl.batch + CI = wl.in_filter + H = wl.height + W = wl.width + CO = wl.out_filter + KH = wl.hkernel + KW = wl.wkernel + strides = (wl.hstride, wl.wstride) + padding = (wl.hpad, wl.wpad) + dilation = (1, 1) + in_dtype = 'int8' + out_dtype = 'int32' + + task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, in_dtype, out_dtype), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + measure_option = autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune(n_trial=len(task.config_space), - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + tuner = autotvm.tuner.RandomTuner(task) + tuner.tune(n_trial=len(task.config_space), + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) - print("\nBest tuner config:") - print(tuner.best_config) + print("\nBest tuner config:") + print(tuner.best_config) From e3656ca7d4f3057d92a47d2f474a29c2d7db6727 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 23 May 2019 18:35:02 -0700 Subject: [PATCH 033/126] removing sim stats from log for now due to tophub issues --- python/tvm/autotvm/measure/measure.py | 4 +--- python/tvm/autotvm/measure/measure_methods.py | 20 +++++++++---------- python/tvm/autotvm/record.py | 4 +--- vta/python/vta/build_module.py | 4 ++-- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index c4dec35d593f..0836fb741bd2 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -34,7 +34,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])): """ -class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp", "sim_stats"])): +class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])): """ Stores all the results of a measurement @@ -49,8 +49,6 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost" All cost of this measure, including rpc, compilation, test runs timestamp: float The absolute time stamp when we finish measurement. 
- sim_stats: Dictionary - Dictionary of VTA simulator statistics (only used when target is VTA) """ diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 1b32eaced711..dcdd46728e3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -46,7 +46,7 @@ logger = logging.getLogger('autotvm') -class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost', 'sim_stats'))): +class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): """ Stores all the necessary inputs for a measurement. @@ -60,8 +60,6 @@ class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 't The error happens during compilation. time_cost : float The time cost of building - sim_stats : Dictionary - Dictionary of VTA simulator statistics (only used when target is VTA) """ class LocalBuilder(Builder): @@ -116,13 +114,13 @@ def build(self, measure_inputs): if isinstance(res, Exception): # timeout or fleet error, return MeasureResult directly results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT, - self.timeout, time.time(), {})) + self.timeout, time.time())) elif res.error is not None: # instantiation error if isinstance(res.error, InstantiationError): results.append(MeasureResult((res.error,), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: if "InstantiationError" in str(res.error): msg = str(res.error) @@ -132,11 +130,11 @@ def build(self, measure_inputs): pass results.append(MeasureResult((InstantiationError(msg),), MeasureErrorNo.INSTANTIATION_ERROR, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: # tvm error results.append(MeasureResult((res.error,), MeasureErrorNo.COMPILE_HOST, - res.time_cost, time.time(), {})) + res.time_cost, time.time())) else: # return BuildResult results.append(res) @@ -284,7 +282,7 @@ def run(self, measure_inputs, build_results): res = future.get() if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, - self.timeout, time.time(), {})) + self.timeout, time.time())) else: results.append(res) @@ -418,8 +416,8 @@ def _wrapped(measure_input, tmp_dir, **kwargs): func, arg_info = _build_func_common(measure_input, **kwargs) func.export_library(filename, build_func) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic, {}) - return BuildResult(filename, arg_info, None, time.time() - tic, {}) + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) return _wrapped @@ -516,7 +514,7 @@ def run_through_rpc(measure_input, build_result, errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) - return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp, build_result.sim_stats) + return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) def request_remote(device_key, host=None, port=None, priority=1, timeout=60): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 9d3747352f79..14efb7bd9239 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -98,9 +98,7 @@ def encode(inp, result, protocol='json'): result.all_cost, result.timestamp), - "v": AUTOTVM_LOG_VERSION, - - "s": result.sim_stats + "v": 
AUTOTVM_LOG_VERSION } return json.dumps(json_dict) if protocol == 'pickle': diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 7ab0834f0368..91e3c4a7e0d8 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -175,5 +175,5 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): sim(*args) except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic, stats) - return BuildResult(filename, arg_info, None, time.time() - tic, stats) + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) From 8186632733569e1990622f83031e317e931736b8 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 23 May 2019 18:54:02 -0700 Subject: [PATCH 034/126] autoTVM task extraction for VTA (nnvm for now) --- nnvm/python/nnvm/top/nn.py | 10 +- python/tvm/autotvm/task/nnvm_integration.py | 71 +++--- python/tvm/autotvm/task/topi_integration.py | 70 ++++-- vta/scripts/tune_resnet.py | 231 ++++++++++++++++++++ 4 files changed, 322 insertions(+), 60 deletions(-) create mode 100644 vta/scripts/tune_resnet.py diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 13964f4e25f6..128f985bd6d2 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -114,25 +114,25 @@ def compute_conv2d(attrs, inputs, _): if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8': # pylint: disable=assignment-from-no-return out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype=out_dtype) + dilation, layout, out_dtype) # pylint: enable=assignment-from-no-return elif groups == 1: out = topi.nn.conv2d( - inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) elif layout == "NCHW" and \ groups == get_const_int(inputs[0].shape[1]) and \ groups == channels: out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) elif layout in ["NCHW", "NCHW4c"]: out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype=out_dtype) + out_dtype) elif layout == "NHWC" and \ kernel_layout == "HWOI" and \ groups == get_const_int(inputs[0].shape[3]) and \ groups == channels: out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) else: raise ValueError("not support arbitrary group number for now") diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index dbcee0e516e1..e4d2b3fb8023 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -27,15 +27,16 @@ from .task import create from .topi_integration import TaskExtractEnv +from .dispatcher import ApplyHistoryBest logger = logging.getLogger('autotvm') -def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): +def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host=None): """ Extract tuning tasks from a nnvm graph. This function collects tuning tasks by building the graph - with a "tracing" target and tracing all the calls to topi. + and trace all the calls to topi. 
Parameters ---------- @@ -49,6 +50,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): The compilation target symbols : Array of nnvm.symbol Array of nnvm symbols want to be tuned + params : dict of str to NDArray + The parameter dictionary. target_host: tvm.target.Target The host compilation target @@ -78,32 +81,35 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - - # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype) + # run compiler to collect all TOPI calls during compilation + nnvm.compiler.engine.clear_cache() + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, + target_host=target_host, params=params) - logger.disabled = old_state + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid Shape during AutoTVM Task Creation") return tasks -def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_host=None): +def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, target_host=None): """ Extract tuning tasks from multiple nnvm graphs. This function is the multiple graph version of extract_from_graph @@ -120,6 +126,8 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_ The compilation target symbols : Array of nnvm.symbol Array of nnvm symbols want to be tuned + params : dict of str to NDArray + The parameter dictionary. 
target_host: tvm.target.Target The host compilation target @@ -149,28 +157,29 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_ topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - - # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") + nnvm.compiler.engine.clear_cache() + for graph, shape, dtype in zip(graphs, shapes, dtypes): + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) - nnvm.compiler.engine.clear_cache() - for graph, shape, dtype in zip(graphs, shapes, dtypes): - nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype) - - logger.disabled = old_state + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid Shape during AutoTVM Task Creation") return tasks + diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index c48d4f58edce..ed85504e4c0a 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -27,6 +27,9 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ +import warnings +import sys + from ... 
import _api_internal, tensor, placeholder, create_schedule from .task import args_to_workload, dispatcher, register @@ -73,6 +76,7 @@ def deserialize_args(args): class TaskExtractEnv: """Global environment for extracting tuning tasks from nnvm graph""" current = None + registered = None def __init__(self, allow_duplicate=False): import topi @@ -106,47 +110,65 @@ def __init__(self, allow_duplicate=False): topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } + # support reflection for tracing + self.func_to_reflection = { + topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), + topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), + topi.nn.depthwise_conv2d_nchw: lambda x: setattr(topi.nn, 'depthwise_conv2d_nchw', x), + topi.nn.group_conv2d_nchw: lambda x: setattr(topi.nn, 'group_conv2d_nchw', x), + topi.nn.conv2d_transpose_nchw: lambda x: setattr(topi.nn, 'conv2d_transpose_nchw', x), + topi.nn.dense: lambda x: setattr(topi.nn, 'dense', x), + topi.nn.bitserial_conv2d_nchw: lambda x: setattr(topi.nn, 'bitserial_conv2d_nchw', x), + topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x), + topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x), + topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x), + } + self.allow_duplicate = allow_duplicate - self._register_tracing() self._register_topi_task() self.task_collection = [] self.wanted_topi_funcs = list(self.topi_to_task.keys()) + self.modified_funcs = [] + + def __enter__(self): + self.task_collection = [] + self.modified_funcs = [] - def _register_tracing(self): - """Register tracing function to track the topi function call""" - # register topi compute for "tracing" target - for topi_compute in self.topi_to_task: + for topi_compute in self.wanted_topi_funcs: def _local_scope(compute_func): """start a scope to hold the local function in for loop""" - @compute_func.register("tracing", ) - def _tracing_topi_compute(*args, **kwargs): - assert not kwargs, "Do not support extracting tuning tasks when" \ - "kwargs is used in TOPI function call." \ + def _tracing_wrapper(*args, **kwargs): + assert not kwargs, "Do not support extracting tuning tasks when " \ + "kwargs is used in TOPI function call. " \ "Please modify it to use only positional args." 
- if compute_func in self.wanted_topi_funcs: # record this call - key = (self.topi_to_task[compute_func], serialize_args(args)) - if self.allow_duplicate or key not in self.task_collection: - self.task_collection.append(key) - return compute_func.fdefault(*args) + key = (self.topi_to_task[compute_func], serialize_args(args)) + if self.allow_duplicate or key not in self.task_collection: + self.task_collection.append(key) + + return compute_func(*args, **kwargs) + + self.func_to_reflection[topi_compute](_tracing_wrapper) + self.modified_funcs.append(topi_compute) + _local_scope(topi_compute) - # register topi schedule for "tracing" target - for topi_compute in self.topi_to_task: - for topi_schedule in self.topi_to_schedule[topi_compute]: - def _local_scope_(schedule_func): - """start a scope to hold the local function in for loop""" + return self - @schedule_func.register("tracing", ) - def _tracing_topi_compute(outs): - outs = [outs] if isinstance(outs, tensor.Tensor) else outs - return create_schedule([x.op for x in outs]) - _local_scope_(topi_schedule) + def __exit__(self, exc_type, exc_val, exc_tb): + # revert modification + for func in self.modified_funcs: + self.func_to_reflection[func](func) def _register_topi_task(self): """register tuning wrapper for topi function""" import topi + # Avoid double registration for certain targets + if TaskExtractEnv.registered: + return + TaskExtractEnv.registered = True + # Tuning wrapper for topi functions @register("topi_nn_conv2d") def _topi_nn_conv2d(*args, **kwargs): diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py new file mode 100644 index 000000000000..b22a63e09df8 --- /dev/null +++ b/vta/scripts/tune_resnet.py @@ -0,0 +1,231 @@ +import argparse +import os +import time +import numpy as np + +import tvm +from tvm import rpc, autotvm +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util +from tvm.contrib.download import download + +import topi +import nnvm.compiler +import vta +import vta.testing + +env = vta.get_env() + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + + +def generate_graph(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: 
str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=target_host) + + return graph, lib, params + + +def extract_tasks(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + with vta.build_config(): + tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, + params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) + return tasks + + +def download_model(): + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = 'synset.txt' + graph_fn = 'resnet18_qt8.json' + params_fn = 'resnet18_qt8.params' + data_dir = '_data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + for file in [categ_fn, graph_fn, params_fn]: + if not os.path.isfile(file): + download(os.path.join(url, file), os.path.join(data_dir, file)) + + sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) + params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) + + return sym, params + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True, + try_winograd=True): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + n_trial_ = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial_, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial_, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +if __name__ == '__main__': + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + tuning_opt = { + 'log_filename': 'resnet-18.log', + + 'tuner': 
'random', + 'n_trial': 1e9, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + number=4, repeat=3, timeout=60, + check_correctness=True)) + } + + # download model + sym, params = download_model() + + # register VTA tuning tasks + register_vta_tuning_tasks() + + # extract tasks + print("Extract tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): + print("Compile...") + graph, lib, params = generate_graph(sym, params, target, target_host) + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + From 51773ecd45853ca7cad807e4fbb68f3b7f6ca33c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 29 May 2019 18:55:36 -0500 Subject: [PATCH 035/126] merge fix --- vta/python/vta/top/vta_conv2d.py | 57 -------------------------------- 1 file changed, 57 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 1672af47ca0c..eef047965a56 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -193,60 +193,3 @@ def _traverse(op): s[conv2d_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) return s -<<<<<<< HEAD - -class Conv2DSchedule(object): - """ 2D convolution schedule object. 
- """ - def __init__(self, - b_factor=1, - oc_factor=1, - ic_factor=1, - h_factor=1, - w_factor=0, - oc_nthread=0, - h_nthread=0, - debug_sync=False): - self.b_factor = b_factor - self.oc_factor = oc_factor - self.ic_factor = ic_factor - self.h_factor = h_factor - self.w_factor = w_factor - self.oc_nthread = oc_nthread - self.h_nthread = h_nthread - self.debug_sync = debug_sync - - def __str__(self): - return "{}.{}.{}.{}.{}.{}.{}".format( - self.b_factor, self.oc_factor, self.ic_factor, - self.h_factor, self.w_factor, - self.oc_nthread, self.h_nthread) - -Schedule = Conv2DSchedule - -# Layer description of the ResNet18 -RESNET = { - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), -} - -for idx in RESNET: - f_schedules = find_schedules(RESNET[idx], vt_only=True, best_only=True) - if f_schedules: - scheds = f_schedules[0] - _WL2PLAN[RESNET[idx]] = scheds - else: - logging.warning("No valid schedule was found for the workload on current vta configuration") - break -======= ->>>>>>> autotvm support for conv2d operator From 3ad29a4bdf3e73e74edf97fa38e4c40f3566a4b6 Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Wed, 29 May 2019 14:39:49 -0700 Subject: [PATCH 036/126] Insert stop_fusion for vta. 
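
The new VtaStoreInjector pass walks the graph and wraps the output of a
conv2d -> add -> relu group (and the conv2d -> add branch feeding a second,
residual add) in stop_fusion, so the fused VTA kernel's narrowed result is
stored back to memory instead of being fused into the next operator group.
The snippet below is only a rough Relay-level illustration of the pattern
this mutator targets; the shapes are made up, and it assumes
relay.annotation.stop_fusion is how the stop_fusion op is exposed in this
build (otherwise read it as pseudocode for the annotation).

    import tvm
    from tvm import relay

    data = relay.var("data", shape=(1, 16, 14, 14), dtype="int8")
    weight = relay.var("weight", shape=(16, 16, 3, 3), dtype="int8")
    bias = relay.var("bias", shape=(1, 16, 1, 1), dtype="int32")

    # The conv2d -> add -> relu group matched by VtaStoreInjector.
    conv = relay.nn.conv2d(data, weight, padding=(1, 1), out_dtype="int32")
    body = relay.nn.relu(relay.add(conv, bias))

    # After the rewrite the group boundary carries a stop_fusion annotation,
    # forcing a store before whatever consumes the result.
    annotated = relay.annotation.stop_fusion(body)
    print(relay.Function([data, weight, bias], annotated))
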
---
 src/relay/pass/quantize.cc | 62 +++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc
index 07233a83ca23..f55b26881f8b 100644
--- a/src/relay/pass/quantize.cc
+++ b/src/relay/pass/quantize.cc
@@ -504,10 +504,10 @@ RELAY_REGISTER_OP("nn.relu")
 RELAY_REGISTER_OP("strided_slice")
 .set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
-
-Expr MaxPoolRealize(const Call& ref_call,
-                    const Array<Expr>& new_args,
-                    const NodeRef& ctx) {
+/* \brief for unary operators which requantize its input to dtype_nbit */
+Expr CastDtypeInputRealize(const Call& ref_call,
+                           const Array<Expr>& new_args,
+                           const NodeRef& ctx) {
   const QConfig& cfg = QConfig::Current();
   CHECK_EQ(new_args.size(), 1);
   if (const auto* n = new_args[0].as<QRealizeIntExprNode>()) {
@@ -520,7 +520,10 @@ Expr MaxPoolRealize(const Call& ref_call,
 }
 
 RELAY_REGISTER_OP("nn.max_pool2d")
-.set_attr<FForwardRewrite>("FQRealizeRewrite", MaxPoolRealize);
+.set_attr<FForwardRewrite>("FQRealizeRewrite", CastDtypeInputRealize);
+
+RELAY_REGISTER_OP("stop_fusion")
+.set_attr<FForwardRewrite>("FQRealizeRewrite", CastDtypeInputRealize);
 
 
 Expr AvgPoolRealize(const Call& ref_call,
@@ -646,6 +649,55 @@ Pass QuantizeRealizePass() {
 TVM_REGISTER_API("relay._quantize.QuantizeRealize")
 .set_body_typed(QuantizeRealizePass);
 
+class VtaStoreInjector : public ExprMutator {
+ private:
+  const CallNode* GetPreviousNode(const CallNode* n) {
+    if (n == nullptr || n->args.size() == 0) {
+      return nullptr;
+    }
+    return n->args[0].as<CallNode>();
+  }
+
+ public:
+  Expr VisitExpr_(const CallNode* n) final {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    static const Op& add = Op::Get("add");
+    static const Op& relu = Op::Get("nn.relu");
+    auto new_e = ExprMutator::VisitExpr_(n);
+    const CallNode* n0 = new_e.as<CallNode>();
+    // conv->add->relu->[here]
+    if (n0 && n0->op.same_as(relu)) {
+      const CallNode* n1 = n0->args[0].as<CallNode>();
+      if (n1 && n1->op.same_as(add)) {
+        const CallNode* n2 = n1->args[0].as<CallNode>();
+        if (n2 && n2->op.same_as(conv2d)) {
+          return StopFusion(new_e);
+        }
+      }
+    }
+    // conv->add->[here]->add
+    if (n0 && n0->op.same_as(add)) {
+      const CallNode* n1 = n0->args[1].as<CallNode>();
+      if (n1 && n1->op.same_as(add)) {
+        const CallNode* n2 = n1->args[0].as<CallNode>();
+        if (n2 && n2->op.same_as(conv2d)) {
+          Expr child = StopFusion(n0->args[1]);
+          return CallNode::make(add, {n0->args[0], child}, Attrs{}, {});
+        }
+      }
+    }
+    return new_e;
+  }
+};
+
+Expr VtaStoreHint(const Expr& e) {
+  return VtaStoreInjector().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._quantize.vta_store_hint")
+.set_body_typed(VtaStoreHint);
+
+
 }  // namespace quantize
 }  // namespace relay
 }  // namespace tvm

From df679584822f5db56a365723a034e688ab7c2e9e Mon Sep 17 00:00:00 2001
From: ZihengJiang
Date: Fri, 31 May 2019 15:44:35 -0700
Subject: [PATCH 037/126] Update.
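
This change adds a VTA-specific rewrite stage in front of the usual
quantization flow: a FQVtaRewrite pass (rewrite_for_vta) marks
conv2d/relu/max_pool2d/add groups with QVtaExpr, whose realization emits
force_cast + stop_fusion at the group boundary, and quantize_vta() in
python/tvm/relay/quantize/quantize.py strings the stages together
(optimize -> rewrite_for_vta -> annotate -> calibrate -> realize ->
fold_constant). A rough usage sketch, assuming quantize_vta is exported
from tvm.relay.quantize the same way quantize() is, and reusing the
ResNet-18 front end the scripts in this series use:

    from mxnet.gluon.model_zoo import vision
    from tvm import relay

    shape_dict = {"data": (1, 3, 224, 224)}
    gluon_model = vision.get_model("resnet18_v1", pretrained=True)
    func, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

    # Same qconfig values the relay_to_vta.py script in this series uses.
    with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1):
        qfunc = relay.quantize.quantize_vta(func, params=params)
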
--- python/tvm/relay/quantize/_annotate.py | 120 +++++++++++++++++-- python/tvm/relay/quantize/quantize.py | 42 +++++++ src/relay/op/annotation/annotation.cc | 26 ++++- src/relay/pass/pattern_util.h | 6 +- src/relay/pass/quantize.cc | 154 ++++++++++++++++--------- src/relay/pass/quantize.h | 24 ++++ 6 files changed, 306 insertions(+), 66 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 61e895ac7efb..799b553a702c 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -171,14 +171,16 @@ def conv2d_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT: + # print('conv2d lhs kind: {0}'.format(lhs_kind)) + # print('conv2d lhs: \n{0}'.format(lhs_expr)) + # print('\n\n\n') + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) assert rhs_kind is None rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -241,25 +243,43 @@ def add_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + # print('add lhs kind: {0}'.format(lhs_kind)) + # print('add rhs kind: {0}'.format(rhs_kind)) if lhs_kind is None and rhs_kind is None: return None + if lhs_kind is None and rhs_kind is not None: # quantize lhs to INPUT field if it is normal expression + assert rhs_kind == QAnnotateKind.INPUT lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + # print('execute add with INPUT') + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + if lhs_kind is not None and rhs_kind is None: if isinstance(rhs_expr, _expr.Constant): # quantize rhs to WEIGHT field if it is Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + assert lhs_kind == QAnnotateKind.ACTIVATION + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) else: # quantize rhs to INPUT field if it is not Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: - # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION - rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) + raise ValueError + + if lhs_kind is not None and rhs_kind is not None: + if lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.INPUT: + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + # print('execute add with INPUT') + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: + # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @register_annotate_function("stop_fusion") @@ -294,6 +314,7 @@ def identity_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", 
identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) +register_annotate_function("stop_fusion", identity_rewrite) def pool2d_rewrite(ref_call, new_args, ctx): @@ -313,6 +334,20 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) +@register_annotate_function("force_cast") +def force_cast_rewrite(ref_call, new_args, ctx): + if _conv_counter() <= current_qconfig().skip_k_conv: + return None + expr, x_kind = _get_expr_kind(new_args[0]) + + if x_kind is None: + return new_args[0] + if x_kind == QAnnotateKind.ACTIVATION: + expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT) + + expr = _forward_op(ref_call, [expr]) + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + @register_annotate_function("concatenate") def concatenate_rewrite(ref_call, new_args, ctx): @@ -333,3 +368,74 @@ def concatenate_rewrite(ref_call, new_args, ctx): expr_list[i] = attach_simulated_quantize(expr_list[i], QAnnotateKind.ACTIVATION) expr = _forward_op(ref_call, [_expr.Tuple(expr_list)]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + +# register for vta stop fusion +def register_vta_rewrite(op_name, frewrite=None, level=10): + def _register(func): + return _op.op._Register(op_name, "FQVtaRewrite", func, level) + return _register(frewrite) if frewrite is not None else _register + +@register_relay_node +class QVtaExpr(_expr.TempExpr): + def __init__(self, expr): + self.__init_handle_by_constructor__( + _quantize.make_vta_expr, expr) + + def realize(self): + return _quantize.temp_expr_realize(self) + + +def vta_expr_check(expr): + if isinstance(expr, QVtaExpr): + return True, expr.expr + return False, expr + +# def _stop_fusion(expr): +# return _quantize.make_stop_fusion(expr) + +@register_vta_rewrite("nn.conv2d") +def conv2d_vta_rewrite(ref_call, new_args, ctx): + cnt = _conv_counter() + if cnt < current_qconfig().skip_k_conv: + _set_conv_counter(cnt + 1) + return None + _set_conv_counter(cnt + 1) + + + data_cond, data = vta_expr_check(new_args[0]) + kernel_cond, kernel = vta_expr_check(new_args[1]) + + assert not kernel_cond + if data_cond: + data = new_args[0].realize() + ret = _forward_op(ref_call, [data, kernel]) + return QVtaExpr(ret) + +def identity_vta_rewrite(ref_call, new_args, ctx): + cond, expr = vta_expr_check(new_args[0]) + if cond: + return QVtaExpr(_forward_op(ref_call, [expr])) + else: + return None + +register_vta_rewrite("nn.relu", identity_vta_rewrite) +register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) + + +# @register_vta_rewrite("nn.max_pool2d") +# def pool_vta_rewrite(ref_call, new_args, ctx): +# pass + +@register_vta_rewrite("add") +def add_vta_rewrite(ref_call, new_args, ctx): + lhs_cond, lhs = vta_expr_check(new_args[0]) + rhs_cond, rhs = vta_expr_check(new_args[1]) + if lhs_cond and rhs_cond: + lhs = new_args[0].realize() + rhs = new_args[1].realize() + return _forward_op(ref_call, [lhs, rhs]) + elif lhs_cond and not rhs_cond: + return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) + else: + return None diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index a7749d4892fb..da881db26fb2 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -349,3 +349,45 @@ def quantize(graph, params=None, dataset=None): mod = optimize(mod) mod = quantize_seq(mod) return mod[mod.entry_func.name_hint] + +def quantize_vta(graph, params=None, dataset=None): + + """ The quantization procedure for VTA specifically. 
+ + Parameters + --------- + graph: Function + The original graph. + + params : dict of str to NDArray + Input parameters to the graph that do not change + during inference time. Used for constant folding. + + dataset: list of dict of Var -> NDArray + The calibration dataset. + + Returns + ------- + ret: Function + The graph after quantization + """ + + # TODO(zhiics) Move this to the pass manager. + graph = optimize(graph, params) + + print('original graph') + print(graph) + graph = _quantize.rewrite_for_vta(graph) + print('after rewrite for vta') + print(graph) + + graph = annotate(graph) + graph = calibrate(graph, dataset) + print('after calibrate') + print(graph) + graph = realize(graph) + graph = _ir_pass.fold_constant(graph) + + print('after realize') + print(graph) + return graph diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index f09a3a22e3ab..789c85e39074 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -83,6 +83,28 @@ TVM_ADD_FILELINE) return {topi::identity(inputs[0])}; }); +Expr ForceCast(Expr data) { + static const Op& op = Op::Get("force_cast"); + return CallNode::make(op, {data}, Attrs{}, {}); +} + +RELAY_REGISTER_OP("force_cast") +.describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" +TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input data.") +.add_type_rel("Identity", IdentityRel) +.set_support_level(10) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + + RELAY_REGISTER_OP("bitpack_start") .describe(R"code( Mark the start of bitpacking. diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index b709f2846b34..5c303905968e 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -379,6 +379,8 @@ Expr MakeStridedSlice(Expr data, Array begin, Array end, Array Expr StopFusion(Expr data); +Expr ForceCast(Expr data); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_PASS_PATTERN_UTIL_H_ diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index f55b26881f8b..8fbe290ad60b 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -90,7 +90,7 @@ RELAY_REGISTER_OP("relay.op.annotation.simulated_quantize") .add_argument("clip_min", "Tensor", "lower bound. It should be a scalar") .add_argument("clip_max", "Tensor", "upper bound. 
It should be a scalar") .set_attrs_type_key("relay.attrs.SimulatedQuantizeAttrs") -.set_support_level(10) +.set_support_level(11) .add_type_rel("SimulatedQuantize", SimulatedQuantizeRel); TVM_REGISTER_API("relay._quantize.simulated_quantize") @@ -111,13 +111,14 @@ TVM_REGISTER_API("relay._quantize.simulated_quantize") Expr QAnnotateExprNode::Realize() const { const auto& cfg = QConfig::Current(); - if (cfg->store_lowbit_output) { - // store low bit output back for VTA - const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); - return (*f)(this->expr, static_cast(kQInput)); - } else { - return expr; - } + return expr; + // if (cfg->store_lowbit_output) { + // // store low bit output back for VTA + // const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + // return (*f)(this->expr, static_cast(kQInput)); + // } else { + // return expr; + // } } QAnnotateExpr QAnnotateExprNode::make(Expr expr, QAnnotateKind kind) { @@ -133,6 +134,23 @@ TVM_REGISTER_API("relay._quantize.make_annotate_expr") static_cast(args[1].operator int())); }); + +TVM_REGISTER_API("relay._quantize.annotate") +.set_body_typed([] (const Expr& expr) { + std::function fmulti_ref = [](const Expr& e) { + if (e->derived_from()) { + const auto* n = e.as(); + CHECK(n); + const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + Expr ret = (*f)(n->expr, static_cast(kQInput)); + return static_cast(QAnnotateExprNode::make(ret, kQInput)); + } + return e; + }; + return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, nullptr); +}); + + // ============= // realize pass @@ -385,7 +403,17 @@ Array UnifyDTypeScale(const Array& ref_args, // unify the data type CHECK_EQ(ref_args.size(), args.size()); - DataType dtype = cfg->dtype_activation; + DataType dtype; + if (nptrs[0]->dtype == cfg->dtype_activation) { + DataType dtype = cfg->dtype_activation; + ret.Set(1, Cast(ret[1], dtype)); + } else if (nptrs[1]->dtype == cfg->dtype_input) { + DataType dtype = cfg->dtype_input; + ret.Set(0, Cast(ret[0], dtype)); + } else { + LOG(FATAL) << "should not touch here."; + } + for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { @@ -504,6 +532,9 @@ RELAY_REGISTER_OP("nn.relu") RELAY_REGISTER_OP("strided_slice") .set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("stop_fusion") +.set_attr("FQRealizeRewrite", IdentityRealize); + /* \brief for unary operators which requantize its input to dtype_nbit */ Expr CastDtypeInputRealize(const Call& ref_call, const Array& new_args, @@ -522,9 +553,6 @@ Expr CastDtypeInputRealize(const Call& ref_call, RELAY_REGISTER_OP("nn.max_pool2d") .set_attr("FQRealizeRewrite", CastDtypeInputRealize); -RELAY_REGISTER_OP("stop_fusion") -.set_attr("FQRealizeRewrite", CastDtypeInputRealize); - Expr AvgPoolRealize(const Call& ref_call, const Array& new_args, @@ -546,6 +574,29 @@ Expr AvgPoolRealize(const Call& ref_call, RELAY_REGISTER_OP("nn.avg_pool2d") .set_attr("FQRealizeRewrite", AvgPoolRealize); +Expr ForceCastRealize(const Call& ref_call, + const Array& new_args, + const NodeRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 1); + if (const auto* n = new_args[0].as()) { + Expr ret = Cast(n->data, cfg->dtype_input); + return QRealizeIntExprNode::make(ret, n->dom_scale, cfg->dtype_input); + } + CHECK(!new_args[0]->derived_from()); + return Expr(nullptr); +} + +RELAY_REGISTER_OP("force_cast") +.set_attr("FQRealizeRewrite", 
ForceCastRealize); + +TVM_REGISTER_API("relay._quantize.realize") +.set_body_typed([](const Expr& e) { + Expr ret = ForwardRewrite(e, "FQRealizeRewrite", nullptr, nullptr); + return ret; +}); + + // ============= // qconfig @@ -649,53 +700,46 @@ Pass QuantizeRealizePass() { TVM_REGISTER_API("relay._quantize.QuantizeRealize") .set_body_typed(QuantizeRealizePass); -class VtaStoreInjector : public ExprMutator { - private: - const CallNode* GetPreviousNode(const CallNode* n) { - if (n == nullptr || n->args.size() == 0) { - return nullptr; - } - return n->args[0].as(); - } +// ============= +// Insert stop_fusion for vta. - public: - Expr VisitExpr_(const CallNode* n) final { - static const Op& conv2d = Op::Get("nn.conv2d"); - static const Op& add = Op::Get("add"); - static const Op& relu = Op::Get("nn.relu"); - auto new_e = ExprMutator::VisitExpr_(n); - const CallNode* n0 = new_e.as(); - // conv->add->relu->[here] - if (n0 && n0->op.same_as(relu)) { - const CallNode* n1 = n0->args[0].as(); - if (n1 && n1->op.same_as(add)) { - const CallNode* n2 = n1->args[0].as(); - if (n2 && n2->op.same_as(conv2d)) { - return StopFusion(new_e); - } - } - } - // conv->add->[here]->add - if (n0 && n0->op.same_as(add)) { - const CallNode* n1 = n0->args[1].as(); - if (n1 && n1->op.same_as(add)) { - const CallNode* n2 = n1->args[0].as(); - if (n2 && n2->op.same_as(conv2d)) { - Expr child = StopFusion(n0->args[1]); - return CallNode::make(add, {n0->args[0], child}, Attrs{}, {}); - } - } - } - return new_e; - } -}; -Expr VtaStoreHint(const Expr& e) { - return VtaStoreInjector().Mutate(e); +Expr QVtaExprNode::Realize() const { + Expr ret = ForceCast(this->expr); + return StopFusion(ret); +} + +QVtaExpr QVtaExprNode::make(Expr expr) { + auto rnode = make_node(); + rnode->expr = expr; + return QVtaExpr(rnode); } -TVM_REGISTER_API("relay._quantize.vta_store_hint") -.set_body_typed(VtaStoreHint); +TVM_REGISTER_API("relay._quantize.rewrite_for_vta") +.set_body_typed([] (const Expr& expr) { + return ForwardRewrite(expr, "FQVtaRewrite", nullptr, nullptr); +}); + + +TVM_REGISTER_API("relay._quantize.make_vta_expr") +.set_body([](TVMArgs args, TVMRetValue *ret) { + *ret = QVtaExprNode::make(args[0]); + }); + + +TVM_REGISTER_API("relay._quantize.make_stop_fusion") +.set_body_typed([] (const Expr& expr) { + return StopFusion(expr); +}); + +TVM_REGISTER_API("relay._quantize.temp_expr_realize") +.set_body_typed([] (const Expr& expr) { + const QVtaExprNode* n = expr.as(); + CHECK(n); + return n->Realize(); +}); + + } // namespace quantize diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index da95a6c2134a..fce98e54459c 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,6 +72,30 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); +class QVtaExpr; +/*! + * \brief TempExprNode used during annotate forward rewrite. + */ +class QVtaExprNode : public TempExprNode { + public: + /*! \brief The original expression */ + Expr expr; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("expr", &expr); + } + + TVM_DLL static QVtaExpr make(Expr expr); + + Expr Realize() const final; + + static constexpr const char* _type_key = "relay.QVtaExpr"; + TVM_DECLARE_NODE_TYPE_INFO(QVtaExprNode, TempExprNode); +}; + +RELAY_DEFINE_NODE_REF(QVtaExpr, QVtaExprNode, TempExpr); + + /*! \brief TempExpr used during realize forward rewrite. */ class QRealizeExpr; /*! \brief TempExpr representing integer. 
 */

From 7b2d3067f11b75d22c056292686e6a2081b310d3 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Mon, 3 Jun 2019 17:55:56 -0700
Subject: [PATCH 038/126] fix bug from relay build config change

---
 vta/scripts/relay_to_vta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py
index 66af34f659e4..22c76bdda259 100644
--- a/vta/scripts/relay_to_vta.py
+++ b/vta/scripts/relay_to_vta.py
@@ -188,7 +188,7 @@ def run(device = "vta"):
         relay_graph = relay.ir_pass.fold_constant(relay_graph)
 
         # Compile Relay program.
-        with relay.build_module.build_config(opt_level=3, disable_pass={"AlterOpLayout"}):
+        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
             if target.device_name != "vta":
                 graph, lib, params = relay.build(
                     relay_graph, target=target,

From d539d15745599237d938435681c90e842cb473fd Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 15:28:44 -0700
Subject: [PATCH 039/126] typo fix

---
 src/pass/make_api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index dfb764f535c9..13f46ecb6f7a 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -184,7 +184,7 @@ LoweredFunc MakeAPI(Stmt body,
     for (Var v : undefined) {
       os << " \'" << v->name_hint << "\' ";
     }
-    os << " does not appeared in api_args";
+    os << " does not appear in api_args";
     LOG(FATAL) << "Not all Vars are passed in api_args: " << os.str();
   }
   return f;

From b1077181931cc82efafcc7f246add95a36d5c8ee Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 15:47:36 -0700
Subject: [PATCH 040/126] typo fix

---
 src/schedule/schedule_lang.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index e1cb4c5f9bdc..7532f4bcd31c 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -47,7 +47,7 @@ size_t FindLeafVar(ArrayNode* all_vars, ArrayNode* leaf_vars, const IterVar& v)
 
   if (FindNodeRef(all_vars, v) < all_vars->data.size()) {
     LOG(FATAL) << "Operate on iter var " << v
-               << "that has already been splitted";
+               << "that has already been split";
   } else {
     LOG(FATAL) << "Operate on iter var " << v
                << "that is not part of the schedule";

From c5936ba65ef4e5498f2025a77624fe5dde73b96b Mon Sep 17 00:00:00 2001
From: ZihengJiang
Date: Tue, 4 Jun 2019 16:47:25 -0700
Subject: [PATCH 041/126] Fix for tvm::build

---
 src/codegen/build_module.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 04a2fd6d4db9..488baa9bce46 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -580,6 +580,9 @@ runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
   Map<Target, Array<LoweredFunc>> updated_input;
   for (const auto& it : inputs) {
     auto target = Target::Create(it.first);
+    if (target->device_name == "vta") {
+      target = Target::Create("ext_dev");
+    }
     updated_input.Set(target, it.second);
   }
   return build(updated_input, target_host, config);

From 96b7529d4d61b946eeae084f5ad29f72839d421a Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Tue, 4 Jun 2019 18:06:37 -0700
Subject: [PATCH 042/126] relay task extraction for VTA (wip)

---
 python/tvm/autotvm/task/nnvm_integration.py  | 19 +++--
 python/tvm/autotvm/task/relay_integration.py | 87 +++++++++++---------
 2 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 
e4d2b3fb8023..251a310cf7aa 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -66,8 +66,8 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host env = TaskExtractEnv.get() - #NOTE: To add more symbols, you only need to change the following lists - #nnvm symbol -> topi compute + # NOTE: To add more symbols, you only need to change the following lists + # nnvm symbol -> topi compute SYMBOL2TOPI = { nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, topi.nn.group_conv2d_nchw], @@ -81,14 +81,14 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - env.reset(topi_funcs) + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) with env: # disable logger temporarily old_state = logger.disabled logger.disabled = True - # run compiler to collect all TOPI calls during compilation nnvm.compiler.engine.clear_cache() nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, target_host=target_host, params=params) @@ -99,12 +99,14 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host tasks = [] for task_name, args in env.get_tasks(): try: + print(task_name) + print(args) tsk = create(task_name, args, target=target, target_host=target_host, template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid Shape during AutoTVM Task Creation") + print("[Warning] Invalid shape during AutoTVM task creation") return tasks @@ -157,15 +159,16 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, topi_funcs.extend(SYMBOL2TOPI[sym_name]) else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - env.reset(topi_funcs) + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) with env: # disable logger temporarily old_state = logger.disabled logger.disabled = True - nnvm.compiler.engine.clear_cache() for graph, shape, dtype in zip(graphs, shapes, dtypes): + nnvm.compiler.engine.clear_cache() nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) logger.disabled = old_state @@ -179,7 +182,7 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid Shape during AutoTVM Task Creation") + print("[Warning] Invalid shape during AutoTVM task creation") return tasks diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 14caa70c0b84..c22496369637 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -57,11 +57,12 @@ def extract_from_program(func, params, ops, target, target_host=None): task: Array of autotvm.task.Task collected tasks """ - env = TaskExtractEnv.get() import tvm.relay.op from tvm import relay import topi + env = TaskExtractEnv.get() + # NOTE: To add more ops, you only need to change the following lists # relay op -> topi compute OP2TOPI = { @@ -81,30 +82,32 @@ def extract_from_program(func, params, ops, target, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True - # disable logger temporarily - old_state = 
logger.disabled - logger.disabled = True - - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - relay.backend.compile_engine.get().clear() - # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - tracing_target, - target_host, - params)) - build_thread.start() - build_thread.join() - logger.disabled = old_state + relay.backend.compile_engine.get().clear() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=relay.build, args=(func, + target, + target_host, + params)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) - + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") return tasks @@ -155,30 +158,32 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset(topi_funcs) - - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True - - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - - for func, param in zip(funcs, params): - # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - tracing_target, - target_host, - params)) - build_thread.start() - build_thread.join() - - logger.disabled = old_state + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + for func, param in zip(funcs, params): + relay.backend.compile_engine.get().clear() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=relay.build, args=(func, + target, + target_host, + params)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") return tasks From fadd29dd14816679b5bdb2e3a95f2e6fb3e75657 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 11:34:27 -0700 Subject: [PATCH 043/126] refactor relay to vta compilation script --- vta/scripts/relay_to_vta.py | 162 +++++++++++++++--------------------- 1 file changed, 68 insertions(+), 94 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 22c76bdda259..59bc8095608f 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -16,7 +16,7 @@ from vta.top import graph_pack parser = argparse.ArgumentParser(description='Train a model for image classification.') -parser.add_argument('--model', type=str, required=True, +parser.add_argument('--model', type=str, required=False, default='resnet18_v1', help='Input model name.') 
parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -31,15 +31,6 @@ opt = parser.parse_args() -if 'mobilenet' in opt.model: - opt.start_name = 'nn.relu' -elif 'gan' in opt.model: - opt.start_name = 'reshape0' - opt.stop_name = 'copy2' -elif 'rnn' in opt.model: - opt.start_name = 'reshape0' - opt.stop_name = 'reshape1' - # Helper function to read in image # Takes in Image object, returns an ND array def process_image(image): @@ -51,63 +42,11 @@ def process_image(image): return tvm.nd.array(image.astype("float32")) -def demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict): - # Read in ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - # Read in test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - # Read in test image - response = requests.get(image_url) - image = Image.open(BytesIO(response.content)).resize((224, 224)) - # Set the input - image = process_image(image) - if "gan" in opt.model or "rnn" in opt.model: - # non-classification networks require custom input shapes and out shapes - m.set_input('data', tvm.nd.array( - 10 * np.random.uniform(size=shape_dict['data']).astype(dtype_dict['data']))) - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("Performed inference in %.2fms/samlple (std = %.2f)" % (mean, std)) - else: - image = np.repeat(image.asnumpy(), env.BATCH, axis=0) - m.set_input('data', image) - # Perform inference - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - - if opt.debug_profile: - m.run() - - # Get classification results - tvm_output = m.get_output(0, - tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) - top_categories = np.argsort(tvm_output.asnumpy()[0]) - - # Report top-5 classification results - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("%s Prediction" % opt.model) - print(" #1:", synset[top_categories[-1]]) - print(" #2:", synset[top_categories[-2]]) - print(" #3:", synset[top_categories[-3]]) - print(" #4:", synset[top_categories[-4]]) - print(" #5:", synset[top_categories[-5]]) - print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) - -###################################################################### -# Setup the Pynq Board's RPC Server -# --------------------------------- -# Build the RPC server's VTA runtime and program the Pynq FPGA. 
+if __name__ == '__main__': -def run(device = "vta"): + # Read in VTA environment env = vta.get_env() + # Measure build start time reconfig_start = time.time() @@ -119,7 +58,12 @@ def run(device = "vta"): assert tvm.module.enabled("rpc") # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, '10.77.1.109', 9190, timeout=10000) + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) # Reconfigure the JIT runtime vta.reconfig_runtime(remote) @@ -138,9 +82,10 @@ def run(device = "vta"): remote = rpc.LocalSession() # TVM target and context - target = tvm.target.create("llvm -device={}".format(device)) - ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + target = tvm.target.create("llvm -device={}".format(opt.device)) + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) + # Get tophub schedules with autotvm.tophub.context(target): # Measure build start time @@ -152,54 +97,44 @@ def run(device = "vta"): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} - if "gan" in opt.model: - shape_dict = {"data": (env.BATCH, 100)} - elif 'rnn' in opt.model: - batch_size, seq_len, hidden_dim = 4, 1, 640 - begin_state_shape = (batch_size, hidden_dim, 1, 1) - shape_dict = {"data": (seq_len, batch_size), - "cell_l0_begin_state_0": begin_state_shape, - "cell_l1_begin_state_0": begin_state_shape} - dtype_dict = {"data": "int32", - "cell_l0_begin_state_0": 'float32', - "cell_l1_begin_state_0": 'float32'} - else: - shape_dict = {"data": (env.BATCH, 3, 224, 224)} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_graph, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_graph = relay.quantize.quantize(relay_graph, params=params) + relay_prog = relay.quantize.quantize(relay_prog, params=params) + # Perform graph packing and constant folding for VTA target if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT - relay_graph = graph_pack( - relay_graph, + relay_prog = graph_pack( + relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH, start_name=opt.start_name, stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) - relay_graph = relay.ir_pass.fold_constant(relay_graph) - - # Compile Relay program. 
+ # Compile Relay program with AlterOpLayout disabled with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( - relay_graph, target=target, + relay_prog, target=target, params=params, target_host=target_host) else: with vta.build_config(): graph, lib, params = relay.build( - relay_graph, target=target, + relay_prog, target=target, params=params, target_host=target_host) - # Save the compiled inference graph library assert tvm.module.enabled("rpc") temp = util.tempdir() @@ -213,13 +148,52 @@ def run(device = "vta"): build_time = time.time() - build_start print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) + # If detailed runtime info is needed build with debug runtime if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) else: m = graph_runtime.create(graph, lib, ctx) - # Set the parameters + # Set the network parameters m.set_input(**params) - demo_cat_classification(env, m, ctx, remote, shape_dict, dtype_dict) -run(opt.device) + # Read in ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + + # Read in test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)).resize((224, 224)) + + # Set the input + image = process_image(image) + image = np.repeat(image.asnumpy(), env.BATCH, axis=0) + m.set_input('data', image) + + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) + tcost = timer() + + # Display profile information + if opt.debug_profile: + m.run() + + # Get classification results + tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) + top_categories = np.argsort(tvm_output.asnumpy()[0]) + + # Report top-5 classification results + std = np.std(tcost.results) * 1000 / env.BATCH + mean = tcost.mean * 1000 / env.BATCH + print("%s Prediction" % opt.model) + print(" #1:", synset[top_categories[-1]]) + print(" #2:", synset[top_categories[-2]]) + print(" #3:", synset[top_categories[-3]]) + print(" #4:", synset[top_categories[-4]]) + print(" #5:", synset[top_categories[-5]]) + print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) From 6ba0fff45387fda0bac42066dcd2cc17dec15127 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 12:06:35 -0700 Subject: [PATCH 044/126] further refactor, cleanup --- vta/scripts/relay_to_vta.py | 123 ++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index 59bc8095608f..d75deea432ae 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -15,47 +15,47 @@ from vta.testing import simulator from vta.top import graph_pack -parser = argparse.ArgumentParser(description='Train a model for image classification.') -parser.add_argument('--model', type=str, required=False, default='resnet18_v1', - help='Input model name.') -parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') -parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') -parser.add_argument('--debug-profile', 
action='store_true', - help='Show layer-wise time cost profiling results') -parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') -parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - -opt = parser.parse_args() - -# Helper function to read in image -# Takes in Image object, returns an ND array -def process_image(image): - # Convert to neural network input format - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return tvm.nd.array(image.astype("float32")) +def classification_demo(opt): + """Image classification demo. -if __name__ == '__main__': + Parameters + ---------- + opt: a dictionary obtained from argparse + """ + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") # Read in VTA environment env = vta.get_env() - # Measure build start time - reconfig_start = time.time() + # Download ImageNet Categories + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = "synset.txt" + for fn in ["synset.txt"]: + if not isfile(fn): + download.download(join(url, fn), fn) + synset = eval(open(categ_fn).read()) + + # Download test image + image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' + response = requests.get(image_url) + + # Prepare test image for inference + image = Image.open(BytesIO(response.content)).resize((224, 224)) + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + image = np.repeat(image, env.BATCH, axis=0) # We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET != "sim": - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") + # Measure build start time + reconfig_start = time.time() # Get remote from fleet node tracket_host = os.environ.get("TVM_TRACKER_HOST", None) @@ -65,12 +65,10 @@ def process_image(image): exit() remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. + # Reconfigure the JIT runtime and FPGA. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. + vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) # Report on reconfiguration time @@ -78,10 +76,10 @@ def process_image(image): print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) # In simulation mode, host the RPC server locally. 
- elif env.TARGET == "sim": + else: remote = rpc.LocalSession() - # TVM target and context + # Create a TVM target and execution context target = tvm.target.create("llvm -device={}".format(opt.device)) ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) @@ -134,45 +132,25 @@ def process_image(image): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") + # Send the inference library over to the remote RPC server temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) - - # Send the inference library over to the remote RPC server remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Measure build time - build_time = time.time() - build_start - print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - # If detailed runtime info is needed build with debug runtime if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) else: m = graph_runtime.create(graph, lib, ctx) - # Set the network parameters + # Set the network parameters and inputs m.set_input(**params) - - # Read in ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - - # Read in test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - response = requests.get(image_url) - image = Image.open(BytesIO(response.content)).resize((224, 224)) - - # Set the input - image = process_image(image) - image = np.repeat(image.asnumpy(), env.BATCH, axis=0) m.set_input('data', image) # Perform inference @@ -197,3 +175,24 @@ def process_image(image): print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + + opt = parser.parse_args() + + classification_demo(opt) From 4a34d4b1dcfebaa2b93d44d2250c7fe8e8123886 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 5 Jun 2019 18:53:18 -0700 Subject: [PATCH 045/126] relay based task extraction working --- python/tvm/autotvm/task/relay_integration.py | 36 ++- python/tvm/relay/op/nn/_nn.py | 2 +- vta/scripts/tune_resnet.py | 232 +++++++++---------- vta/scripts/tune_resnet_nnvm.py | 231 ++++++++++++++++++ 4 files changed, 370 insertions(+), 131 deletions(-) create mode 100644 vta/scripts/tune_resnet_nnvm.py diff --git 
a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index c22496369637..cb18653d8f37 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -33,6 +33,24 @@ logger = logging.getLogger('autotvm') +def my_build(func, + target, + target_host, + params): + """ VTA compatible relay build. + """ + + from tvm import relay + + if "vta" in target.device_name: + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name == "vta": + import vta + with vta.build_config(): + return relay.build(func, target, target_host, params) + else: + return relay.build(func, target, target_host, params) + def extract_from_program(func, params, ops, target, target_host=None): """ Extract tuning tasks from a relay program. @@ -89,10 +107,11 @@ def extract_from_program(func, params, ops, target, target_host=None): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - target, - target_host, - params)) + build_thread = threading.Thread(target=my_build, + args=(func, + target, + target_host, + params)) build_thread.start() build_thread.join() @@ -166,10 +185,11 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): for func, param in zip(funcs, params): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=relay.build, args=(func, - target, - target_host, - params)) + build_thread = threading.Thread(target=my_build, + args=(func, + target, + target_host, + params)) build_thread.start() build_thread.join() diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 6c8f8f88795c..e796995d5b42 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -119,7 +119,7 @@ def compute_conv2d(attrs, inputs, out_type, target): if groups == 1: out = topi.nn.conv2d( inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype=out_dtype) + dilation, layout, out_dtype) elif layout == "NCHW" and \ get_const_int(inputs[1].shape[0]) == groups and \ get_const_int(inputs[1].shape[1]) == 1: diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index b22a63e09df8..9a4cf3ce6845 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,21 +1,21 @@ -import argparse -import os -import time +"""Perform inference on VTA using Relay.""" + +import argparse, os +from mxnet.gluon.model_zoo import vision import numpy as np +from PIL import Image +import topi import tvm -from tvm import rpc, autotvm +from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, util -from tvm.contrib.download import download - -import topi -import nnvm.compiler +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime import vta -import vta.testing - -env = vta.get_env() +from vta.testing import simulator +from vta.top import graph_pack +from tvm.autotvm.task import extract_from_program def register_vta_tuning_tasks(): from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args @@ -50,69 +50,6 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] - - -def 
generate_graph(sym, params, target, target_host): - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): - with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, - params=params, target_host=target_host) - - return graph, lib, params - - -def extract_tasks(sym, params, target, target_host): - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) - assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - with vta.build_config(): - tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, - params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) - return tasks - - -def download_model(): - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = 'synset.txt' - graph_fn = 'resnet18_qt8.json' - params_fn = 'resnet18_qt8.params' - data_dir = '_data' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - - for file in [categ_fn, graph_fn, params_fn]: - if not os.path.isfile(file): - download(os.path.join(url, file), os.path.join(data_dir, file)) - - sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) - params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) - - return sym, params - - def tune_tasks(tasks, measure_option, tuner='xgb', @@ -158,8 +95,103 @@ def tune_tasks(tasks, autotvm.record.pick_best(tmp_log_file, log_filename) os.remove(tmp_log_file) + +def extract_tasks(opt, env, target): + """Compile network and extract tasks. 
+ + Parameters + ---------- + opt: a dictionary of parameters obtained from argparse + env: the VTA environment + target: the TVM target + + + Returns + ------- + task: Array of autotvm.task.Task collected tasks + """ + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Create a TVM target and execution context + target_host = env.target_host + + # Get tophub schedules + with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + # Perform task extraction on Relay program + tasks = extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=target_host) + + return tasks + + if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements') + + opt = parser.parse_args() + + # Read in VTA environment + env = vta.get_env() + + # Target + target = tvm.target.vta() + # Get tracker info from env tracket_host = os.environ.get("TVM_TRACKER_HOST", None) tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) @@ -167,6 +199,7 @@ def tune_tasks(tasks, print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + # Set tuner options tuning_opt = { 'log_filename': 'resnet-18.log', @@ -177,55 +210,10 @@ def tune_tasks(tasks, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, - number=4, repeat=3, timeout=60, + number=4, 
min_repeat_ms=150, repeat=3, timeout=60, check_correctness=True)) } - # download model - sym, params = download_model() + tasks = extract_tasks(opt, env, target) - # register VTA tuning tasks - register_vta_tuning_tasks() - - # extract tasks - print("Extract tasks...") - target = tvm.target.vta() - target_host = env.target_host - tasks = extract_tasks(sym, params, target, target_host) - - print("Tuning...") tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): - print("Compile...") - graph, lib, params = generate_graph(sym, params, target, target_host) - input_shape = (1, 3, 224, 224) - dtype = 'float32' - - # export library - tmp = util.tempdir() - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # upload parameters to device - ctx = remote.context(str(target), 0) - rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module = graph_runtime.create(graph, rlib, ctx) - module.set_input('data', data_tvm) - module.set_input(**rparams) - - # evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond - print("Mean inference time (std dev): %.2f ms (%.2f ms)" % - (np.mean(prof_res), np.std(prof_res))) - diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py new file mode 100644 index 000000000000..b22a63e09df8 --- /dev/null +++ b/vta/scripts/tune_resnet_nnvm.py @@ -0,0 +1,231 @@ +import argparse +import os +import time +import numpy as np + +import tvm +from tvm import rpc, autotvm +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util +from tvm.contrib.download import download + +import topi +import nnvm.compiler +import vta +import vta.testing + +env = vta.get_env() + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + + +def generate_graph(sym, params, target, target_host): + # Populate the shape and data type 
dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=target_host) + + return graph, lib, params + + +def extract_tasks(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + with vta.build_config(): + tasks = autotvm.task.extract_from_graph(graph=sym, shape=shape_dict, dtype=dtype_dict, target=target, + params=params, symbols=(nnvm.sym.conv2d,), target_host=target_host) + return tasks + + +def download_model(): + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = 'synset.txt' + graph_fn = 'resnet18_qt8.json' + params_fn = 'resnet18_qt8.params' + data_dir = '_data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + for file in [categ_fn, graph_fn, params_fn]: + if not os.path.isfile(file): + download(os.path.join(url, file), os.path.join(data_dir, file)) + + sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) + params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) + + return sym, params + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True, + try_winograd=True): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + n_trial_ = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial_, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial_, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +if __name__ == '__main__': + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not 
tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + tuning_opt = { + 'log_filename': 'resnet-18.log', + + 'tuner': 'random', + 'n_trial': 1e9, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + number=4, repeat=3, timeout=60, + check_correctness=True)) + } + + # download model + sym, params = download_model() + + # register VTA tuning tasks + register_vta_tuning_tasks() + + # extract tasks + print("Extract tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): + print("Compile...") + graph, lib, params = generate_graph(sym, params, target, target_host) + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + From 59f1c026be8c3f2fe791d07e7a11a9c3c2d7ccd7 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 11:37:22 -0700 Subject: [PATCH 046/126] autotuning script refactor --- vta/scripts/tune_resnet.py | 226 ++++++++++++++++++-------------- vta/scripts/tune_resnet_nnvm.py | 44 ++++--- 2 files changed, 154 insertions(+), 116 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 9a4cf3ce6845..72e1395f0af2 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -17,6 +17,29 @@ from vta.top import graph_pack from tvm.autotvm.task import extract_from_program +def parse_arguments(): + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default="vta", + help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements during AutoTVM search') + parser.add_argument('--tuner', 
type=str, default="random", + help='AutoTVM search strategy') + parser.add_argument('--log-filename', type=str, default="resnet-18.log", + help='AutoTVM log file name') + + return parser.parse_args() + + def register_vta_tuning_tasks(): from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args @@ -50,6 +73,40 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] + +def compile_network(opt, env, target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + + def tune_tasks(tasks, measure_option, tuner='xgb', @@ -58,6 +115,7 @@ def tune_tasks(tasks, log_filename='tuning.log', use_transfer_learning=True, try_winograd=True): + # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): @@ -95,101 +153,17 @@ def tune_tasks(tasks, autotvm.record.pick_best(tmp_log_file, log_filename) os.remove(tmp_log_file) +if __name__ == '__main__': -def extract_tasks(opt, env, target): - """Compile network and extract tasks. 
- - Parameters - ---------- - opt: a dictionary of parameters obtained from argparse - env: the VTA environment - target: the TVM target - + opt = parse_arguments() - Returns - ------- - task: Array of autotvm.task.Task collected tasks - """ - # Make sure that TVM was compiled with RPC=1 assert tvm.module.enabled("rpc") - # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # Register VTA tuning tasks - register_vta_tuning_tasks() - - # Create a TVM target and execution context - target_host = env.target_host - - # Get tophub schedules - with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - # Perform task extraction on Relay program - tasks = extract_from_program(func=relay_prog, - params=params, - ops=(tvm.relay.op.nn.conv2d,), - target=target, - target_host=target_host) - - return tasks - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - - opt = parser.parse_args() - # Read in VTA environment env = vta.get_env() - # Target + # VTA target target = tvm.target.vta() # Get tracker info from env @@ -198,22 +172,80 @@ def extract_tasks(opt, env, target): if not tracket_host or not tracket_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + + # Compile Relay program + print("Initial compile...") + relay_prog, params = compile_network(opt, env, target) - # Set tuner options - tuning_opt = { - 'log_filename': 'resnet-18.log', + # Register VTA tuning tasks + register_vta_tuning_tasks() - 'tuner': 'random', + # Perform task extraction on Relay program + print("Extracting tasks...") + tasks = 
extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # Perform Autotuning + print("Tuning...") + tuning_opt = { + 'log_filename': opt.log_filename, + 'tuner': opt.tuner, 'n_trial': 1e9, 'early_stopping': None, - - 'measure_option': autotvm.measure_option( + 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, - number=4, min_repeat_ms=150, repeat=3, timeout=60, + number=4, min_repeat_ms=150, repeat=opt.measurements, timeout=60, check_correctness=True)) } - - tasks = extract_tasks(opt, env, target) - tune_tasks(tasks, **tuning_opt) + + # Compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[opt.log_filename]): + + # ResNet parameters + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # Compile network + print("Compiling network with best tuning parameters...") + relay_prog, params = compile_network(opt, env, target) + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + + # Export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # Upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # Upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index b22a63e09df8..433951570372 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -167,6 +167,20 @@ def tune_tasks(tasks, print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + # Download model + sym, params = download_model() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Extract tasks + print("Extracting tasks...") + target = tvm.target.vta() + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + # Perform Autotuning + print("Tuning...") tuning_opt = { 'log_filename': 'resnet-18.log', @@ -180,41 +194,33 @@ def tune_tasks(tasks, number=4, repeat=3, timeout=60, check_correctness=True)) } - - # download model - sym, params = download_model() - - # register VTA tuning tasks - register_vta_tuning_tasks() - - # extract tasks - print("Extract tasks...") - target = tvm.target.vta() - target_host = env.target_host - tasks = extract_tasks(sym, params, target, target_host) - - print("Tuning...") 
tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): - print("Compile...") + + # ResNet parameters + input_shape = (1, 3, 224, 224) + dtype = 'float32'\ + + # Compile network + print("Compiling network with best tuning parameters...") graph, lib, params = generate_graph(sym, params, target, target_host) input_shape = (1, 3, 224, 224) dtype = 'float32' - # export library + # Export library tmp = util.tempdir() filename = "net.tar" lib.export_library(tmp.relpath(filename)) - # upload module to device + # Upload module to device print("Upload...") remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - # upload parameters to device + # Upload parameters to device ctx = remote.context(str(target), 0) rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) @@ -222,7 +228,7 @@ def tune_tasks(tasks, module.set_input('data', data_tvm) module.set_input(**rparams) - # evaluate + # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond From 51f1ee09b037e095bf250da74ed50cb7a8172f96 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 17:18:21 -0700 Subject: [PATCH 047/126] refactoring, debug runtime --- vta/scripts/relay_to_vta.py | 18 +++--- vta/scripts/tune_resnet.py | 97 ++++++++++++++++++++------------- vta/scripts/tune_resnet_nnvm.py | 10 ++-- 3 files changed, 75 insertions(+), 50 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index d75deea432ae..c47ae09ef2a2 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -1,6 +1,6 @@ """Perform inference on VTA using Relay.""" -import argparse, json, requests, time +import argparse, json, os, requests, time from io import BytesIO from mxnet.gluon.model_zoo import vision import numpy as np @@ -50,6 +50,13 @@ def classification_demo(opt): image = image[np.newaxis, :] image = np.repeat(image, env.BATCH, axis=0) + # For tuning, make sure tracker variables are set + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + # We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET != "sim": @@ -58,12 +65,7 @@ def classification_demo(opt): reconfig_start = time.time() # Get remote from fleet node - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. 
# You can program the FPGA with your own custom bitstream @@ -84,7 +86,7 @@ def classification_demo(opt): ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Get tophub schedules - with autotvm.tophub.context(target): + with autotvm.tophub.context(target, extra_files=["resnet-18.log"]): # Measure build start time build_start = time.time() diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 72e1395f0af2..18dac6df01e7 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,6 +1,6 @@ """Perform inference on VTA using Relay.""" -import argparse, os +import argparse, os, time from mxnet.gluon.model_zoo import vision import numpy as np from PIL import Image @@ -163,15 +163,39 @@ def tune_tasks(tasks, # Read in VTA environment env = vta.get_env() - # VTA target - target = tvm.target.vta() - - # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not tracket_port: + # Get remote from fleet node + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() + + # Get remote + if env.TARGET != "sim": + + # Measure build start time + reconfig_start = time.time() + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. 
+ else: + remote = rpc.LocalSession() + + # VTA target and execution context + target = tvm.target.vta() + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Compile Relay program print("Initial compile...") @@ -197,22 +221,18 @@ def tune_tasks(tasks, 'early_stopping': None, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port, number=4, min_repeat_ms=150, repeat=opt.measurements, timeout=60, check_correctness=True)) } tune_tasks(tasks, **tuning_opt) # Compile kernels with history best records - with autotvm.tophub.context(target, extra_files=[opt.log_filename]): - - # ResNet parameters - input_shape = (1, 3, 224, 224) - dtype = 'float32' + with autotvm.tophub.context(target, extra_files=[opt.log_filename]): # Compile network print("Compiling network with best tuning parameters...") - relay_prog, params = compile_network(opt, env, target) + # relay_prog, params = compile_network(opt, env, target) with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( @@ -225,27 +245,30 @@ def tune_tasks(tasks, params=params, target_host=env.target_host) # Export library - tmp = util.tempdir() - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # Upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # Upload parameters to device - ctx = remote.context(str(target), 0) - rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module = graph_runtime.create(graph, rlib, ctx) - module.set_input('data', data_tvm) - module.set_input(**rparams) - - # Evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # If detailed runtime info is needed build with debug runtime + if opt.debug_profile: + m = debug_runtime.create(graph, lib, ctx) + else: + m = graph_runtime.create(graph, lib, ctx) + + # Set the network parameters and synthetic input + image = tvm.nd.array( + (np.random.uniform(size=(1, 3, 224, 224))).astype('float32')) + m.set_input(**params) + m.set_input('data', image) + + # Perform inference + timer = m.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements) + tcost = timer() + prof_res = np.array(tcost.results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) + + # Display profile information + if opt.debug_profile: + m.run() diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index 433951570372..3a6149df267c 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -161,9 +161,9 @@ def tune_tasks(tasks, if __name__ == '__main__': # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracket_host or not 
tracket_port: + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: print("Set your AutoTVM tracker node host and port variables to run the autotuner") exit() @@ -190,7 +190,7 @@ def tune_tasks(tasks, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), - runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, + runner=autotvm.RPCRunner(env.TARGET, tracker_host, tracker_port, number=4, repeat=3, timeout=60, check_correctness=True)) } @@ -216,7 +216,7 @@ def tune_tasks(tasks, # Upload module to device print("Upload...") - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) From e92c0c2838476910612d78c6e33f687dea4db199 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 6 Jun 2019 17:34:36 -0700 Subject: [PATCH 048/126] removing debug messages --- python/tvm/relay/quantize/quantize.py | 10 ---------- vta/python/vta/top/graphpack.py | 2 -- 2 files changed, 12 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index da881db26fb2..6fc3f9ed57fc 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -374,20 +374,10 @@ def quantize_vta(graph, params=None, dataset=None): # TODO(zhiics) Move this to the pass manager. graph = optimize(graph, params) - - print('original graph') - print(graph) graph = _quantize.rewrite_for_vta(graph) - print('after rewrite for vta') - print(graph) - graph = annotate(graph) graph = calibrate(graph, dataset) - print('after calibrate') - print(graph) graph = realize(graph) graph = _ir_pass.fold_constant(graph) - print('after realize') - print(graph) return graph diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 3ce50d06dbda..770dd380403d 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -265,13 +265,11 @@ def graph_pack(expr, """ assert isinstance(expr, relay.Function) expr = get_subgraph(expr, start_name, stop_name) - print("Before", expr.astext(show_meta_data=False)) expr = relay.ir_pass.infer_type(expr) packer = ExprPack( bfactor, cfactor, weight_bits) expr = packer.visit(expr) - print("After", expr.astext(show_meta_data=False)) assert not packer.start_pack return relay.ir_pass.infer_type(expr) From affd1581403a3f264b190d988149130d8860fa53 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 10:02:26 -0700 Subject: [PATCH 049/126] proper argparsing, and target setting --- vta/scripts/relay_to_vta.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index c47ae09ef2a2..c71e6f61f37a 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -82,11 +82,11 @@ def classification_demo(opt): remote = rpc.LocalSession() # Create a TVM target and execution context - target = tvm.target.create("llvm -device={}".format(opt.device)) + target = env.target if opt.device == "vta" else env.target_vta_cpu ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Get tophub schedules - with autotvm.tophub.context(target, extra_files=["resnet-18.log"]): + with autotvm.tophub.context(target): # Measure build 
start time build_start = time.time() @@ -182,7 +182,7 @@ def classification_demo(opt): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], help='Input model name.') parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -190,8 +190,8 @@ def classification_demo(opt): help='The name of the node where packing stops') parser.add_argument('--debug-profile', action='store_true', help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') parser.add_argument('--measurements', type=int, default=1, help='Number of measurements') From 8f277bf4125c769b1117e2abd8e79e0555f54ddb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 10:04:08 -0700 Subject: [PATCH 050/126] adding dense tuning --- vta/scripts/tune_resnet.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 18dac6df01e7..463187f3caa3 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -20,7 +20,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, required=False, default='resnet18_v1', + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], help='Input model name.') parser.add_argument('--start-name', type=str, default='nn.max_pool2d', help='The name of the node where packing starts') @@ -28,8 +28,8 @@ def parse_arguments(): help='The name of the node where packing stops') parser.add_argument('--debug-profile', action='store_true', help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default="vta", - help='Select device target, either "vta" or "vtacpu"') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') parser.add_argument('--measurements', type=int, default=1, help='Number of measurements during AutoTVM search') parser.add_argument('--tuner', type=str, default="random", @@ -74,6 +74,23 @@ def _topi_nn_conv2d(*args, **kwargs): return s, [A, W, res] + @autotvm.task.register("topi_nn_dense", override=True) + def _topi_nn_dense(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.dense(*args, **kwargs) + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [A, W, res] + + def compile_network(opt, env, target): # Populate the shape and data type dictionary @@ -194,7 +211,7 @@ def tune_tasks(tasks, remote = rpc.LocalSession() # VTA target and execution context - target = tvm.target.vta() + target = env.target if opt.device == "vta" else env.target_vta_cpu ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) # Compile Relay program @@ -208,7 +225,8 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = 
extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d,), + ops=(tvm.relay.op.nn.conv2d, + tvm.relay.op.nn.dense), target=target, target_host=env.target_host) From be4d3a1f5ab1a8ac0aa648c132e3b653adfea02f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 10 Jun 2019 14:34:12 -0700 Subject: [PATCH 051/126] updated tutorial to use Relay --- vta/tutorials/resnet.py | 324 ++++++++++++++-------------------------- 1 file changed, 116 insertions(+), 208 deletions(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 13161586480e..d3ed0cebe79d 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -24,292 +24,200 @@ """ + ###################################################################### # Import Libraries # ---------------- -# We start by importing the tvm, vta, nnvm libraries to run this example. +# We start by importing libraries to run this example. from __future__ import absolute_import, print_function -import os -import time +import argparse, json, os, requests, time from io import BytesIO +from os.path import join, isfile +from PIL import Image +from mxnet.gluon.model_zoo import vision import numpy as np -import requests from matplotlib import pyplot as plt -from PIL import Image import tvm -from tvm import rpc, autotvm -from tvm.contrib import graph_runtime, util -from tvm.contrib.download import download -import nnvm.compiler -import vta -import vta.testing +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() +import vta +from vta.testing import simulator +from vta.top import graph_pack -# Helper to crop an image to a square (224, 224) -# Takes in an Image object, returns an Image object -def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - -# Helper function to read in image -# Takes in Image object, returns an ND array -def process_image(image): - # Convert to neural network input format - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - - return tvm.nd.array(image.astype("float32")) - -# Classification helper function -# Takes in the graph runtime, and an image, and returns top result and time -def classify(m, image): - m.set_input('data', image) - timer = m.module.time_evaluator("run", ctx, number=1) - tcost = timer() - tvm_output = m.get_output(0) - top = np.argmax(tvm_output.asnumpy()[0]) - tcost = "t={0:.2f}s".format(tcost.mean) - return tcost + " {}".format(synset[top]) +# Make sure that TVM was compiled with RPC=1 +assert tvm.module.enabled("rpc") ###################################################################### -# Download ResNet Model -# -------------------------------------------- -# Download the necessary files to run ResNet-18. -# - -# Obtain ResNet model and download them into _data dir -url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" -categ_fn = 'synset.txt' -graph_fn = 'resnet18_qt8.json' -params_fn = 'resnet18_qt8.params' +# Define the platform and model targets +# ---------------- +# Execute on CPU vs. VTA, and define the model. 
-# Create data dir -data_dir = "_data/" -if not os.path.exists(data_dir): - os.makedirs(data_dir) +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() -# Download files -for file in [categ_fn, graph_fn, params_fn]: - download(os.path.join(url, file), os.path.join(data_dir, file)) +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu -# Read in ImageNet Categories -synset = eval(open(os.path.join(data_dir, categ_fn)).read()) +# Name of Gluon model to compile +model = "resnet18_v1" +start_pack="nn.max_pool2d" +stop_pack="nn.global_avg_pool2d" ###################################################################### -# Setup the Pynq Board's RPC Server +# Obtain an execution remote # --------------------------------- -# Build the RPC server's VTA runtime and program the Pynq FPGA. - -# Measure build start time -reconfig_start = time.time() - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq": - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET != "sim": + + # Get remote from fleet node if environment variable is set + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, device_port) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) # In simulation mode, host the RPC server locally. -elif env.TARGET == "sim": +else: remote = rpc.LocalSession() +# Get execution context from remote +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the ResNet Runtime +# Build the inference runtime # ------------------------ -# Build the ResNet graph runtime, and configure the parameters. - -# Set ``device=vtacpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" - -# TVM target and context -target = tvm.target.create("llvm -device={}".format(device)) -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - -# TVM module -m = None +# Build ResNet from Gluon with Relay. 
+# Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): - graph_fn = os.path.join(data_dir, graph_fn) - params_fn= os.path.join(data_dir, params_fn) + # Populate the shape and data type dictionary for ResNet input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) # Measure build start time build_start = time.time() - # Load the ResNet-18 graph and parameters - sym = nnvm.graph.load_json(open(graph_fn).read()) - params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) + # Start front end compilation + relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + print(relay_prog) + # exit() - # Populate the shape and data type dictionary - shape_dict = {"data": (1, 3, 224, 224)} - dtype_dict = {"data": 'float32'} + # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - # Apply NNVM graph optimization passes - sym = vta.graph.clean_cast(sym) - sym = vta.graph.clean_conv_fuse(sym) + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(relay_prog, params=params) + + # Perform graph packing and constant folding for VTA target if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT - sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) - - # Compile NNVM graph - with nnvm.compiler.build_config(opt_level=3): + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, + graph, lib, params = relay.build( + relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(): - graph, lib, params = nnvm.compiler.build( - sym, target, shape_dict, dtype_dict, + graph, lib, params = relay.build( + relay_prog, target=target, params=params, target_host=env.target_host) - # Save the compiled inference graph library - assert tvm.module.enabled("rpc") - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) # Send the inference library over to the remote RPC server + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Measure build time - build_time = time.time() - build_start - print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) - + # Graph runtime m = graph_runtime.create(graph, lib, ctx) - # Set the parameters - m.set_input(**params) - ###################################################################### -# Run ResNet-18 inference on a sample image -# ----------------------------------------- -# Perform image classification on test image. -# You can change the test image URL to any image of your choosing. 
+# Perform ResNet-18 inference +# ------------------------ +# We run classification on an image sample from ImageNet + +# Download ImageNet categories +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/synset.txt" +categ_fn = "synset.txt" +download.download(join(categ_url, categ_fn), categ_fn) +synset = eval(open(categ_fn).read()) -# Read in test image +# Download test image image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' -# Read in test image response = requests.get(image_url) + +# Prepare test image for inference image = Image.open(BytesIO(response.content)).resize((224, 224)) -# Show Image plt.imshow(image) plt.show() -# Set the input -image = process_image(image) +image = np.array(image) - np.array([123., 117., 104.]) +image /= np.array([58.395, 57.12, 57.375]) +image = image.transpose((2, 0, 1)) +image = image[np.newaxis, :] +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) m.set_input('data', image) # Perform inference -timer = m.module.time_evaluator("run", ctx, number=1) +timer = m.module.time_evaluator("run", ctx, number=4, repeat=3) tcost = timer() # Get classification results -tvm_output = m.get_output(0) +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) top_categories = np.argsort(tvm_output.asnumpy()[0]) # Report top-5 classification results -print("ResNet-18 Prediction #1:", synset[top_categories[-1]]) +std = np.std(tcost.results) * 1000 / env.BATCH +mean = tcost.mean * 1000 / env.BATCH +print("%s prediction" % model) +print(" #1:", synset[top_categories[-1]]) print(" #2:", synset[top_categories[-2]]) print(" #3:", synset[top_categories[-3]]) print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) -print("Performed inference in {0:.2f}s".format(tcost.mean)) - - -###################################################################### -# Run a Youtube Video Image Classifier -# ------------------------------------ -# Perform image classification on test stream on 1 frame every 48 frames. 
-# Comment the `if False:` out to run the demo - -# Early exit - remove for Demo -if False: - - import cv2 - import pafy - from IPython.display import clear_output - - # Helper to crop an image to a square (224, 224) - # Takes in an Image object, returns an Image object - def thumbnailify(image, pad=15): - w, h = image.size - crop = ((w-h)//2+pad, pad, h+(w-h)//2-pad, h-pad) - image = image.crop(crop) - image = image.resize((224, 224)) - return image - - # 16:16 inches - plt.rcParams['figure.figsize'] = [16, 16] - - # Stream the video in - url = "https://www.youtube.com/watch?v=PJlmYh27MHg&t=2s" - video = pafy.new(url) - best = video.getbest(preftype="mp4") - cap = cv2.VideoCapture(best.url) - - # Process one frame out of every 48 for variety - count = 0 - guess = "" - while(count<2400): - - # Capture frame-by-frame - ret, frame = cap.read() - - # Process one every 48 frames - if count % 48 == 1: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = Image.fromarray(frame) - # Crop and resize - thumb = np.array(thumbnailify(frame)) - image = process_image(thumb) - guess = classify(m, image) - - # Insert guess in frame - frame = cv2.rectangle(thumb,(0,0),(200,0),(0,0,0),50) - cv2.putText(frame, guess, (5,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (256,256,256), 1, cv2.LINE_AA) - - plt.imshow(thumb) - plt.axis('off') - plt.show() - if cv2.waitKey(1) & 0xFF == ord('q'): - break - clear_output(wait=True) - - count += 1 - - # When everything done, release the capture - cap.release() - cv2.destroyAllWindows() +print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) From 05b08fc8f33f97c8362382f6e4e9cf7f02f49949 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 11 Jun 2019 15:18:06 -0700 Subject: [PATCH 052/126] setup for colab --- package.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 package.sh diff --git a/package.sh b/package.sh new file mode 100755 index 000000000000..da227738637d --- /dev/null +++ b/package.sh @@ -0,0 +1,6 @@ +echo "Installing Dependencies ..." 
+echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 +sudo apt-get update +sudo apt-get install -y -q llvm-6.0 libtinfo-dev libffi-dev zlib1g-dev clinfo tree +sudo apt-get install verilator sbt From 1ef1e2509c2a03af3cc865623a80094361d1d6a5 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 11 Jun 2019 23:23:09 -0700 Subject: [PATCH 053/126] fix url --- vta/tutorials/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index d3ed0cebe79d..9caa6cdafbd6 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -180,7 +180,7 @@ # We run classification on an image sample from ImageNet # Download ImageNet categories -categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/synset.txt" +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" categ_fn = "synset.txt" download.download(join(categ_url, categ_fn), categ_fn) synset = eval(open(categ_fn).read()) From 1619a11a8ab2270ab6556a773bbffce9a1035537 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 00:00:05 -0700 Subject: [PATCH 054/126] dense operator placeholder --- vta/python/vta/top/vta_dense.py | 171 ++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 vta/python/vta/top/vta_dense.py diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py new file mode 100644 index 000000000000..f2fdbc7e93a4 --- /dev/null +++ b/vta/python/vta/top/vta_dense.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Dense operator declaration and schedule registration for VTA.""" + +import numpy as np +import tvm +from tvm import autotvm +import topi + +from ..environment import get_env + +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + +@autotvm.register_topi_compute(topi.nn.dense, 'vta', 'direct') +def _declaration_dense(cfg, + data, + weight, + bias=None, + out_dtype=None): + """Dense function declaration.""" + + # Make sure that the dense operator is packed + assert len(data.shape) == 4 + assert len(weight.shape) == 4 + # Derive output shape + oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) + + # Reduction axes (input channel) + assert(data.shape[1] == weight.shape[1]) + assert(data.shape[3] == weight.shape[3]) + k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') + k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') + + res = tvm.compute( + oshape, + lambda b_o, c_o, b_i, c_i: tvm.sum( + data[b_o, k_o, b_i, k_i].astype(out_dtype) * + weight[c_o, k_o, c_i, k_i].astype(out_dtype), + axis=[k_o, k_i]), + name="res", tag="packed_dense") + + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * + data.shape[1] * data.shape[3]) + return res + +@autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') +def _schedule_dense(cfg, outs): + """Packed dense schedule.""" + + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + dense_res = [] + assert "int" in output.op.input_tensors[0].dtype + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if len(op.axis) == 0: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + assert op.tag == "dense" + dense_res.append(op) + + _traverse(output.op) + assert len(dense_res) == 1 + dense_stage = dense_res[0].output(0) + s = tvm.create_schedule(output.op) + + ##### space definition begin ##### + b, co, _, _ = s[dense_stage].op.axis + ci, _ = s[dense_stage].op.reduce_axis + cfg.define_split('tile_b', b, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_knob('oc_nthread', [1, 2]) + cfg.define_knob('h_nthread', [1, 2]) + ###### space definition end ###### + + data, kernel = dense_stage.op.input_tensors + + env = get_env() + + cdata = s.cache_read(data, env.inp_scope, [dense_stage]) + ckernel = s.cache_read(kernel, env.wgt_scope, [dense_stage]) + s[dense_stage].set_scope(env.acc_scope) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + # set ewise scope + for op in ewise_ops: + s[op].set_scope(env.acc_scope) + s[op].pragma(s[op].op.axis[0], env.alu) + + for op in const_ops: + s[op].compute_inline() + + # tile + x_bo, x_co, x_bi, x_ci = s[output].op.axis + x_bo0, x_bo1 = cfg['tile_b'].apply(s, output, x_bo) + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + s[output].reorder(x_bo0, x_co0, x_bo1, x_co1, x_bi, x_ci) + store_pt = x_co0 + + # set all compute scopes + s[dense_stage].compute_at(s[output], store_pt) + for op in ewise_ops: + s[op].compute_at(s[output], store_pt) + + for tensor in cache_read_ewise: + s[tensor].compute_at(s[output], store_pt) 
+ s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + + # virtual threading along output channel axes + if cfg['oc_nthread'].val > 1: + _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + # virtual threading along spatial rows + if cfg['h_nthread'].val > 1: + _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) + s[output].reorder(v_t, x_bo) + s[output].bind(v_t, tvm.thread_axis("cthread")) + + x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis + k_o, k_i = s[dense_stage].op.reduce_axis + s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) + + k_o, _ = cfg['tile_ci'].apply(s, dense_stage, k_o) + s[cdata].compute_at(s[dense_stage], k_o) + s[ckernel].compute_at(s[dense_stage], k_o) + + # Use VTA instructions + s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) + s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[dense_stage].tensorize(x_bi, env.gemm) + s[output].pragma(x_co1, env.dma_copy) From e0e1bc7f8410094860778a53811716ef146aba2b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 12:52:50 -0700 Subject: [PATCH 055/126] fix support for pass manager --- python/tvm/relay/quantize/quantize.py | 43 +++++++-------------------- src/relay/pass/quantize.cc | 21 +++++++------ 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 6fc3f9ed57fc..df6f8b9e139c 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -279,6 +279,17 @@ def realize(): return _quantize.QuantizeRealize() +def rewrite_for_vta(): + """Performs rewriting for VTA target. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass for VTA rewrite. + """ + return _quantize.QuantizeRewriteForVTA() + + def _bind_params(func, params): """Bind the params to the expression. """ @@ -349,35 +360,3 @@ def quantize(graph, params=None, dataset=None): mod = optimize(mod) mod = quantize_seq(mod) return mod[mod.entry_func.name_hint] - -def quantize_vta(graph, params=None, dataset=None): - - """ The quantization procedure for VTA specifically. - - Parameters - --------- - graph: Function - The original graph. - - params : dict of str to NDArray - Input parameters to the graph that do not change - during inference time. Used for constant folding. - - dataset: list of dict of Var -> NDArray - The calibration dataset. - - Returns - ------- - ret: Function - The graph after quantization - """ - - # TODO(zhiics) Move this to the pass manager. 
- graph = optimize(graph, params) - graph = _quantize.rewrite_for_vta(graph) - graph = annotate(graph) - graph = calibrate(graph, dataset) - graph = realize(graph) - graph = _ir_pass.fold_constant(graph) - - return graph diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 8fbe290ad60b..cb64902d74f9 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -700,6 +700,18 @@ Pass QuantizeRealizePass() { TVM_REGISTER_API("relay._quantize.QuantizeRealize") .set_body_typed(QuantizeRealizePass); +Pass QuantizeRewriteForVTAPass() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + ForwardRewrite(f, "FQVtaRewrite", nullptr, nullptr)); + }; + return CreateFunctionPass(pass_func, 1, "QuantizeRewriteForVTA", {}); +} + +TVM_REGISTER_API("relay._quantize.QuantizeRewriteForVTA") +.set_body_typed(QuantizeRewriteForVTAPass); + // ============= // Insert stop_fusion for vta. @@ -715,18 +727,11 @@ QVtaExpr QVtaExprNode::make(Expr expr) { return QVtaExpr(rnode); } -TVM_REGISTER_API("relay._quantize.rewrite_for_vta") -.set_body_typed([] (const Expr& expr) { - return ForwardRewrite(expr, "FQVtaRewrite", nullptr, nullptr); -}); - - TVM_REGISTER_API("relay._quantize.make_vta_expr") .set_body([](TVMArgs args, TVMRetValue *ret) { *ret = QVtaExprNode::make(args[0]); }); - TVM_REGISTER_API("relay._quantize.make_stop_fusion") .set_body_typed([] (const Expr& expr) { return StopFusion(expr); @@ -740,8 +745,6 @@ TVM_REGISTER_API("relay._quantize.temp_expr_realize") }); - - } // namespace quantize } // namespace relay } // namespace tvm From 0b4addb416e163c4096947eb486fe5bf1d33c881 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 13 Jun 2019 17:02:10 -0700 Subject: [PATCH 056/126] dense op benchmark --- vta/python/vta/top/__init__.py | 1 + vta/python/vta/top/op.py | 21 +- vta/python/vta/top/vta_conv2d.py | 29 +-- vta/python/vta/top/vta_dense.py | 20 +- .../integration/test_benchmark_topi_conv2d.py | 3 +- .../integration/test_benchmark_topi_dense.py | 185 ++++++++++++++++++ 6 files changed, 229 insertions(+), 30 deletions(-) create mode 100644 vta/tests/python/integration/test_benchmark_topi_dense.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 5111035decd3..ee2b5ec21ef8 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -7,3 +7,4 @@ from . import nnvm_op from . import op from . import vta_conv2d +from . 
import vta_dense diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 7f3c58a46116..5d6cfadc34f4 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -31,6 +31,7 @@ def compute_clip(attrs, inputs, output_type, target): x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return [x] + @reg.register_compute("nn.conv2d", level=15) def compute_conv2d(attrs, inputs, output_type, target): """ Compute definition of conv2d """ @@ -41,13 +42,13 @@ def compute_conv2d(attrs, inputs, output_type, target): layout = attrs.data_layout out_dtype = attrs.out_dtype - assert dilation == (1, 1), "not support dilate now" + assert dilation == (1, 1), "support for dilation limited to (1, 1)" if is_packed_layout(layout): if groups == 1: assert groups == 1 env = get_env() assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" inputs = list(inputs) assert inputs[1].dtype == "int8" return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] @@ -57,6 +58,7 @@ def compute_conv2d(attrs, inputs, output_type, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) + @reg.register_schedule("nn.conv2d", level=15) def schedule_conv2d(attrs, outs, target): """ Schedule definition of conv2d """ @@ -77,3 +79,18 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + + +@reg.register_compute("nn.dense", level=15) +def compute_dense(attrs, inputs, out_type, target): + """Compute definition of dense""" + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + +@reg.register_schedule("nn.dense", level=15) +def schedule_dense(attrs, outputs, target): + """Schedule definition of dense""" + with target: + return topi.generic.schedule_dense(outputs) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index eef047965a56..15d45029af82 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +"""Conv2D operator declaration and schedule registration for VTA.""" import numpy as np import tvm @@ -32,14 +32,14 @@ def is_packed_layout(layout): return False @autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') -def packed_conv2d(cfg, - data, - kernel, - strides, - padding, - dilation, - layout, - out_dtype): +def _declaration_conv2d(cfg, + data, + kernel, + strides, + padding, + dilation, + layout, + out_dtype): """ Packed conv2d function.""" if not is_packed_layout(layout): raise topi.InvalidShapeError() @@ -68,14 +68,14 @@ def packed_conv2d(cfg, pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), - name="res", tag="packed_conv2d") + name="res", tag="conv2d") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] * ishape[-1]) return res @autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') -def schedule_packed_conv2d(cfg, outs): +def _schedule_conv2d(cfg, outs): assert len(outs) == 1 output = outs[0] const_ops = [] @@ -97,7 +97,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "packed_conv2d" + assert op.tag == "conv2d" conv2d_res.append(op) _traverse(output.op) @@ -106,8 +106,8 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, h, w, bi, ci = s[conv2d_stage].op.axis - ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis + b, co, h, w, _, _ = s[conv2d_stage].op.axis + ci, _, _, _ = s[conv2d_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) cfg.define_split('tile_h', h, num_outputs=2) cfg.define_split('tile_w', w, num_outputs=2) @@ -192,4 +192,5 @@ def _traverse(op): s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) s[conv2d_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) + return s diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index f2fdbc7e93a4..69f96f1aba29 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -44,13 +44,12 @@ def _declaration_dense(cfg, assert len(weight.shape) == 4 # Derive output shape oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) - + # Reduction axes (input channel) - assert(data.shape[1] == weight.shape[1]) - assert(data.shape[3] == weight.shape[3]) + assert(int(data.shape[1]) == int(weight.shape[1])) + assert(int(data.shape[3]) == int(weight.shape[3])) k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') - res = tvm.compute( oshape, lambda b_o, c_o, b_i, c_i: tvm.sum( @@ -58,7 +57,7 @@ def _declaration_dense(cfg, weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), name="res", tag="packed_dense") - + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * data.shape[1] * data.shape[3]) return res @@ -88,7 +87,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "dense" + assert op.tag == "packed_dense" dense_res.append(op) _traverse(output.op) @@ -103,7 +102,6 @@ def _traverse(op): cfg.define_split('tile_co', co, num_outputs=2) cfg.define_split('tile_ci', ci, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) - cfg.define_knob('h_nthread', [1, 2]) ###### space definition end ###### data, kernel = dense_stage.op.input_tensors @@ -150,12 +148,6 @@ def _traverse(op): s[output].reorder(v_t, x_bo) 
s[output].bind(v_t, tvm.thread_axis("cthread")) - # virtual threading along spatial rows - if cfg['h_nthread'].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) - x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis k_o, k_i = s[dense_stage].op.reduce_axis s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) @@ -169,3 +161,5 @@ def _traverse(op): s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) s[dense_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_co1, env.dma_copy) + + return s \ No newline at end of file diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index dc7b5d710c29..2aec47118e44 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Testing if we can generate code in topi style""" + +"""Testing topi conv2d operator for VTA""" import os import json diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py new file mode 100644 index 000000000000..6759cc19b292 --- /dev/null +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Testing topi gemm operator for VTA""" + +import os +import json +from collections import namedtuple + +import numpy as np + +import tvm +from tvm import autotvm +from tvm.contrib import util +from tvm.contrib.pickle_memoize import memoize +import topi +import topi.testing +import vta +from vta import program_fpga, reconfig_runtime +import vta.testing +from vta.testing import simulator + +# FIXME: we need a custom clip operator to circumvent a pattern detection limitation +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def run_gemm(env, remote, target, + batch_size, in_feat, out_feat, + check_correctness=True, print_ir=True, + samples=4): + + # Perform packing only if we are targeting the accelerator + if "arm_cpu" in target.keys: + data_pack = False + elif "vta" in target.keys: + data_pack = True + + # Derive shapes depending upon packing + a_shape = (batch_size, in_feat) + w_shape = (out_feat, in_feat) + if data_pack: + data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN, + env.BATCH, env.BLOCK_IN) + kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, + env.BLOCK_OUT, env.BLOCK_IN) + else: + data_shape = a_shape + kernel_shape = w_shape + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + # Define base computation schedule + with target: + res = topi.nn.dense( + data, kernel, None, env.acc_dtype) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) + res = topi.cast(res, env.out_dtype) + # Derive base schedule + s = topi.generic.schedule_dense([res]) + if print_ir: + print(vta.lower(s, [data, kernel, res], simple_mode=True)) + + # Derive number of ops + num_ops = 2 * batch_size * in_feat * out_feat + + # @memoize("vta.tests.test_benchmark_topi.dense.verify") + def get_ref_data(): + # derive min max for act, wgt types (max non inclusive) + a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) + w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) + a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) + w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) + + r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype) + return a_np, w_np, r_np + + # Data in original format + data_np, kernel_np, res_ref = get_ref_data() + if data_pack: + data_np = data_np.reshape( + batch_size//env.BATCH, env.BATCH, + in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + kernel_np = kernel_np.reshape( + out_feat//env.BLOCK_OUT, env.BLOCK_OUT, + in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) + + # Build + if "vta" in target.keys: + mod = vta.build(s, [data, kernel, res], + target=target, + target_host=env.target_host, + name="dense") + else: + mod = tvm.build(s, [data, kernel, res], + target=target, + target_host=env.target_host, + name="dense") + temp = util.tempdir() + mod.save(temp.relpath("dense.o")) + remote.upload(temp.relpath("dense.o")) + f = remote.load_module("dense.o") + ctx = remote.context(str(target)) + + res_np = 
np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + data_arr = tvm.nd.array(data_np, ctx) + kernel_arr = tvm.nd.array(kernel_np, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=samples) + + # In vta sim mode, collect simulator runtime statistics + stats = {} + cost = None + if env.TARGET == "sim": + # Check if we're in local RPC mode (allows us to rebuild the + # runtime on the fly when varying the VTA designs) + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote.get_function("vta.simulator.profiler_clear")() + cost = time_f(data_arr, kernel_arr, res_arr) + stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) + else: + simulator.clear_stats() + cost = time_f(data_arr, kernel_arr, res_arr) + stats = simulator.stats() + else: + cost = time_f(data_arr, kernel_arr, res_arr) + + # Check correctness + correct = False + if check_correctness: + res_orig = res_arr.asnumpy() + if data_pack: + res_orig = res_orig.reshape(batch_size, out_feat) + res_ref = res_ref >> 8 + res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) + res_ref = res_ref.astype(env.out_dtype) + correct = np.allclose(res_orig, res_ref) + + gops = (num_ops / cost.mean) / float(10 ** 9) + status = "PASSED" if correct else "FAILED" + if "arm_cpu" in target.keys: + device = "CPU" + elif "vta" in target.keys: + device = "VTA" + print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + + return correct, cost, stats + +def test_gemm(device="vta", batch=128, in_feat=128, out_feat=128): + def _run(env, remote): + if device == "vta": + target = env.target + if env.TARGET != "sim": + assert tvm.module.enabled("rpc") + program_fpga(remote, bitstream=None) + reconfig_runtime(remote) + elif device == "arm_cpu": + target = env.target_vta_cpu + with autotvm.tophub.context(target): # load pre-tuned schedule parameters + run_gemm(env, remote, target, batch, in_feat, out_feat) + vta.testing.run(_run) + +if __name__ == "__main__": + test_gemm("vta", 1, 16, 16) From d16f91cbf8750f31c380ea957dc6790675464d79 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:24:55 -0700 Subject: [PATCH 057/126] getting rid of kwargs usage --- python/tvm/relay/op/nn/_nn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e796995d5b42..3778a56aa9ca 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -56,7 +56,7 @@ def compute_dense(attrs, inputs, out_type, target): """Compute definition of dense""" out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)] + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] @reg.register_schedule("nn.dense") @@ -124,16 +124,16 @@ def compute_conv2d(attrs, inputs, out_type, target): get_const_int(inputs[1].shape[0]) == groups and \ get_const_int(inputs[1].shape[1]) == 1: out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, dilation, out_dtype) elif layout == "NHWC" and \ kernel_layout == "HWOI" and\ get_const_int(inputs[1].shape[2]) == groups and \ get_const_int(inputs[1].shape[3]) == 1: out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + inputs[0], inputs[1], strides, padding, 
dilation, out_dtype) elif layout in ['NCHW', 'NCHW4c']: out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype=out_dtype) + out_dtype) else: raise ValueError("not support arbitrary group number for now") return [out] From 5e8173289d521d73686a9c1ea1f6d777aea5d9de Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:27:07 -0700 Subject: [PATCH 058/126] registration of dense definition and schedule for vta --- vta/python/vta/top/op.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 5d6cfadc34f4..abb529dbe7f1 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -75,7 +75,7 @@ def schedule_conv2d(attrs, outs, target): elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) else: - raise RuntimeError("not support target %s" % target) + raise RuntimeError("Target %s is not supported" % target) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) @@ -86,11 +86,26 @@ def compute_dense(attrs, inputs, out_type, target): """Compute definition of dense""" out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + if inputs[0].shape == 4: # this implies the layout is packed + return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) -def schedule_dense(attrs, outputs, target): +def schedule_dense(attrs, outs, target): """Schedule definition of dense""" - with target: - return topi.generic.schedule_dense(outputs) + + if outs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + if target.device_name == "vta": + return topi.generic.schedule_dense(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("Target %s is not supported" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) From 52880c932735d8483c51234efe8bd5a970b5b26f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:27:35 -0700 Subject: [PATCH 059/126] error reporting --- vta/python/vta/top/vta_dense.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 69f96f1aba29..63fed639ca8c 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -40,8 +40,9 @@ def _declaration_dense(cfg, """Dense function declaration.""" # Make sure that the dense operator is packed - assert len(data.shape) == 4 - assert len(weight.shape) == 4 + if len(data.shape) != 4 or len(weight.shape) != 4: + raise topi.InvalidShapeError() + # Derive output shape oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) From d0b2ade79a6ff987304a73168c29e386efb41ad1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:28:22 -0700 Subject: [PATCH 060/126] dense support --- vta/scripts/tune_resnet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 463187f3caa3..46cbffc86a35 100644 
--- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -82,9 +82,12 @@ def _topi_nn_dense(*args, **kwargs): with tvm.target.vta(): res = topi.nn.dense(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_conv2d_nchw([res]) + s = topi.generic.schedule_dense([res]) else: s = tvm.create_schedule([res.op]) From 5e100b5754c06c49cb983769f967ffda12db4ecf Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 12:29:57 -0700 Subject: [PATCH 061/126] remove use of kwargs --- vta/tests/python/integration/test_benchmark_topi_dense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 6759cc19b292..656a939302f9 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -72,7 +72,7 @@ def run_gemm(env, remote, target, # Define base computation schedule with target: res = topi.nn.dense( - data, kernel, None, env.acc_dtype) + data, kernel, out_dtype=env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) From 28b976fd69cf2ceb53292fabd11c5afa2a892a9f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 14 Jun 2019 16:50:44 -0700 Subject: [PATCH 062/126] update dense schedule --- vta/python/vta/top/vta_dense.py | 36 +++++++++---------- .../integration/test_benchmark_topi_dense.py | 6 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 63fed639ca8c..3c4b0e009e03 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -57,7 +57,7 @@ def _declaration_dense(cfg, data[b_o, k_o, b_i, k_i].astype(out_dtype) * weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), - name="res", tag="packed_dense") + name="res", tag="dense") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * data.shape[1] * data.shape[3]) @@ -88,7 +88,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "packed_dense" + assert op.tag == "dense" dense_res.append(op) _traverse(output.op) @@ -100,17 +100,17 @@ def _traverse(op): b, co, _, _ = s[dense_stage].op.axis ci, _ = s[dense_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) ###### space definition end ###### - data, kernel = dense_stage.op.input_tensors + data, weight = dense_stage.op.input_tensors env = get_env() cdata = s.cache_read(data, env.inp_scope, [dense_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [dense_stage]) + cweight = s.cache_read(weight, env.wgt_scope, [dense_stage]) s[dense_stage].set_scope(env.acc_scope) # cache read input @@ -127,12 +127,12 @@ def _traverse(op): for op in const_ops: s[op].compute_inline() - # tile - x_bo, x_co, x_bi, x_ci = s[output].op.axis - x_bo0, x_bo1 = cfg['tile_b'].apply(s, output, x_bo) - x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) - s[output].reorder(x_bo0, x_co0, x_bo1, x_co1, x_bi, x_ci) - store_pt = x_co0 + # apply tiling for SRAM reuse + x_b, x_c, _, _ = s[output].op.axis + x_bo, x_bi = cfg['tile_b'].apply(s, output, 
x_b) + x_co, x_ci = cfg['tile_co'].apply(s, output, x_c) + s[output].reorder(x_bo, x_co, x_bi, x_ci) + store_pt = x_co # set all compute scopes s[dense_stage].compute_at(s[output], store_pt) @@ -145,22 +145,22 @@ def _traverse(op): # virtual threading along output channel axes if cfg['oc_nthread'].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) + _, v_t = s[output].split(x_co, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) - x_bo, x_co, x_bi, x_ci = s[dense_stage].op.axis - k_o, k_i = s[dense_stage].op.reduce_axis - s[dense_stage].reorder(x_bo, k_o, x_co, x_bi, x_ci, k_i) + x_bo, x_co, x_bi, _ = s[dense_stage].op.axis + k_o, _ = s[dense_stage].op.reduce_axis + s[dense_stage].reorder(x_bo, k_o, x_co) k_o, _ = cfg['tile_ci'].apply(s, dense_stage, k_o) s[cdata].compute_at(s[dense_stage], k_o) - s[ckernel].compute_at(s[dense_stage], k_o) + s[cweight].compute_at(s[dense_stage], k_o) # Use VTA instructions s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) + s[cweight].pragma(s[cweight].op.axis[0], env.dma_copy) s[dense_stage].tensorize(x_bi, env.gemm) - s[output].pragma(x_co1, env.dma_copy) + s[output].pragma(x_ci, env.dma_copy) return s \ No newline at end of file diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 656a939302f9..12fbc45c1c4b 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -126,7 +126,7 @@ def get_ref_data(): data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + time_f = f.time_evaluator("dense", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} @@ -163,7 +163,7 @@ def get_ref_data(): device = "CPU" elif "vta" in target.keys: device = "VTA" - print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) + print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats @@ -182,4 +182,4 @@ def _run(env, remote): vta.testing.run(_run) if __name__ == "__main__": - test_gemm("vta", 1, 16, 16) + test_gemm("vta", 16, 512, 1008) From eafd93e8ac77c2b042a5081a77be80405b1b2d4b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 17 Jun 2019 19:29:08 -0700 Subject: [PATCH 063/126] fix API change from PR3353 --- vta/scripts/relay_to_vta.py | 4 ++-- vta/scripts/tune_resnet.py | 4 ++-- vta/tutorials/resnet.py | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index c71e6f61f37a..a0b8a5fa6998 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -101,7 +101,7 @@ def classification_demo(opt): # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -109,7 +109,7 @@ def classification_demo(opt): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, 
params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 46cbffc86a35..6f3a688074f8 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -102,7 +102,7 @@ def compile_network(opt, env, target): # Get off the shelf gluon model, and convert to relay gluon_model = vision.get_model(opt.model, pretrained=True) - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -110,7 +110,7 @@ def compile_network(opt, env, target): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 9caa6cdafbd6..af86e8a32c04 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -125,9 +125,7 @@ build_start = time.time() # Start front end compilation - relay_prog, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - print(relay_prog) - # exit() + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -135,7 +133,7 @@ # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(relay_prog, params=params) + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": From a333a0747cba044035f691a209acaa425bd01220 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:41:13 -0700 Subject: [PATCH 064/126] fixing flop derivation bug --- vta/python/vta/top/vta_conv2d.py | 5 +++-- vta/python/vta/top/vta_dense.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 15d45029af82..e588a2ff0404 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -68,10 +68,11 @@ def _declaration_conv2d(cfg, pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), - name="res", tag="conv2d") + name="res", tag="conv2d_dense") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] * ishape[-1]) + return res @autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') @@ -97,7 +98,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "conv2d" + assert op.tag == "conv2d_dense" conv2d_res.append(op) _traverse(output.op) diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 3c4b0e009e03..0b4d907853e4 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -43,24 +43,27 @@ def _declaration_dense(cfg, if len(data.shape) != 4 or len(weight.shape) != 4: raise topi.InvalidShapeError() - # Derive output shape + # Derive 
shapes + ishape = topi.util.get_const_tuple(data.shape) + wshape = topi.util.get_const_tuple(weight.shape) oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) # Reduction axes (input channel) - assert(int(data.shape[1]) == int(weight.shape[1])) - assert(int(data.shape[3]) == int(weight.shape[3])) - k_o = tvm.reduce_axis((0, data.shape[1]), name='k_o') - k_i = tvm.reduce_axis((0, data.shape[3]), name='k_i') + assert(ishape[1] == wshape[1]) + assert(ishape[3] == wshape[3]) + k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') + k_i = tvm.reduce_axis((0, ishape[3]), name='k_i') res = tvm.compute( oshape, lambda b_o, c_o, b_i, c_i: tvm.sum( data[b_o, k_o, b_i, k_i].astype(out_dtype) * weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), - name="res", tag="dense") + name="res", tag="dense_pack") cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * - data.shape[1] * data.shape[3]) + ishape[1] * ishape[3]) + return res @autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') @@ -88,7 +91,7 @@ def _traverse(op): else: _traverse(tensor.op) else: - assert op.tag == "dense" + assert op.tag == "dense_pack" dense_res.append(op) _traverse(output.op) From 1c4e950132d8ec0a58b0223223c86f347bcc5c79 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:41:24 -0700 Subject: [PATCH 065/126] dense operator tuning --- vta/scripts/tune_dense.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 vta/scripts/tune_dense.py diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py new file mode 100644 index 000000000000..5f38e1bb70f8 --- /dev/null +++ b/vta/scripts/tune_dense.py @@ -0,0 +1,87 @@ +"""Tuning a single conv2d operator""" +from collections import namedtuple +import logging +import os + +import tvm +from tvm import autotvm +from tvm.contrib.util import get_lower_ir +import topi +import vta +import vta.testing + +env = vta.get_env() + +Workload = namedtuple("DenseWorkload", + ['batch', 'in_filter', 'out_filter']) + +resnet_wkls = [ + # Workloads of resnet18 on imagenet + ('resnet-18.dense', Workload(16, 512, 1024)), +] + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +def dense(N, CI, CO): + data_shape = (N//env.BATCH, CI//env.BLOCK_IN, env.BATCH, env.BLOCK_IN) + kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) + + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with tvm.target.vta(): + res = topi.nn.dense(data, kernel, None, 'int32') + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_dense([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [data, kernel, res] + +if __name__ == '__main__': + + # Logging config (for printing tuning log to the screen) + logging.basicConfig() + logging.getLogger('autotvm').setLevel(logging.DEBUG) + + # Get tracker info from env + tracket_host = os.environ.get("TVM_TRACKER_HOST", None) + tracket_port = 
int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracket_host or not tracket_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + for wl_name, wl in resnet_wkls: + + # Workload parameters + N = wl.batch + CI = wl.in_filter + CO = wl.out_filter + + task = autotvm.task.create(dense, args=(N, CI, CO), + target=tvm.target.vta(), target_host=env.target_host, template_key='direct') + print(task.config_space) + + measure_option = autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(env.TARGET, tracket_host, tracket_port, number=4, repeat=3, timeout=10000, + check_correctness=True)) + + tuner = autotvm.tuner.RandomTuner(task) + tuner.tune(n_trial=len(task.config_space), + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + + print("\nBest tuner config:") + print(tuner.best_config) From af5cfd441ee709c8aa8a9eaac97b8299e2616b04 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 11:50:56 -0700 Subject: [PATCH 066/126] tuning conv2d only --- vta/scripts/tune_resnet.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 6f3a688074f8..c715a3883add 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -73,7 +73,6 @@ def _topi_nn_conv2d(*args, **kwargs): s = tvm.create_schedule([res.op]) return s, [A, W, res] - @autotvm.task.register("topi_nn_dense", override=True) def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" @@ -228,8 +227,7 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d, - tvm.relay.op.nn.dense), + ops=(tvm.relay.op.nn.conv2d,), target=target, target_host=env.target_host) From a04a3cb2c373a7e0ce3eef5d265f8a0f7a63145c Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 17:52:18 -0700 Subject: [PATCH 067/126] skip dense layer in quant, cleanup --- python/tvm/relay/quantize/_annotate.py | 20 +++++--------------- python/tvm/relay/quantize/quantize.py | 18 ++++++++++++++++++ src/relay/pass/quantize.cc | 2 ++ src/relay/pass/quantize.h | 4 ++++ vta/tutorials/resnet.py | 2 +- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 799b553a702c..8edc690daa29 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -171,9 +171,6 @@ def conv2d_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - # print('conv2d lhs kind: {0}'.format(lhs_kind)) - # print('conv2d lhs: \n{0}'.format(lhs_expr)) - # print('\n\n\n') if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) @@ -181,6 +178,7 @@ def conv2d_rewrite(ref_call, new_args, ctx): rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -200,6 +198,8 @@ def dense_rewrite(ref_call, new_args, ctx): if check_to_skip(): return None + _set_dense_counter(cnt + 1) + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -243,8 +243,6 @@ def add_rewrite(ref_call, new_args, 
ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - # print('add lhs kind: {0}'.format(lhs_kind)) - # print('add rhs kind: {0}'.format(rhs_kind)) if lhs_kind is None and rhs_kind is None: return None @@ -254,7 +252,6 @@ def add_rewrite(ref_call, new_args, ctx): assert rhs_kind == QAnnotateKind.INPUT lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - # print('execute add with INPUT') return QAnnotateExpr(expr, QAnnotateKind.INPUT) if lhs_kind is not None and rhs_kind is None: @@ -272,12 +269,10 @@ def add_rewrite(ref_call, new_args, ctx): if lhs_kind is not None and rhs_kind is not None: if lhs_kind == QAnnotateKind.INPUT and rhs_kind == QAnnotateKind.INPUT: expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) - # print('execute add with INPUT') return QAnnotateExpr(expr, QAnnotateKind.INPUT) if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) - expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) @@ -376,6 +371,7 @@ def _register(func): return _op.op._Register(op_name, "FQVtaRewrite", func, level) return _register(frewrite) if frewrite is not None else _register + @register_relay_node class QVtaExpr(_expr.TempExpr): def __init__(self, expr): @@ -391,8 +387,6 @@ def vta_expr_check(expr): return True, expr.expr return False, expr -# def _stop_fusion(expr): -# return _quantize.make_stop_fusion(expr) @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): @@ -402,7 +396,6 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): return None _set_conv_counter(cnt + 1) - data_cond, data = vta_expr_check(new_args[0]) kernel_cond, kernel = vta_expr_check(new_args[1]) @@ -412,6 +405,7 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): ret = _forward_op(ref_call, [data, kernel]) return QVtaExpr(ret) + def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: @@ -423,10 +417,6 @@ def identity_vta_rewrite(ref_call, new_args, ctx): register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) -# @register_vta_rewrite("nn.max_pool2d") -# def pool_vta_rewrite(ref_call, new_args, ctx): -# pass - @register_vta_rewrite("add") def add_vta_rewrite(ref_call, new_args, ctx): lhs_cond, lhs = vta_expr_check(new_args[0]) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index df6f8b9e139c..487bb1c3d47e 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -140,6 +140,10 @@ def qconfig(**kwargs): Specifying which layers to be skipped. Provide a list of indices that indicate which conv2d layers to leave untouched. + skip_dense_layers: list + Specifies which dense layers to avoid. Provide a list of indices + that indicate which conv2d layers to leave untouched. + round_for_shift: boolean Whether to add bias for rounding during shift. 
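[Editor's note] As a usage sketch for the dense-skipping option documented above, the qconfig call used in the VTA tutorial changes later in this same patch looks roughly like the following; `mod` and `params` are assumed to come from `relay.frontend.from_mxnet`, and the scale/skip values are illustrative only:

    from tvm import relay

    # Assumed: mod, params produced by relay.frontend.from_mxnet(gluon_model, shape_dict)
    # Leave the first conv2d and the first dense layer unquantized (illustrative values).
    with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1):
        relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params)
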
@@ -193,6 +197,20 @@ def annotate_context(): return AnnotateContext.Current +DENSE_COUNTER = 0 + + +def _dense_counter(): + """Get the global counter for dense.""" + return DENSE_COUNTER + + +def _set_dense_counter(n): + """Set the value of the global dense counter.""" + global DENSE_COUNTER + DENSE_COUNTER = n + + def calibrate(graph, mod=None, ctx=None): """The calibrate procedure will try to calculate the content of dom_scale, nbit, clip_min, clip_max for every `simulated_quantize` diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index cb64902d74f9..72e9da681f86 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -649,6 +649,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "nbit_activation=" << op->nbit_activation << ", "; p->stream << "global_scale=" << op->global_scale << ", "; p->stream << "skip_conv_layers==" << op->skip_conv_layers << ", "; + p->stream << "skip_k_dense==" << op->skip_k_dense << ", "; + p->stream << "skip_dense_layers==" << op->skip_dense_layers << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index fce98e54459c..318ebe57e2af 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -150,6 +150,8 @@ class QConfigNode : public Node { DataType dtype_activation = Int(32); double global_scale = 8.0; Array skip_conv_layers = Array(NodePtr(nullptr)); + int skip_k_dense = 0; + Array skip_dense_layers = Array(NodePtr(nullptr)); bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); @@ -163,6 +165,8 @@ class QConfigNode : public Node { v->Visit("dtype_activation", &dtype_activation); v->Visit("global_scale", &global_scale); v->Visit("skip_conv_layers", &skip_conv_layers); + v->Visit("skip_k_dense", &skip_k_dense); + v->Visit("skip_dense_layers", &skip_dense_layers); v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index af86e8a32c04..c58f5412d974 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -132,7 +132,7 @@ dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From db7462def68339f3bb82461b55d71faec5fa274f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 18:37:32 -0700 Subject: [PATCH 068/126] support for callable build func --- python/tvm/autotvm/measure/measure_methods.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index dcdd46728e3e..1ed990f394ba 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,10 +86,9 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - - # FIXME: right now we're circumventing the 
wrap_build_func - # self.build_func = _wrap_build_func(build_func) - self.build_func = build_func + self.build_func = _wrap_build_func(build_func) + else: + self.build_func = build_func self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From ae413e5fa54cf6838a7f91681aef7f13406bf439 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:27:48 -0700 Subject: [PATCH 069/126] multiprocessing bug fix --- python/tvm/autotvm/measure/measure_methods.py | 13 +--------- python/tvm/autotvm/task/nnvm_integration.py | 24 ++++++++++++++++--- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 1ed990f394ba..7ddc6cd9ea5f 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -224,18 +224,7 @@ def set_task(self, task): for x in arg_bufs] func = build(s, arg_bufs, "llvm") tvm_buf = [nd.array(x) for x in self.ref_input] - - def _run_func(): - """Run tvm function in a thread. - Because there is some issues with python multiprocessing and the thread pool in tvm - """ - func(*tvm_buf) - - thread = threading.Thread(target=_run_func) - thread.start() - thread.join() - del thread - + func(*tvm_buf) self.ref_output = [x.asnumpy() for x in tvm_buf] def get_build_kwargs(self): diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 251a310cf7aa..9c64761662d3 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -19,6 +19,7 @@ Decorator and utilities for the integration with TOPI and NNVM """ +import threading import warnings import logging @@ -90,8 +91,16 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host logger.disabled = True nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype, - target_host=target_host, params=params) + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() logger.disabled = old_state @@ -169,7 +178,16 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, for graph, shape, dtype in zip(graphs, shapes, dtypes): nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() logger.disabled = old_state From 6e3e5b83a3abfeaf6f18e04ce4d538a136264ddb Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:37:31 -0700 Subject: [PATCH 070/126] doc --- python/tvm/relay/quantize/quantize.py | 4 ++-- src/codegen/build_module.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 487bb1c3d47e..c127484f9b54 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -141,8 +141,8 @@ def qconfig(**kwargs): that indicate which conv2d layers to leave untouched. skip_dense_layers: list - Specifies which dense layers to avoid. 
Provide a list of indices - that indicate which conv2d layers to leave untouched. + Different way of specifying which dense layers to avoid. + Provide a list of indices that indicate which conv2d layers to leave untouched. round_for_shift: boolean Whether to add bias for rounding during shift. diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 488baa9bce46..6917200ff920 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -80,9 +80,9 @@ Target CreateTarget(const std::string& target_name, } } - // if (t->device_name.length() > 0) { - // t->keys_array.push_back(ir::StringImm::make(t->device_name)); - // } + if (t->device_name.length() > 0) { + t->keys_array.push_back(ir::StringImm::make(t->device_name)); + } t->device_type = kDLCPU; t->thread_warp_size = 1; if (target_name == "c" || target_name == "llvm") { From 432a2cc240fcfcc592f5fab0902c0d2788d9e1b1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:52:42 -0700 Subject: [PATCH 071/126] skip dense layer --- vta/scripts/relay_to_vta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/scripts/relay_to_vta.py b/vta/scripts/relay_to_vta.py index a0b8a5fa6998..6d2855a83a76 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/scripts/relay_to_vta.py @@ -108,7 +108,7 @@ def classification_demo(opt): dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From 794ce529dc68c30384216e12f916aa605be65d55 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 19:58:25 -0700 Subject: [PATCH 072/126] cleanup --- package.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 package.sh diff --git a/package.sh b/package.sh deleted file mode 100755 index da227738637d..000000000000 --- a/package.sh +++ /dev/null @@ -1,6 +0,0 @@ -echo "Installing Dependencies ..." 
-echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 -sudo apt-get update -sudo apt-get install -y -q llvm-6.0 libtinfo-dev libffi-dev zlib1g-dev clinfo tree -sudo apt-get install verilator sbt From 80c4f6b209d17dce6393ce8ce2388b5df8e59e08 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 20:06:16 -0700 Subject: [PATCH 073/126] clean up --- src/relay/pass/quantize.cc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 72e9da681f86..c41ee6ac0935 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -111,14 +111,13 @@ TVM_REGISTER_API("relay._quantize.simulated_quantize") Expr QAnnotateExprNode::Realize() const { const auto& cfg = QConfig::Current(); - return expr; - // if (cfg->store_lowbit_output) { - // // store low bit output back for VTA - // const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); - // return (*f)(this->expr, static_cast(kQInput)); - // } else { - // return expr; - // } + if (cfg->store_lowbit_output) { + // store low bit output back for VTA + const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); + return (*f)(this->expr, static_cast(kQInput)); + } else { + return expr; + } } QAnnotateExpr QAnnotateExprNode::make(Expr expr, QAnnotateKind kind) { From ab1f6cd956c9be385852b9617843f16e27f744fc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:18:52 -0700 Subject: [PATCH 074/126] this ensures that relay to vta compilation works for renset-18 --- .../test_benchmark_resnet18_relay.py} | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) rename vta/{scripts/relay_to_vta.py => tests/python/integration/test_benchmark_resnet18_relay.py} (92%) diff --git a/vta/scripts/relay_to_vta.py b/vta/tests/python/integration/test_benchmark_resnet18_relay.py similarity index 92% rename from vta/scripts/relay_to_vta.py rename to vta/tests/python/integration/test_benchmark_resnet18_relay.py index 6d2855a83a76..f9cfb5a34f2b 100644 --- a/vta/scripts/relay_to_vta.py +++ b/vta/tests/python/integration/test_benchmark_resnet18_relay.py @@ -16,8 +16,8 @@ from vta.top import graph_pack -def classification_demo(opt): - """Image classification demo. +def classification_test(opt): + """ResNet-18 classification test. 
Parameters ---------- @@ -167,7 +167,18 @@ def classification_demo(opt): tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) top_categories = np.argsort(tvm_output.asnumpy()[0]) - # Report top-5 classification results + # This just checks that one of the 5 top categories + # is one variety of cat; this is by no means an accurate + # assessment of how quantization affects classification + # accuracy but is meant to catch changes to the quantization + # pass that would break basic correctness + cat_detected = False + for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True + assert(cat_detected) + + # Report latency and top-5 classification results std = np.std(tcost.results) * 1000 / env.BATCH mean = tcost.mean * 1000 / env.BATCH print("%s Prediction" % opt.model) @@ -197,4 +208,4 @@ def classification_demo(opt): opt = parser.parse_args() - classification_demo(opt) + classification_test(opt) From cce05daa65191a3f55c4f25bfd481f77e473c595 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:29:23 -0700 Subject: [PATCH 075/126] autotvm task extraction test for VTA --- .../test_autotvm_task_extraction.py | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 vta/tests/python/integration/test_autotvm_task_extraction.py diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py new file mode 100644 index 000000000000..995ea411bfbd --- /dev/null +++ b/vta/tests/python/integration/test_autotvm_task_extraction.py @@ -0,0 +1,188 @@ +"""Perform inference on VTA using Relay.""" + +import argparse, os, time +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import rpc, autotvm, relay +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.autotvm.task import extract_from_program + +def parse_arguments(): + + parser = argparse.ArgumentParser(description='Train a model for image classification.') + parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], + help='Input model name.') + parser.add_argument('--start-name', type=str, default='nn.max_pool2d', + help='The name of the node where packing starts') + parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', + help='The name of the node where packing stops') + parser.add_argument('--debug-profile', action='store_true', + help='Show layer-wise time cost profiling results') + parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], + help='Select device target') + parser.add_argument('--measurements', type=int, default=1, + help='Number of measurements during AutoTVM search') + parser.add_argument('--tuner', type=str, default="random", + help='AutoTVM search strategy') + parser.add_argument('--log-filename', type=str, default="resnet-18.log", + help='AutoTVM log file name') + + return parser.parse_args() + + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + 
const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + @autotvm.task.register("topi_nn_dense", override=True) + def _topi_nn_dense(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.dense(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_dense([res]) + else: + s = tvm.create_schedule([res.op]) + + return s, [A, W, res] + + +def compile_network(opt, env, target): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(opt.model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=opt.start_name, + stop_name=opt.stop_name) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + +if __name__ == '__main__': + + opt = parse_arguments() + + # Make sure that TVM was compiled with RPC=1 + assert tvm.module.enabled("rpc") + + # Read in VTA environment + env = vta.get_env() + + # Get remote from fleet node + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + if not tracker_host or not tracker_port: + print("Set your AutoTVM tracker node host and port variables to run the autotuner") + exit() + + # Get remote + if env.TARGET != "sim": + + # Measure build start time + reconfig_start = time.time() + + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. 
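+        # For example, with a hypothetical path to a custom bitstream file:
+        #   vta.program_fpga(remote, bitstream="/path/to/custom_vta.bit")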
+ vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + + # Report on reconfiguration time + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + + # In simulation mode, host the RPC server locally. + else: + remote = rpc.LocalSession() + + # VTA target and execution context + target = env.target if opt.device == "vta" else env.target_vta_cpu + ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Compile Relay program + relay_prog, params = compile_network(opt, env, target) + + # Perform task extraction on Relay program + tasks = extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # Check that we have extracted the right number of tasks + assert opt.model == "resnet18_v1" and len(tasks) == 10 + + print("Task extraction passed!") From 4be3cbcfcbff4162d62408fd222933a0f8ae0bfe Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:37:15 -0700 Subject: [PATCH 076/126] adding headers --- vta/scripts/tune_conv2d.py | 18 +++++++++++++++ vta/scripts/tune_dense.py | 22 +++++++++++++++++-- vta/scripts/tune_resnet.py | 19 +++++++++++++++- vta/scripts/tune_resnet_nnvm.py | 19 ++++++++++++++++ .../test_autotvm_task_extraction.py | 17 ++++++++++++++ .../test_benchmark_resnet18_relay.py | 17 ++++++++++++++ 6 files changed, 109 insertions(+), 3 deletions(-) diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 0113060a77da..f55c7e985716 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -1,4 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Tuning a single conv2d operator""" + from collections import namedtuple import logging import os diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py index 5f38e1bb70f8..237ca2754512 100644 --- a/vta/scripts/tune_dense.py +++ b/vta/scripts/tune_dense.py @@ -1,4 +1,22 @@ -"""Tuning a single conv2d operator""" +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tuning a single dense operator""" + from collections import namedtuple import logging import os @@ -81,7 +99,7 @@ def dense(N, CI, CO): tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=len(task.config_space), measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file('conv2d.log')]) + callbacks=[autotvm.callback.log_to_file('dense.log')]) print("\nBest tuner config:") print(tuner.best_config) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index c715a3883add..e89de92af531 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -1,4 +1,21 @@ -"""Perform inference on VTA using Relay.""" +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Perform ResNet autoTVM tuning on VTA using Relay.""" import argparse, os, time from mxnet.gluon.model_zoo import vision diff --git a/vta/scripts/tune_resnet_nnvm.py b/vta/scripts/tune_resnet_nnvm.py index 3a6149df267c..22a4dd5dfc78 100644 --- a/vta/scripts/tune_resnet_nnvm.py +++ b/vta/scripts/tune_resnet_nnvm.py @@ -1,3 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Perform ResNet autoTVM tuning on VTA using NNVM.""" + import argparse import os import time diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py index 995ea411bfbd..e276b5c0672f 100644 --- a/vta/tests/python/integration/test_autotvm_task_extraction.py +++ b/vta/tests/python/integration/test_autotvm_task_extraction.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Perform inference on VTA using Relay.""" import argparse, os, time diff --git a/vta/tests/python/integration/test_benchmark_resnet18_relay.py b/vta/tests/python/integration/test_benchmark_resnet18_relay.py index f9cfb5a34f2b..ced6e9db3fc7 100644 --- a/vta/tests/python/integration/test_benchmark_resnet18_relay.py +++ b/vta/tests/python/integration/test_benchmark_resnet18_relay.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Perform inference on VTA using Relay.""" import argparse, json, os, requests, time From ab3069ed1daae5ff2ce60e051c323cc27ca4e1ff Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 21:54:54 -0700 Subject: [PATCH 077/126] missing headers --- vta/python/vta/top/bitpack.py | 17 +++++++++++++++++ vta/python/vta/top/graphpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_bitpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_graphpack.py | 17 +++++++++++++++++ vta/python/vta/top/nnvm_op.py | 17 +++++++++++++++++ 5 files changed, 85 insertions(+) diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 2265af4518b4..b39a96fa263a 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """Bit packing operators""" from __future__ import absolute_import as _abs diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 770dd380403d..650465b066d0 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """A Relay implementation of graph packing.""" from tvm import relay diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py index 7b09ffbf43c0..52b3fa7d9899 100644 --- a/vta/python/vta/top/nnvm_bitpack.py +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Bit packing operators""" from __future__ import absolute_import as _abs diff --git a/vta/python/vta/top/nnvm_graphpack.py b/vta/python/vta/top/nnvm_graphpack.py index 1f713acd3e27..427001ffa5ed 100644 --- a/vta/python/vta/top/nnvm_graphpack.py +++ b/vta/python/vta/top/nnvm_graphpack.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """An NNVM implementation of graph packing.""" import nnvm diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py index ce69b2b438d1..d9c2efb550f2 100644 --- a/vta/python/vta/top/nnvm_op.py +++ b/vta/python/vta/top/nnvm_op.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs From 67ae8d13b0c63c5c4ef82c4927a196020e6cffd0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 18 Jun 2019 23:49:51 -0700 Subject: [PATCH 078/126] header --- vta/python/vta/top/op.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index abb529dbe7f1..dc4dd08c4c50 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs From 5c86609a1ae99f7b1c49cd61e342796750640ac9 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 00:08:54 -0700 Subject: [PATCH 079/126] rename test file --- .../{test_benchmark_resnet18_relay.py => test_resnet18.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vta/tests/python/integration/{test_benchmark_resnet18_relay.py => test_resnet18.py} (100%) diff --git a/vta/tests/python/integration/test_benchmark_resnet18_relay.py b/vta/tests/python/integration/test_resnet18.py similarity index 100% rename from vta/tests/python/integration/test_benchmark_resnet18_relay.py rename to vta/tests/python/integration/test_resnet18.py From 19f51fc6c7be89d58ad71f011f16b71f5102f8ce Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 00:25:38 -0700 Subject: [PATCH 080/126] lint fix --- python/tvm/autotvm/task/nnvm_integration.py | 4 ---- python/tvm/autotvm/task/relay_integration.py | 24 +++++++++----------- python/tvm/autotvm/task/topi_integration.py | 9 +++----- python/tvm/relay/quantize/_annotate.py | 12 ++++++---- src/relay/op/annotation/annotation.cc | 2 +- 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 9c64761662d3..d945abb054e2 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -24,11 +24,8 @@ import logging -from ... import target as _target - from .task import create from .topi_integration import TaskExtractEnv -from .dispatcher import ApplyHistoryBest logger = logging.getLogger('autotvm') @@ -203,4 +200,3 @@ def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, print("[Warning] Invalid shape during AutoTVM task creation") return tasks - diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index cb18653d8f37..ff55055b3c10 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -25,19 +25,17 @@ import logging -from ... import target as _target - from .task import create from .topi_integration import TaskExtractEnv logger = logging.getLogger('autotvm') -def my_build(func, - target, - target_host, - params): - """ VTA compatible relay build. +def _build(func, + target, + target_host, + params): + """ Helper to build VTA properly. """ from tvm import relay @@ -48,8 +46,8 @@ def my_build(func, import vta with vta.build_config(): return relay.build(func, target, target_host, params) - else: - return relay.build(func, target, target_host, params) + # default case + return relay.build(func, target, target_host, params) def extract_from_program(func, params, ops, target, target_host=None): """ Extract tuning tasks from a relay program. 
@@ -107,7 +105,7 @@ def extract_from_program(func, params, ops, target, target_host=None): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems - build_thread = threading.Thread(target=my_build, + build_thread = threading.Thread(target=_build, args=(func, target, target_host, @@ -187,9 +185,9 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): # wrap build call in thread to avoid multiprocessing problems build_thread = threading.Thread(target=my_build, args=(func, - target, - target_host, - params)) + target, + target_host, + params)) build_thread.start() build_thread.join() diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index ed85504e4c0a..f41d7ee934c5 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -27,10 +27,7 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ -import warnings -import sys - -from ... import _api_internal, tensor, placeholder, create_schedule +from ... import _api_internal, tensor, placeholder from .task import args_to_workload, dispatcher, register from ..util import get_const_tuple @@ -148,8 +145,8 @@ def _tracing_wrapper(*args, **kwargs): return compute_func(*args, **kwargs) - self.func_to_reflection[topi_compute](_tracing_wrapper) - self.modified_funcs.append(topi_compute) + self.func_to_reflection[compute_func](_tracing_wrapper) + self.modified_funcs.append(compute_func) _local_scope(topi_compute) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 8edc690daa29..e98f45ef96b0 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-#pylint: disable=unused-argument +#pylint: disable=unused-argument,inconsistent-return-statements """Internal module for registering attribute for annotation.""" from __future__ import absolute_import import warnings @@ -329,8 +329,10 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) + @register_annotate_function("force_cast") def force_cast_rewrite(ref_call, new_args, ctx): + """Rewrite function to force cast""" if _conv_counter() <= current_qconfig().skip_k_conv: return None expr, x_kind = _get_expr_kind(new_args[0]) @@ -390,6 +392,7 @@ def vta_expr_check(expr): @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv2d for VTA target""" cnt = _conv_counter() if cnt < current_qconfig().skip_k_conv: _set_conv_counter(cnt + 1) @@ -410,8 +413,7 @@ def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: return QVtaExpr(_forward_op(ref_call, [expr])) - else: - return None + return None register_vta_rewrite("nn.relu", identity_vta_rewrite) register_vta_rewrite("nn.max_pool2d", identity_vta_rewrite) @@ -419,6 +421,7 @@ def identity_vta_rewrite(ref_call, new_args, ctx): @register_vta_rewrite("add") def add_vta_rewrite(ref_call, new_args, ctx): + """Rewrite function for ewise add for VTA target""" lhs_cond, lhs = vta_expr_check(new_args[0]) rhs_cond, rhs = vta_expr_check(new_args[1]) if lhs_cond and rhs_cond: @@ -427,5 +430,4 @@ def add_vta_rewrite(ref_call, new_args, ctx): return _forward_op(ref_call, [lhs, rhs]) elif lhs_cond and not rhs_cond: return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) - else: - return None + return None diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 789c85e39074..e6d41073e473 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -89,7 +89,7 @@ Expr ForceCast(Expr data) { } RELAY_REGISTER_OP("force_cast") -.describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" +.describe(R"code(Annotate an expression to force a cast.)code" TVM_ADD_FILELINE) .set_num_inputs(1) .add_argument("data", "Tensor", "The input data.") From 49689bcc5f90865c46254c83a373123e9db7094d Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:09:19 -0700 Subject: [PATCH 081/126] another set of lint fixes --- vta/python/vta/build_module.py | 12 ++++----- vta/python/vta/testing/util.py | 5 +++- vta/python/vta/top/bitpack.py | 3 ++- vta/python/vta/top/graphpack.py | 42 ++++++++++++++++++++---------- vta/python/vta/top/nnvm_bitpack.py | 5 ++-- vta/python/vta/top/op.py | 27 ++++++++++++------- vta/python/vta/top/vta_conv2d.py | 14 +++++----- vta/python/vta/top/vta_dense.py | 19 +++++++------- 8 files changed, 75 insertions(+), 52 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 91e3c4a7e0d8..71fc0d3283c6 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=unused-argument """VTA specific buildin for runtime.""" from __future__ import absolute_import as _abs @@ -129,8 +130,6 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError tic = time.time() - # simulator stats - stats = {} try: filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) target, task, config = measure_input @@ -143,7 +142,7 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): func = build(s, args, target_host=task.target_host) sim = build(s, args) - arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) + arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) func.export_library(filename) # When targeting VTA test the schedule on simulator first @@ -164,16 +163,15 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): f = remote.load_module(os.path.split(sim_path)[1]) ctx = remote.context(str(measure_input.target), 0) args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - simulator.clear_stats() + # Skip execution just to verify correctness simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) f(*args) - stats = simulator.stats() # check by local simulator ctx = tvm.context(str(target)) args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] sim(*args) - except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) + except Exception as ex: # pylint: disable=broad-except + return BuildResult(None, None, ex, time.time() - tic) return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index b009b7f27fd3..30760409733c 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -60,7 +60,10 @@ def run(run_func): pynq_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) # Run device from fleet node if env variables are defined if tracket_host and tracket_port: - remote = autotvm.measure.request_remote(env.TARGET, tracket_host, tracket_port, timeout=10000) + remote = autotvm.measure.request_remote(env.TARGET, + tracket_host, + tracket_port, + timeout=10000) run_func(env, remote) else: # Next, run on PYNQ if env variables are defined diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index b39a96fa263a..d4748faad6a7 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=ungrouped-imports """Bit packing operators""" from __future__ import absolute_import as _abs @@ -76,7 +77,7 @@ def _bitpack(*indices): @register_compute("bitpack", level=15) -def compute_bitpack(attrs, inputs, output_type, target): +def compute_bitpack(attrs, inputs): lanes = attrs.lanes dtype = inputs[0].dtype assert dtype == "int8" diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 650465b066d0..c8f39c87a9c6 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +# pylint: disable=unused-argument """A Relay implementation of graph packing.""" from tvm import relay @@ -54,8 +54,8 @@ def _pack_weight(data, dshape, cfactor): assert int(dshape[1]) % cfactor == 0 data = op.reshape(data, newshape=(int(dshape[0]) // cfactor, cfactor, - int(dshape[1]) // cfactor, cfactor, - int(dshape[2]), int(dshape[3]))) + int(dshape[1]) // cfactor, cfactor, + int(dshape[2]), int(dshape[3]))) data = op.transpose( data, axes=(0, 2, 4, 5, 1, 3)) return data @@ -92,8 +92,8 @@ def _pack_bias(data, dshape, dtype, bfactor, cfactor): # broadcast batch dimension to bfactor data = op.broadcast_to( - data, - shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) + data, + shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor)) return data @@ -103,6 +103,8 @@ def _get_shape(node): return _to_shape(node.checked_type.shape) class ExprPack(ExprMutator): + """Visitor to perform graph packing on an AST. + """ def __init__(self, bfactor, cfactor, weight_bits): self.bfactor = bfactor self.cfactor = cfactor @@ -196,13 +198,22 @@ def visit_call(self, call): pass elif call.op == self.add and len(input_types[1].shape) == 3: data, bias = args - bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + bias = _pack_bias(bias, + _to_shape(input_types[1].shape), + input_types[1].dtype, + self.bfactor, + self.cfactor) return relay.Call(self.add, [data, bias]) elif self.start_pack and call.op == self.bias_add: data, bias = args - bias = _pack_bias(bias, _to_shape(input_types[1].shape), input_types[1].dtype, self.bfactor, self.cfactor) + bias = _pack_bias(bias, + _to_shape(input_types[1].shape), + input_types[1].dtype, + self.bfactor, + self.cfactor) return relay.Call(self.add, [data, bias]) - elif self.start_pack and call.op == op.op.get('cast') and input_types[0].dtype == 'int32': + elif self.start_pack and call.op == op.op.get('cast') and \ + input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) return relay.Call(op.op.get('copy'), [cast]) @@ -214,15 +225,18 @@ def visit_call(self, call): class BT(Exception): pass def get_subgraph(expr, start_name, stop_name): - "we assume stop_name only appear once for simplicity." - "this constraint will be lifted in the future." - "bitpack_start and bitpack_end is both inclusive" + """ We assume stop_name only appears once for simplicity. + This constraint will be lifted in the future. 
+ bitpack_start and bitpack_end are both inclusive + """ bitpack_start = op.op.get('bitpack_start') bitpack_end = op.op.get('bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) def recursion(anf, start_found, stop_found): if isinstance(anf, relay.expr.Function): - return relay.expr.Function(anf.params, recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) + return relay.expr.Function(anf.params, + recursion(anf.body, start_found, stop_found), + anf.ret_type, anf.type_params, anf.attrs) elif isinstance(anf, relay.expr.Let): value = anf.value if isinstance(value, relay.expr.Call): @@ -239,7 +253,8 @@ def recursion(anf, start_found, stop_found): assert not stop_found stop_found = True value = relay.expr.Call(bitpack_end, [value]) - return relay.expr.Let(anf.var, value, anf.body) # todo: check anf.body has no more stop_name beside that one + # todo: check anf.body has no more stop_name beside that one + return relay.expr.Let(anf.var, value, anf.body) else: assert start_found assert stop_found @@ -289,4 +304,3 @@ def graph_pack(expr, expr = packer.visit(expr) assert not packer.start_pack return relay.ir_pass.infer_type(expr) - diff --git a/vta/python/vta/top/nnvm_bitpack.py b/vta/python/vta/top/nnvm_bitpack.py index 52b3fa7d9899..0dc241330339 100644 --- a/vta/python/vta/top/nnvm_bitpack.py +++ b/vta/python/vta/top/nnvm_bitpack.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +# pylint: disable=unused-argument """Bit packing operators""" from __future__ import absolute_import as _abs @@ -22,7 +22,6 @@ from topi import util from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn from nnvm.top.tensor import _fschedule_broadcast def bitpack(data, bits, pack_type="int8", name="bitpack"): @@ -84,4 +83,4 @@ def compute_bitpack(attrs, inputs, out): return bitpack(inputs[0], bits, dtype) reg.register_schedule("bitpack", _fschedule_broadcast) -reg.register_pattern("bitpack", OpPattern.INJECTIVE) \ No newline at end of file +reg.register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index dc4dd08c4c50..da3d7eb900ef 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -14,12 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +# pylint: disable=unused-argument, ungrouped-imports """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" from __future__ import absolute_import as _abs -import logging - import tvm import topi @@ -68,9 +66,20 @@ def compute_conv2d(attrs, inputs, output_type, target): assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] - else: - return [topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype)] + return [topi.nn.conv2d(inputs[0], + inputs[1], + strides, + padding, + dilation, + layout, + out_dtype)] + return [topi.nn.group_conv2d_nchw(inputs[0], + inputs[1], + strides, + padding, + dilation, + groups, + out_dtype)] with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) @@ -87,12 +96,10 @@ def schedule_conv2d(attrs, outs, target): if target.device_name == "vta": if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) - else: - return topi.generic.schedule_group_conv2d_nchw(outs) + return topi.generic.schedule_group_conv2d_nchw(outs) elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("Target %s is not supported" % target) + raise RuntimeError("Target %s is not supported" % target) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index e588a2ff0404..c455f535d93c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -88,7 +88,7 @@ def _schedule_conv2d(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - if len(op.axis) == 0: + if not op.axis: const_ops.append(op) else: ewise_ops.append(op) @@ -107,13 +107,13 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, h, w, _, _ = s[conv2d_stage].op.axis - ci, _, _, _ = s[conv2d_stage].op.reduce_axis + b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis + c_i, _, _, _ = s[conv2d_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_h', h, num_outputs=2) - cfg.define_split('tile_w', w, num_outputs=2) - cfg.define_split('tile_ci', ci, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_h', x_i, num_outputs=2) + cfg.define_split('tile_w', x_j, num_outputs=2) + cfg.define_split('tile_ci', c_i, num_outputs=2) + cfg.define_split('tile_co', c_o, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) cfg.define_knob('h_nthread', [1, 2]) ###### space definition end ###### diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 0b4d907853e4..9d6c19c5af20 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=unused-argument """Dense operator declaration and schedule registration for VTA.""" import numpy as np @@ -49,8 +50,8 @@ def _declaration_dense(cfg, oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) # Reduction axes (input channel) - assert(ishape[1] == wshape[1]) - assert(ishape[3] == wshape[3]) + assert ishape[1] == wshape[1] + assert ishape[3] == wshape[3] k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') k_i = tvm.reduce_axis((0, ishape[3]), name='k_i') res = tvm.compute( @@ -69,7 +70,7 @@ def _declaration_dense(cfg, @autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') def _schedule_dense(cfg, outs): """Packed dense schedule.""" - + assert len(outs) == 1 output = outs[0] const_ops = [] @@ -81,7 +82,7 @@ def _schedule_dense(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - if len(op.axis) == 0: + if not op.axis: const_ops.append(op) else: ewise_ops.append(op) @@ -100,11 +101,11 @@ def _traverse(op): s = tvm.create_schedule(output.op) ##### space definition begin ##### - b, co, _, _ = s[dense_stage].op.axis - ci, _ = s[dense_stage].op.reduce_axis + b, c_o, _, _ = s[dense_stage].op.axis + c_i, _ = s[dense_stage].op.reduce_axis cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_ci', ci, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_split('tile_ci', c_i, num_outputs=2) + cfg.define_split('tile_co', c_o, num_outputs=2) cfg.define_knob('oc_nthread', [1, 2]) ###### space definition end ###### @@ -166,4 +167,4 @@ def _traverse(op): s[dense_stage].tensorize(x_bi, env.gemm) s[output].pragma(x_ci, env.dma_copy) - return s \ No newline at end of file + return s From e6f2187e35e336d58df0d629b5fd8dd1b9edc3cc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:14:24 -0700 Subject: [PATCH 082/126] lint fix --- vta/python/vta/build_module.py | 4 ++-- vta/python/vta/top/graphpack.py | 16 +++++++++------- vta/python/vta/top/nnvm_op.py | 10 +++++----- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 71fc0d3283c6..854dd4daf14a 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -172,6 +172,6 @@ def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] sim(*args) - except Exception as ex: # pylint: disable=broad-except - return BuildResult(None, None, ex, time.time() - tic) + except Exception as exc: # pylint: disable=broad-except + return BuildResult(None, None, exc, time.time() - tic) return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index c8f39c87a9c6..c6cc49748bac 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -70,8 +70,8 @@ def _pack_weight_conv2d_transpose(data, dshape, cfactor): assert dshape[1] % cfactor == 0 data = op.reshape(data, newshape=(dshape[0] // cfactor, cfactor, - dshape[1] // cfactor, cfactor, - dshape[2], dshape[3])) + dshape[1] // cfactor, cfactor, + dshape[2], dshape[3])) data = op.transpose( data, axes=(2, 0, 4, 5, 3, 1)) return data @@ -227,15 +227,17 @@ class BT(Exception): def get_subgraph(expr, start_name, stop_name): """ We assume stop_name only appears once for simplicity. This constraint will be lifted in the future. 
- bitpack_start and bitpack_end are both inclusive + bitpack_start and bitpack_end are both inclusive. """ bitpack_start = op.op.get('bitpack_start') bitpack_end = op.op.get('bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) - def recursion(anf, start_found, stop_found): + def _recursion(anf, start_found, stop_found): + """ Helper to obtain the subgraph. + """ if isinstance(anf, relay.expr.Function): return relay.expr.Function(anf.params, - recursion(anf.body, start_found, stop_found), + _recursion(anf.body, start_found, stop_found), anf.ret_type, anf.type_params, anf.attrs) elif isinstance(anf, relay.expr.Let): value = anf.value @@ -247,7 +249,7 @@ def recursion(anf, start_found, stop_found): elif value.op.name == stop_name: raise BT() try: - return relay.expr.Let(anf.var, value, recursion(anf.body, start_found, stop_found)) + return relay.expr.Let(anf.var, value, _recursion(anf.body, start_found, stop_found)) except BT: assert start_found assert not stop_found @@ -259,7 +261,7 @@ def recursion(anf, start_found, stop_found): assert start_found assert stop_found return anf - annotated = recursion(anf, False, False) + annotated = _recursion(anf, False, False) return relay.ir_pass.infer_type(relay.ir_pass.to_graph_normal_form(annotated)) def graph_pack(expr, diff --git a/vta/python/vta/top/nnvm_op.py b/vta/python/vta/top/nnvm_op.py index d9c2efb550f2..a38b2172671b 100644 --- a/vta/python/vta/top/nnvm_op.py +++ b/vta/python/vta/top/nnvm_op.py @@ -92,9 +92,10 @@ def compute_conv2d(attrs, inputs, out): assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" inputs = list(inputs) assert inputs[1].dtype == "int8" - return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) - else: - return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype) + return topi.nn.conv2d(inputs[0], inputs[1], strides, + padding, dilation, layout, out_dtype) + return topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, + padding, dilation, groups, out_dtype) with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, out) @@ -110,8 +111,7 @@ def schedule_conv2d(attrs, outs, target): if target.device_name == "vta": if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) - else: - return topi.generic.schedule_group_conv2d_nchw(outs) + return topi.generic.schedule_group_conv2d_nchw(outs) elif str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) else: From 2a1b76ea081f4e781985e2c4295b8ac2e74b84ee Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 01:24:13 -0700 Subject: [PATCH 083/126] compiler warnings --- src/relay/pass/quantize.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index c41ee6ac0935..3d5802307af3 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,7 +388,6 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { - static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; From b0fdab0127680a5d4021b10d0b127fd09c5a7594 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 11:33:42 -0700 Subject: [PATCH 084/126] removing ci tests for now that require changes to the packages on the test machine --- .../test_autotvm_task_extraction.py | 205 ---------------- 
vta/tests/python/integration/test_resnet18.py | 228 ------------------ 2 files changed, 433 deletions(-) delete mode 100644 vta/tests/python/integration/test_autotvm_task_extraction.py delete mode 100644 vta/tests/python/integration/test_resnet18.py diff --git a/vta/tests/python/integration/test_autotvm_task_extraction.py b/vta/tests/python/integration/test_autotvm_task_extraction.py deleted file mode 100644 index e276b5c0672f..000000000000 --- a/vta/tests/python/integration/test_autotvm_task_extraction.py +++ /dev/null @@ -1,205 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Perform inference on VTA using Relay.""" - -import argparse, os, time -from mxnet.gluon.model_zoo import vision -import numpy as np -from PIL import Image - -import topi -import tvm -from tvm import rpc, autotvm, relay -from tvm.autotvm.measure.measure_methods import request_remote -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -import vta -from vta.testing import simulator -from vta.top import graph_pack -from tvm.autotvm.task import extract_from_program - -def parse_arguments(): - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], - help='Select device target') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements during AutoTVM search') - parser.add_argument('--tuner', type=str, default="random", - help='AutoTVM search strategy') - parser.add_argument('--log-filename', type=str, default="resnet-18.log", - help='AutoTVM log file name') - - return parser.parse_args() - - -def register_vta_tuning_tasks(): - from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args - - @tvm.tag_scope(tag=topi.tag.ELEMWISE) - def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - - # init autotvm env to register VTA 
operator - TaskExtractEnv() - - @autotvm.task.register("topi_nn_conv2d", override=True) - def _topi_nn_conv2d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - - with tvm.target.vta(): - res = topi.nn.conv2d(*args, **kwargs) - res = topi.right_shift(res, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_conv2d_nchw([res]) - else: - s = tvm.create_schedule([res.op]) - return s, [A, W, res] - - @autotvm.task.register("topi_nn_dense", override=True) - def _topi_nn_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - - with tvm.target.vta(): - res = topi.nn.dense(*args, **kwargs) - res = topi.right_shift(res, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.current_target().device_name == 'vta': - s = topi.generic.schedule_dense([res]) - else: - s = tvm.create_schedule([res.op]) - - return s, [A, W, res] - - -def compile_network(opt, env, target): - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): - relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - return relay_prog, params - -if __name__ == '__main__': - - opt = parse_arguments() - - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - - # Read in VTA environment - env = vta.get_env() - - # Get remote from fleet node - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # Get remote - if env.TARGET != "sim": - - # Measure build start time - reconfig_start = time.time() - - # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - - # In simulation mode, host the RPC server locally. 
- else: - remote = rpc.LocalSession() - - # VTA target and execution context - target = env.target if opt.device == "vta" else env.target_vta_cpu - ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) - - # Register VTA tuning tasks - register_vta_tuning_tasks() - - # Compile Relay program - relay_prog, params = compile_network(opt, env, target) - - # Perform task extraction on Relay program - tasks = extract_from_program(func=relay_prog, - params=params, - ops=(tvm.relay.op.nn.conv2d,), - target=target, - target_host=env.target_host) - - # Check that we have extracted the right number of tasks - assert opt.model == "resnet18_v1" and len(tasks) == 10 - - print("Task extraction passed!") diff --git a/vta/tests/python/integration/test_resnet18.py b/vta/tests/python/integration/test_resnet18.py deleted file mode 100644 index ced6e9db3fc7..000000000000 --- a/vta/tests/python/integration/test_resnet18.py +++ /dev/null @@ -1,228 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Perform inference on VTA using Relay.""" - -import argparse, json, os, requests, time -from io import BytesIO -from mxnet.gluon.model_zoo import vision -import numpy as np -from os.path import join, isfile -from PIL import Image - -import tvm -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -import vta -from vta.testing import simulator -from vta.top import graph_pack - - -def classification_test(opt): - """ResNet-18 classification test. 
- - Parameters - ---------- - opt: a dictionary obtained from argparse - """ - - # Make sure that TVM was compiled with RPC=1 - assert tvm.module.enabled("rpc") - - # Read in VTA environment - env = vta.get_env() - - # Download ImageNet Categories - url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" - categ_fn = "synset.txt" - for fn in ["synset.txt"]: - if not isfile(fn): - download.download(join(url, fn), fn) - synset = eval(open(categ_fn).read()) - - # Download test image - image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' - response = requests.get(image_url) - - # Prepare test image for inference - image = Image.open(BytesIO(response.content)).resize((224, 224)) - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - image = np.repeat(image, env.BATCH, axis=0) - - # For tuning, make sure tracker variables are set - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - # We configure both the bitstream and the runtime system on the Pynq - # to match the VTA configuration specified by the vta_config.json file. - if env.TARGET != "sim": - - # Measure build start time - reconfig_start = time.time() - - # Get remote from fleet node - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - - # Report on reconfiguration time - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - - # In simulation mode, host the RPC server locally. 
- else: - remote = rpc.LocalSession() - - # Create a TVM target and execution context - target = env.target if opt.device == "vta" else env.target_vta_cpu - ctx = remote.ext_dev(0) if opt.device == "vta" else remote.cpu(0) - - # Get tophub schedules - with autotvm.tophub.context(target): - - # Measure build start time - build_start = time.time() - - # Derive the LLVM compiler flags - # When targetting the Pynq/Ultra-96, cross-compile to ARM ISA - target_host = env.target_host - - # Populate the shape and data type dictionary - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # Get off the shelf gluon model, and convert to relay - gluon_model = vision.get_model(opt.model, pretrained=True) - mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): - relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) - - # Perform graph packing and constant folding for VTA target - if target.device_name == "vta": - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - relay_prog, - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=opt.start_name, - stop_name=opt.stop_name) - relay_prog = relay.ir_pass.fold_constant(relay_prog) - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=target_host) - else: - with vta.build_config(): - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(opt.model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Send the inference library over to the remote RPC server - temp = util.tempdir() - lib.save(temp.relpath("graphlib.o")) - remote.upload(temp.relpath("graphlib.o")) - lib = remote.load_module("graphlib.o") - - # If detailed runtime info is needed build with debug runtime - if opt.debug_profile: - m = debug_runtime.create(graph, lib, ctx) - else: - m = graph_runtime.create(graph, lib, ctx) - - # Set the network parameters and inputs - m.set_input(**params) - m.set_input('data', image) - - # Perform inference - timer = m.module.time_evaluator("run", ctx, number=1, repeat=opt.measurements) - tcost = timer() - - # Display profile information - if opt.debug_profile: - m.run() - - # Get classification results - tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) - top_categories = np.argsort(tvm_output.asnumpy()[0]) - - # This just checks that one of the 5 top categories - # is one variety of cat; this is by no means an accurate - # assessment of how quantization affects classification - # accuracy but is meant to catch changes to the quantization - # pass that would break basic correctness - cat_detected = False - for k in top_categories[-5:]: - if "cat" in synset[k]: - cat_detected = True - assert(cat_detected) - - # Report latency and top-5 classification results - std = np.std(tcost.results) * 1000 / env.BATCH - mean = tcost.mean * 1000 / env.BATCH - print("%s Prediction" % opt.model) - print(" #1:", synset[top_categories[-1]]) - 
print(" #2:", synset[top_categories[-2]]) - print(" #3:", synset[top_categories[-3]]) - print(" #4:", synset[top_categories[-4]]) - print(" #5:", synset[top_categories[-5]]) - print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--model', type=str, default='resnet18_v1', choices=['resnet18_v1'], - help='Input model name.') - parser.add_argument('--start-name', type=str, default='nn.max_pool2d', - help='The name of the node where packing starts') - parser.add_argument('--stop-name', type=str, default='nn.global_avg_pool2d', - help='The name of the node where packing stops') - parser.add_argument('--debug-profile', action='store_true', - help='Show layer-wise time cost profiling results') - parser.add_argument('--device', default='vta', choices=['vta', 'arm_cpu'], - help='Select device target') - parser.add_argument('--measurements', type=int, default=1, - help='Number of measurements') - - opt = parser.parse_args() - - classification_test(opt) From a6ffab3a2ef188a3e235cf4c9ef05c941ba928f7 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 12:21:40 -0700 Subject: [PATCH 085/126] ci fix due to TaskExtractEnv API change --- .../graph_tuner/utils/traverse_graph.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 08f1017e7fb8..dfdbfe31e5e3 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -65,18 +65,19 @@ def expr2graph(expr, target_ops, node_dict, node_list): % op_name) topi_funcs += OP2COMPUTE[op_name] env.reset(topi_funcs) - _expr2graph_impl(expr, target_ops, node_dict, node_list) - task_pos = 0 - for node_entry in node_list: - if node_entry["op"] in target_ops: - task_name, args = env.task_collection[task_pos] - task = autotvm.task.create(task_name, args, - target="llvm", - target_host=None, - template_key='direct') - node_entry["workloads"] = [task.workload] - node_entry["topi_op"] = [task_name] - task_pos += 1 + with env: + _expr2graph_impl(expr, target_ops, node_dict, node_list) + task_pos = 0 + for node_entry in node_list: + if node_entry["op"] in target_ops: + task_name, args = env.task_collection[task_pos] + task = autotvm.task.create(task_name, args, + target="llvm", + target_host=None, + template_key='direct') + node_entry["workloads"] = [task.workload] + node_entry["topi_op"] = [task_name] + task_pos += 1 def _expr2graph_impl(expr, target_ops, node_dict, node_list): From 30e8ad0db4b766e8d6f3a52258f8e66eeeb43892 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 12:28:33 -0700 Subject: [PATCH 086/126] lint fix --- python/tvm/autotvm/graph_tuner/utils/traverse_graph.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index dfdbfe31e5e3..c0debaedede0 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -72,9 +72,9 @@ def expr2graph(expr, target_ops, node_dict, node_list): if node_entry["op"] in target_ops: task_name, args = env.task_collection[task_pos] task = autotvm.task.create(task_name, args, - target="llvm", - target_host=None, - 
template_key='direct') + target="llvm", + target_host=None, + template_key='direct') node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 From 07eb36e239b8c4508ed7c3ef1298b0665997c5d0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:33:22 -0700 Subject: [PATCH 087/126] reorganize vta tutorial page; added more comments to e2e resnet --- docs/conf.py | 4 +- vta/tutorials/README.txt | 1 + vta/tutorials/frontend/README.txt | 4 ++ .../deploy_resnet_on_vta.py} | 68 +++++++++++++++---- .../{ => optimize}/convolution_opt.py | 0 .../{ => optimize}/matrix_multiply_opt.py | 0 6 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 vta/tutorials/frontend/README.txt rename vta/tutorials/{resnet.py => frontend/deploy_resnet_on_vta.py} (75%) rename vta/tutorials/{ => optimize}/convolution_opt.py (100%) rename vta/tutorials/{ => optimize}/matrix_multiply_opt.py (100%) diff --git a/docs/conf.py b/docs/conf.py index a1b66325a527..d9eea045a97b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -215,7 +215,9 @@ def run_doxygen(folder): '../tutorials/autotvm', '../tutorials/dev', '../tutorials/topi', - '../tutorials/deployment']) + '../tutorials/deployment', + '../vta/tutorials/frontend', + '../vta/tutorials/optimize']) def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" diff --git a/vta/tutorials/README.txt b/vta/tutorials/README.txt index 1ba48b0b1fad..3d3858b111ba 100644 --- a/vta/tutorials/README.txt +++ b/vta/tutorials/README.txt @@ -1,2 +1,3 @@ VTA Tutorials ============= +This page contains tutorials about VTA and how to use TVM/Relay to target VTA. diff --git a/vta/tutorials/frontend/README.txt b/vta/tutorials/frontend/README.txt new file mode 100644 index 000000000000..319506d21f8f --- /dev/null +++ b/vta/tutorials/frontend/README.txt @@ -0,0 +1,4 @@ +.. _tutorial-frontend: + +Compile Deep Learning Models +---------------------------- diff --git a/vta/tutorials/resnet.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py similarity index 75% rename from vta/tutorials/resnet.py rename to vta/tutorials/frontend/deploy_resnet_on_vta.py index c58f5412d974..e2b536b798ad 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -15,20 +15,28 @@ # specific language governing permissions and limitations # under the License. """ -ResNet Inference Example -======================== +Deploy Pretrained ResNet Model from MxNet on VTA +================================================ **Author**: `Thierry Moreau `_ This tutorial provides an end-to-end demo, on how to run ResNet-18 inference onto the VTA accelerator design to perform ImageNet classification tasks. - +It showcases Relay as a front end compiler that can perform quantization (VTA +only supports int8/32 inference) as well as graph packing (in order to enable +tensorization in the core) to massage the compute graph for the hardware target. """ - ###################################################################### -# Import Libraries -# ---------------- -# We start by importing libraries to run this example. +# Install dependencies +# -------------------- +# To use the autotvm package in tvm, we need to install some extra dependencies. +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user mxnet requests pillow +# +# Now return to the python code. Import packages. 
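As a rough illustration of the graph packing mentioned in the tutorial introduction above (not part of the patch itself), the packed NCHW1n16c layout consumed by VTA's GEMM core can be reproduced with numpy, assuming the default configuration used elsewhere in this series (BATCH=1, BLOCK_IN=BLOCK_OUT=16):

import numpy as np

# A conv2d input in NCHW layout, e.g. (1, 64, 56, 56), is split along the batch
# and channel axes and transposed into the packed NCHW1n16c layout that shows up
# in the extracted conv2d tasks later in this series:
x = np.zeros((1, 64, 56, 56), dtype="int8")
x_packed = x.reshape(1, 1, 4, 16, 56, 56).transpose(0, 2, 4, 5, 1, 3)
assert x_packed.shape == (1, 4, 56, 56, 1, 16)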
from __future__ import absolute_import, print_function @@ -56,7 +64,7 @@ ###################################################################### # Define the platform and model targets -# ---------------- +# ------------------------------------- # Execute on CPU vs. VTA, and define the model. # Load VTA parameters from the vta/config/vta_config.json file @@ -68,6 +76,9 @@ target = env.target if device == "vta" else env.target_vta_cpu # Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. model = "resnet18_v1" start_pack="nn.max_pool2d" stop_pack="nn.global_avg_pool2d" @@ -80,9 +91,14 @@ if env.TARGET != "sim": - # Get remote from fleet node if environment variable is set + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. tracker_host = os.environ.get("TVM_TRACKER_HOST", None) tracker_port = int(os.environ.get("TVM_TRACKER_PORT", None)) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") device_port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) if not tracker_host or not tracker_port: @@ -107,9 +123,19 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the inference runtime -# ------------------------ -# Build ResNet from Gluon with Relay. +# Build the inference graph runtime +# --------------------------------- +# Grab ResNet-18 model from Gluon model zoo and compile with Relay. +# The compilation steps are: +# 1) Front end translation from MxNet into Relay module. +# 2) Apply 8-bit quantization: here we skip the first conv layer, +# and dense layer which will both be executed in fp32 on the CPU. +# 3) Perform graph packing to alter the data layout for tensorization. +# 4) Perform constant folding to reduce number of operators (e.g. eliminate +# batch norm multiply). +# 5) Perform relay build to object file. +# 6) Load the object file onto remote (FPGA device). +# 7) Generate graph runtime, `m`. # Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): @@ -174,8 +200,10 @@ ###################################################################### # Perform ResNet-18 inference -# ------------------------ +# --------------------------- # We run classification on an image sample from ImageNet +# We just need to download the categories files, `synset.txt` +# and an input test image. 
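A condensed sketch of steps 2 through 5 enumerated above, reusing the `mod`, `params`, `start_pack`, `stop_pack`, `env` and `target` names defined in this tutorial (an illustrative summary, not the file's exact content):

# Step 2: quantize to int8 in Relay
with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1):
    relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params)

# Steps 3 and 4: pack the graph for VTA tensorization, then fold constants
relay_prog = graph_pack(relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH,
                        start_name=start_pack, stop_name=stop_pack)
relay_prog = relay.ir_pass.fold_constant(relay_prog)

# Step 5: build to an object file for the VTA target
with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    with vta.build_config():
        graph, lib, params = relay.build(relay_prog, target=target,
                                         params=params, target_host=env.target_host)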
# Download ImageNet categories categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" @@ -201,7 +229,8 @@ m.set_input(**params) m.set_input('data', image) -# Perform inference +# Perform inference: we run the module 4 times, +# and repeat 3 times to get error bounds timer = m.module.time_evaluator("run", ctx, number=4, repeat=3) tcost = timer() @@ -219,3 +248,14 @@ print(" #4:", synset[top_categories[-4]]) print(" #5:", synset[top_categories[-5]]) print("Performed inference in %.2fms/sample (std = %.2f)" % (mean, std)) + +# This just checks that one of the 5 top categories +# is one variety of cat; this is by no means an accurate +# assessment of how quantization affects classification +# accuracy but is meant to catch changes to the +# quantization pass that would accuracy in the CI. +cat_detected = False +for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True +assert(cat_detected) \ No newline at end of file diff --git a/vta/tutorials/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py similarity index 100% rename from vta/tutorials/convolution_opt.py rename to vta/tutorials/optimize/convolution_opt.py diff --git a/vta/tutorials/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py similarity index 100% rename from vta/tutorials/matrix_multiply_opt.py rename to vta/tutorials/optimize/matrix_multiply_opt.py From f18de91d5737d6405423ab9e44880c9a6cd94b2b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:34:16 -0700 Subject: [PATCH 088/126] missing readme file for sphynx gallery --- vta/tutorials/optimize/README.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 vta/tutorials/optimize/README.txt diff --git a/vta/tutorials/optimize/README.txt b/vta/tutorials/optimize/README.txt new file mode 100644 index 000000000000..b051548c5351 --- /dev/null +++ b/vta/tutorials/optimize/README.txt @@ -0,0 +1,2 @@ +Optimize Tensor Operators +------------------------- From 0985a2135bb6c5af68e3b2f25291b9bc9b0f5e34 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 13:45:10 -0700 Subject: [PATCH 089/126] ci fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index ff55055b3c10..e71c076e26d0 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -40,7 +40,7 @@ def _build(func, from tvm import relay - if "vta" in target.device_name: + if "vta" in str(target): with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name == "vta": import vta From 0d454d819e33cbf435c48436bf211467cb56df8d Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 14:38:41 -0700 Subject: [PATCH 090/126] quantization ci fix --- python/tvm/relay/quantize/quantize.py | 32 ++++++++++++------- vta/scripts/tune_resnet.py | 5 ++- .../frontend/deploy_resnet_on_vta.py | 5 ++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index c127484f9b54..4f3ff60a8c06 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -156,6 +156,9 @@ def qconfig(**kwargs): is None, which means will try to call all operartors' annotate rewrite function. + target_vta: boolean + Whether we are performing quantization for VTA. 
+ Returns ------- config: QConfig @@ -355,6 +358,8 @@ def quantize(graph, params=None, dataset=None): if params: graph = _bind_params(graph, params) + cfg = current_qconfig() + mod = _module.Module.from_expr(graph) # Perform "SimplifyInference", "FoldScaleAxis", "FoldConstant", and # "CanonicalizeOps" optimization before quantization. @@ -366,15 +371,20 @@ def quantize(graph, params=None, dataset=None): calibrate_pass = _transform.function_pass(calibrate, opt_level=1, name="QuantizeCalibrate") - quantize_seq = _transform.Sequential([annotate(), - calibrate_pass, - realize(), - _transform.FoldConstant()]) - with annotate_context(): - with _transform.PassContext(opt_level=3, - required_pass=["QuantizeAnnotate", - "QuantizeCalibrate", - "QuantizeRealize"]): - mod = optimize(mod) - mod = quantize_seq(mod) + # Quantize pass list + quant_passes = [annotate(), + calibrate_pass, + realize(), + _transform.FoldConstant()] + # Add rewrite_for_vta() pass if target is VTA + if cfg.target_vta: + quant_passes = [rewrite_for_vta()] + quant_passes + quantize_seq = _transform.Sequential(quant_passes) + with _transform.PassContext(opt_level=3, + required_pass=["QuantizeAnnotate", + "QuantizeCalibrate", + "QuantizeRealize"]): + mod = optimize(mod) + mod = quantize_seq(mod) + return mod[mod.entry_func.name_hint] diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index e89de92af531..5469b6b40b6e 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -125,7 +125,10 @@ def compile_network(opt, env, target): dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1): + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index e2b536b798ad..7b5e6b2e730e 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -158,7 +158,10 @@ dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay - with relay.quantize.qconfig(global_scale=8.0, skip_k_conv=1, skip_k_dense=1): + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From 655c0a55b4a68d132cd1b9bc2afac31fa819f9bc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 15:31:28 -0700 Subject: [PATCH 091/126] ci fix for nnvm task extraction --- nnvm/python/nnvm/top/nn.py | 2 +- python/tvm/autotvm/task/nnvm_integration.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 128f985bd6d2..521b7f4b1da0 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -78,7 +78,7 @@ def schedule_log_softmax(_, outs, target): def compute_dense(attrs, inputs, _): """Compute definition of dense""" if attrs.get_bool("use_bias"): - return topi.nn.dense(inputs[0], inputs[1], bias=inputs[2]) + return topi.nn.dense(inputs[0], inputs[1], inputs[2]) return topi.nn.dense(inputs[0], inputs[1]) @reg.register_schedule("dense") diff --git 
a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index d945abb054e2..e785394a7da3 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -30,7 +30,7 @@ logger = logging.getLogger('autotvm') -def extract_from_graph(graph, shape, dtype, target, symbols, params, target_host=None): +def extract_from_graph(graph, shape, dtype, target, symbols, params=None, target_host=None): """ Extract tuning tasks from a nnvm graph. This function collects tuning tasks by building the graph diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index f41d7ee934c5..c816e67f6deb 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -209,7 +209,11 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) - data, weight, bias, _ = args + if len(args) > 2: + data, weight, bias = args[:2] + else: + data, weight = args + bias = None C = topi.nn.dense(*args, **kwargs) s = topi.generic.schedule_dense([C]) if bias is not None: From 32bb0d4e0684a325403dd09d1d7caea001862c11 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 15:48:06 -0700 Subject: [PATCH 092/126] bug fix --- python/tvm/autotvm/task/topi_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index c816e67f6deb..7ff8ec73e16e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -210,7 +210,7 @@ def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) if len(args) > 2: - data, weight, bias = args[:2] + data, weight, bias = args[:3] else: data, weight = args bias = None From a444f03b998340d91d490f10f71222589e25a51a Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:09:19 -0700 Subject: [PATCH 093/126] default case in operator override to prevent sphynx gallery issues --- vta/python/vta/top/op.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index da3d7eb900ef..063a8acdb303 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -80,9 +80,11 @@ def compute_conv2d(attrs, inputs, output_type, target): dilation, groups, out_dtype)] + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, output_type, target) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_conv2d(attrs, inputs, output_type, target) + return _nn.compute_conv2d(attrs, inputs, output_type, target) @reg.register_schedule("nn.conv2d", level=15) @@ -97,12 +99,14 @@ def schedule_conv2d(attrs, outs, target): if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) return topi.generic.schedule_group_conv2d_nchw(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) + # elif str(target).startswith("llvm"): + # return tvm.create_schedule([x.op for x in outs]) raise RuntimeError("Target %s is not supported" % target) + elif target.device_name == "vta": + with 
tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + return _nn.schedule_conv2d(attrs, outs, target) @reg.register_compute("nn.dense", level=15) @@ -112,10 +116,13 @@ def compute_dense(attrs, inputs, out_type, target): out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype if inputs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_dense(attrs, inputs, out_type, target) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.compute_dense(attrs, inputs, out_type, target) + return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) @@ -126,10 +133,11 @@ def schedule_dense(attrs, outs, target): target = tvm.target.create(target) if target.device_name == "vta": return topi.generic.schedule_dense(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("Target %s is not supported" % target) + # elif str(target).startswith("llvm"): + # return tvm.create_schedule([x.op for x in outs]) + raise RuntimeError("Target %s is not supported" % target) + elif target.device_name == "vta": + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) - with tvm.target.arm_cpu(tvm.target.current_target().model): - return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) + return _nn.schedule_dense(attrs, outs, target) From 31beec6239f42fdbfa8922af8195ac2451167885 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:17:45 -0700 Subject: [PATCH 094/126] deprecating nnvm for VTA --- vta/python/vta/top/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index ee2b5ec21ef8..3b5132ebf0ef 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -2,9 +2,11 @@ from . import bitpack from .graphpack import graph_pack -from . import nnvm_bitpack -from .nnvm_graphpack import nnvm_graph_pack -from . import nnvm_op from . import op from . import vta_conv2d from . import vta_dense + +# NNVM is deprecated for VTA +# from . import nnvm_bitpack +# from .nnvm_graphpack import nnvm_graph_pack +# from . 
import nnvm_op From fa73537352dd7fb5dc278532b0caef3da59896a1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 18:18:30 -0700 Subject: [PATCH 095/126] refactoring --- vta/python/vta/top/op.py | 87 ++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 063a8acdb303..96eaa8fb9905 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -57,33 +57,35 @@ def compute_conv2d(attrs, inputs, output_type, target): layout = attrs.data_layout out_dtype = attrs.out_dtype - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - if groups == 1: - assert groups == 1 - env = get_env() - assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" - inputs = list(inputs) - assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], - inputs[1], - strides, - padding, - dilation, - layout, - out_dtype)] - return [topi.nn.group_conv2d_nchw(inputs[0], - inputs[1], - strides, - padding, - dilation, - groups, - out_dtype)] - elif target.device_name == "vta": + if target.device_name == "vta": + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" + inputs = list(inputs) + assert inputs[1].dtype == "int8" + return [topi.nn.conv2d(inputs[0], + inputs[1], + strides, + padding, + dilation, + layout, + out_dtype)] + return [topi.nn.group_conv2d_nchw(inputs[0], + inputs[1], + strides, + padding, + dilation, + groups, + out_dtype)] + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_conv2d(attrs, inputs, output_type, target) + # If VTA is not the target, default to _nn def return _nn.compute_conv2d(attrs, inputs, output_type, target) @@ -93,19 +95,18 @@ def schedule_conv2d(attrs, outs, target): groups = attrs.groups layout = attrs.data_layout - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": + if target.device_name == "vta": + if is_packed_layout(layout): + target = tvm.target.create(target) + assert target.device_name == "vta" if groups == 1: return topi.generic.schedule_conv2d_nchw(outs) return topi.generic.schedule_group_conv2d_nchw(outs) - # elif str(target).startswith("llvm"): - # return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("Target %s is not supported" % target) - elif target.device_name == "vta": + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + # If VTA is not the target, default to _nn def return _nn.schedule_conv2d(attrs, outs, target) @@ -115,29 +116,29 @@ def compute_dense(attrs, inputs, out_type, target): out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - if inputs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - elif target.device_name == "vta": + if target.device_name == "vta": + if inputs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + return [topi.nn.dense(inputs[0], 
inputs[1], None, out_dtype)] + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.compute_dense(attrs, inputs, out_type, target) + # If VTA is not the target, default to _nn def return _nn.compute_dense(attrs, inputs, out_type, target) @reg.register_schedule("nn.dense", level=15) def schedule_dense(attrs, outs, target): """Schedule definition of dense""" - - if outs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - if target.device_name == "vta": + if target.device_name == "vta": + if outs[0].shape == 4: # this implies the layout is packed + target = tvm.target.create(target) + assert target.device_name == "vta" return topi.generic.schedule_dense(outs) - # elif str(target).startswith("llvm"): - # return tvm.create_schedule([x.op for x in outs]) - raise RuntimeError("Target %s is not supported" % target) - elif target.device_name == "vta": + # If it's not packed, run on ARM CPU with tvm.target.arm_cpu(tvm.target.current_target().model): return _nn.schedule_dense(attrs, outs, tvm.target.current_target()) + # If VTA is not the target, default to _nn def return _nn.schedule_dense(attrs, outs, target) From 401baa75db97b65743a0964a307b174a8cc6f60f Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:03:13 -0700 Subject: [PATCH 096/126] fix naming --- python/tvm/relay/quantize/_annotate.py | 12 ++++++------ src/relay/pass/quantize.cc | 14 +++++++------- src/relay/pass/quantize.h | 12 ++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index e98f45ef96b0..c625dc26bcb7 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -370,12 +370,12 @@ def concatenate_rewrite(ref_call, new_args, ctx): # register for vta stop fusion def register_vta_rewrite(op_name, frewrite=None, level=10): def _register(func): - return _op.op._Register(op_name, "FQVtaRewrite", func, level) + return _op.op._Register(op_name, "FQVTARewrite", func, level) return _register(frewrite) if frewrite is not None else _register @register_relay_node -class QVtaExpr(_expr.TempExpr): +class QVTAExpr(_expr.TempExpr): def __init__(self, expr): self.__init_handle_by_constructor__( _quantize.make_vta_expr, expr) @@ -385,7 +385,7 @@ def realize(self): def vta_expr_check(expr): - if isinstance(expr, QVtaExpr): + if isinstance(expr, QVTAExpr): return True, expr.expr return False, expr @@ -406,13 +406,13 @@ def conv2d_vta_rewrite(ref_call, new_args, ctx): if data_cond: data = new_args[0].realize() ret = _forward_op(ref_call, [data, kernel]) - return QVtaExpr(ret) + return QVTAExpr(ret) def identity_vta_rewrite(ref_call, new_args, ctx): cond, expr = vta_expr_check(new_args[0]) if cond: - return QVtaExpr(_forward_op(ref_call, [expr])) + return QVTAExpr(_forward_op(ref_call, [expr])) return None register_vta_rewrite("nn.relu", identity_vta_rewrite) @@ -429,5 +429,5 @@ def add_vta_rewrite(ref_call, new_args, ctx): rhs = new_args[1].realize() return _forward_op(ref_call, [lhs, rhs]) elif lhs_cond and not rhs_cond: - return QVtaExpr(_forward_op(ref_call, [lhs, rhs])) + return QVTAExpr(_forward_op(ref_call, [lhs, rhs])) return None diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 3d5802307af3..ebc127639287 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -704,7 +704,7 @@ Pass QuantizeRewriteForVTAPass() { runtime::TypedPackedFunc pass_func 
= [=](Function f, Module m, PassContext pc) { return Downcast( - ForwardRewrite(f, "FQVtaRewrite", nullptr, nullptr)); + ForwardRewrite(f, "FQVTARewrite", nullptr, nullptr)); }; return CreateFunctionPass(pass_func, 1, "QuantizeRewriteForVTA", {}); } @@ -716,20 +716,20 @@ TVM_REGISTER_API("relay._quantize.QuantizeRewriteForVTA") // Insert stop_fusion for vta. -Expr QVtaExprNode::Realize() const { +Expr QVTAExprNode::Realize() const { Expr ret = ForceCast(this->expr); return StopFusion(ret); } -QVtaExpr QVtaExprNode::make(Expr expr) { - auto rnode = make_node(); +QVTAExpr QVTAExprNode::make(Expr expr) { + auto rnode = make_node(); rnode->expr = expr; - return QVtaExpr(rnode); + return QVTAExpr(rnode); } TVM_REGISTER_API("relay._quantize.make_vta_expr") .set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = QVtaExprNode::make(args[0]); + *ret = QVTAExprNode::make(args[0]); }); TVM_REGISTER_API("relay._quantize.make_stop_fusion") @@ -739,7 +739,7 @@ TVM_REGISTER_API("relay._quantize.make_stop_fusion") TVM_REGISTER_API("relay._quantize.temp_expr_realize") .set_body_typed([] (const Expr& expr) { - const QVtaExprNode* n = expr.as(); + const QVTAExprNode* n = expr.as(); CHECK(n); return n->Realize(); }); diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index 318ebe57e2af..2699ccd09e57 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,11 +72,11 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); -class QVtaExpr; +class QVTAExpr; /*! * \brief TempExprNode used during annotate forward rewrite. */ -class QVtaExprNode : public TempExprNode { +class QVTAExprNode : public TempExprNode { public: /*! \brief The original expression */ Expr expr; @@ -85,15 +85,15 @@ class QVtaExprNode : public TempExprNode { v->Visit("expr", &expr); } - TVM_DLL static QVtaExpr make(Expr expr); + TVM_DLL static QVTAExpr make(Expr expr); Expr Realize() const final; - static constexpr const char* _type_key = "relay.QVtaExpr"; - TVM_DECLARE_NODE_TYPE_INFO(QVtaExprNode, TempExprNode); + static constexpr const char* _type_key = "relay.QVTAExpr"; + TVM_DECLARE_NODE_TYPE_INFO(QVTAExprNode, TempExprNode); }; -RELAY_DEFINE_NODE_REF(QVtaExpr, QVtaExprNode, TempExpr); +RELAY_DEFINE_NODE_REF(QVTAExpr, QVTAExprNode, TempExpr); /*! \brief TempExpr used during realize forward rewrite. 
*/ From f1b810eaf54a54a419adf50095342b60f82aba30 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:19:03 -0700 Subject: [PATCH 097/126] annotation ops --- python/tvm/relay/quantize/_annotate.py | 4 ++-- src/relay/op/annotation/annotation.cc | 14 +++++++------- src/relay/pass/fuse_ops.cc | 2 +- src/relay/pass/quantize.cc | 4 ++-- vta/python/vta/top/graphpack.py | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index c625dc26bcb7..0a549f568b85 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -309,7 +309,7 @@ def identity_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) -register_annotate_function("stop_fusion", identity_rewrite) +register_annotate_function("annotation.stop_fusion", identity_rewrite) def pool2d_rewrite(ref_call, new_args, ctx): @@ -330,7 +330,7 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) -@register_annotate_function("force_cast") +@register_annotate_function("annotation.force_cast") def force_cast_rewrite(ref_call, new_args, ctx): """Rewrite function to force cast""" if _conv_counter() <= current_qconfig().skip_k_conv: diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index e6d41073e473..4e37aab11357 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -58,16 +58,16 @@ RELAY_REGISTER_OP("on_device") ElemwiseArbitraryLayout); Expr StopFusion(Expr data) { - static const Op& op = Op::Get("stop_fusion"); + static const Op& op = Op::Get("annotation.stop_fusion"); return CallNode::make(op, {data}, Attrs{}, {}); } -TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") +TVM_REGISTER_API("relay.op.annotation._make.annotation.") .set_body_typed([](Expr data) { return StopFusion(data); }); -RELAY_REGISTER_OP("stop_fusion") +RELAY_REGISTER_OP("annotation.stop_fusion") .describe(R"code(Annotate an expression to prevent it being fused with previous expressions.)code" TVM_ADD_FILELINE) .set_num_inputs(1) @@ -84,11 +84,11 @@ TVM_ADD_FILELINE) }); Expr ForceCast(Expr data) { - static const Op& op = Op::Get("force_cast"); + static const Op& op = Op::Get("annotation.force_cast"); return CallNode::make(op, {data}, Attrs{}, {}); } -RELAY_REGISTER_OP("force_cast") +RELAY_REGISTER_OP("annotation.force_cast") .describe(R"code(Annotate an expression to force a cast.)code" TVM_ADD_FILELINE) .set_num_inputs(1) @@ -105,7 +105,7 @@ TVM_ADD_FILELINE) }); -RELAY_REGISTER_OP("bitpack_start") +RELAY_REGISTER_OP("annotation.bitpack_start") .describe(R"code( Mark the start of bitpacking. )code" TVM_ADD_FILELINE) @@ -122,7 +122,7 @@ Mark the start of bitpacking. return {topi::identity(inputs[0])}; }); -RELAY_REGISTER_OP("bitpack_end") +RELAY_REGISTER_OP("annotation.bitpack_end") .describe(R"code( Mark the end of bitpacking. )code" TVM_ADD_FILELINE) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 9cd73171bfea..9f940e54953b 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -821,7 +821,7 @@ class FuseMutator : private ExprMutator { // Transform calls. 
Expr VisitExpr_(const CallNode* call) { - static const Op& stop_fusion = Op::Get("stop_fusion"); + static const Op& stop_fusion = Op::Get("annotation.stop_fusion"); if (call->op.as()) { // If it is a primitive op call // then we must have a group assignment for it already. diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index ebc127639287..2f23c7659b02 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -530,7 +530,7 @@ RELAY_REGISTER_OP("nn.relu") RELAY_REGISTER_OP("strided_slice") .set_attr("FQRealizeRewrite", IdentityRealize); -RELAY_REGISTER_OP("stop_fusion") +RELAY_REGISTER_OP("annotation.stop_fusion") .set_attr("FQRealizeRewrite", IdentityRealize); /* \brief for unary operators which requantize its input to dtype_nbit */ @@ -585,7 +585,7 @@ Expr ForceCastRealize(const Call& ref_call, return Expr(nullptr); } -RELAY_REGISTER_OP("force_cast") +RELAY_REGISTER_OP("annotation.force_cast") .set_attr("FQRealizeRewrite", ForceCastRealize); TVM_REGISTER_API("relay._quantize.realize") diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index c6cc49748bac..6f901833ea15 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -111,8 +111,8 @@ def __init__(self, bfactor, cfactor, weight_bits): self.weight_bits = weight_bits self.start_pack = False # Cache Operator the algorithm matches against. - self.bitpack_start = op.op.get('bitpack_start') - self.bitpack_end = op.op.get('bitpack_end') + self.bitpack_start = op.op.get('annotation.bitpack_start') + self.bitpack_end = op.op.get('annotation.bitpack_end') self.conv2d = op.op.get("nn.conv2d") self.conv2d_transpose = op.op.get("nn.conv2d_transpose") self.add = op.op.get("add") @@ -229,8 +229,8 @@ def get_subgraph(expr, start_name, stop_name): This constraint will be lifted in the future. bitpack_start and bitpack_end are both inclusive. """ - bitpack_start = op.op.get('bitpack_start') - bitpack_end = op.op.get('bitpack_end') + bitpack_start = op.op.get('annotation.bitpack_start') + bitpack_end = op.op.get('annotation.bitpack_end') anf = relay.ir_pass.to_a_normal_form(expr) def _recursion(anf, start_found, stop_found): """ Helper to obtain the subgraph. 
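Since this patch moves the VTA-related annotation operators under the "annotation." namespace, lookups elsewhere (as in the graphpack.py hunk above) change spelling accordingly; a minimal sketch of the new names, with the old ones in comments:

from tvm.relay import op

bitpack_start = op.op.get('annotation.bitpack_start')  # was 'bitpack_start'
bitpack_end = op.op.get('annotation.bitpack_end')      # was 'bitpack_end'
stop_fusion = op.op.get('annotation.stop_fusion')      # was 'stop_fusion'
force_cast = op.op.get('annotation.force_cast')        # was 'force_cast'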
From 51acba8d608ab84d282adacdf52a461d737e9245 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 19 Jun 2019 20:20:48 -0700 Subject: [PATCH 098/126] typo fix --- src/relay/op/annotation/annotation.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 4e37aab11357..a5ade5bde304 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -62,7 +62,7 @@ Expr StopFusion(Expr data) { return CallNode::make(op, {data}, Attrs{}, {}); } -TVM_REGISTER_API("relay.op.annotation._make.annotation.") +TVM_REGISTER_API("relay.op.annotation._make.stop_fusion") .set_body_typed([](Expr data) { return StopFusion(data); }); From 819e2d9b9599081cda5d0f1dc1a12c37ae7de551 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 20 Jun 2019 00:02:19 -0700 Subject: [PATCH 099/126] autoTVM tutorial for VTA --- docs/conf.py | 3 +- vta/scripts/tune_resnet.py | 1 - vta/tutorials/autotvm/README.txt | 3 + vta/tutorials/autotvm/tune_relay_vta.py | 458 ++++++++++++++++++++++++ 4 files changed, 463 insertions(+), 2 deletions(-) create mode 100644 vta/tutorials/autotvm/README.txt create mode 100644 vta/tutorials/autotvm/tune_relay_vta.py diff --git a/docs/conf.py b/docs/conf.py index d9eea045a97b..c4410e5864f9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -217,7 +217,8 @@ def run_doxygen(folder): '../tutorials/topi', '../tutorials/deployment', '../vta/tutorials/frontend', - '../vta/tutorials/optimize']) + '../vta/tutorials/optimize', + '../vta/tutorials/autotvm']) def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 5469b6b40b6e..1a7c74bee3f7 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -271,7 +271,6 @@ def tune_tasks(tasks, # Compile network print("Compiling network with best tuning parameters...") - # relay_prog, params = compile_network(opt, env, target) with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( diff --git a/vta/tutorials/autotvm/README.txt b/vta/tutorials/autotvm/README.txt new file mode 100644 index 000000000000..c511381dd57d --- /dev/null +++ b/vta/tutorials/autotvm/README.txt @@ -0,0 +1,3 @@ +Auto tuning +------------- + diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py new file mode 100644 index 000000000000..891c23d6d105 --- /dev/null +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -0,0 +1,458 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-tuning a convolutional network on VTA +========================================== +**Author**: `Lianmin Zheng `_, `Thierry Moreau `_ + +Auto-tuning for a specific accelerator design is critical for getting the best +performance for any given operator. This is a tutorial showcases how to tune a +whole convolutional network on VTA. + +The operator implementation for VTA in TVM is written in template form. +The template has many tunable knobs (tile factor, virtual threads, etc). +We will tune all convolution operators in the neural network. After tuning, +we produce a log file which stores the best schedule parameters for all tuned +operators. When the TVM compiler compiles these operators, it will query this +log file to get the best knob parameters. + +""" + +###################################################################### +# Install dependencies +# -------------------- +# To use the autotvm package in tvm, we need to install some extra dependencies. +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost tornado mxnet requests pillow +# +# To make TVM run faster during tuning, it is recommended to use cython +# as FFI of TVM. In the root directory of TVM, execute +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Import packages. + +import os +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner + +import vta +from vta.testing import simulator +from vta.top import graph_pack + +################################################################# +# Compile network +# --------------- +# Perform vta-specific compilation with Relay from a Gluon model + +def compile_network(env, target, model, start_pack, stop_pack): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + with relay.quantize.qconfig(global_scale=8.0, + skip_k_conv=1, + skip_k_dense=1, + target_vta=True): + relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + relay_prog, + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack) + relay_prog = relay.ir_pass.fold_constant(relay_prog) + + return relay_prog, params + + +################################################################# +# Start RPC Tracker +# ----------------- +# TVM uses an RPC session to communicate with Pynq boards. +# During tuning, the tuner will send the generated code to the board and +# measure the speed of code on the board. +# +# To scale up tuning, TVM uses an RPC Tracker to manage multiple devices. 
+# The RPC Tracker is a centralized master node. We can register all devices to +# the tracker. For example, if we have 10 Pynq boards, we can register all of them +# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. +# +# To start an RPC tracker, run this command on the host machine. The tracker is +# required during the whole tuning process, so we need to open a new terminal for +# this command: +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 +# +# The expected output is: +# +# .. code-block:: bash +# +# INFO:RPCTracker:bind to 0.0.0.0:9190 + +################################################################# +# Register devices to RPC Tracker +# ----------------------------------- +# Now we can register our devices to the tracker. The first step is to +# build the TVM runtime for the Pynq devices. +# +# Follow `this section `_ +# to build the TVM runtime on the device. Then register the device to the tracker with: +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=pynq +# +# (replace :code:`[HOST_IP]` with the IP address of your host machine) +# +# After registering devices, we can confirm it by querying the rpc_tracker: +# +# .. code-block:: bash +# +# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 +# +# For example, if we have 6 Pynq boards and 11 Raspberry Pi 3B, +# the output can be +# +# .. code-block:: bash +# +# Queue Status +# ---------------------------------- +# key total free pending +# ---------------------------------- +# pynq 6 6 0 +# rpi3b 11 11 0 +# ---------------------------------- +# +# You can register multiple devices to the tracker to accelerate tuning. + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. +# Here we use an Pynq-Z1 board as an example. + +# Tracker host and port can be set by your environment +tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0') +tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +# Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +network = "resnet18_v1" +start_pack="nn.max_pool2d" +stop_pack="nn.global_avg_pool2d" + +# Tuning option +log_file = "%s.%s.log" % (device, network) +tuning_option = { + 'log_filename': log_file, + + 'tuner': 'random', + 'n_trial': 1000, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner( + env.TARGET, host=tracker_host, port=tracker_port, + number=5, + timeout=60, + check_correctness=True + ), + ), +} + +#################################################################### +# +# .. note:: How to set tuning options +# +# In general, the default values provided here work well. +# If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` +# to larger values, makes the tuning run for longer. 
+# If your device is under-powered or your conv2d operators are large, consider +# setting a longer timeout. +# + +################################################################### +# Begin Tuning +# ------------ +# Now we can extract tuning tasks from the network and begin tuning. +# Here, we provide a simple utility function to tune a list of tasks. +# This function is just an initial implementation which tunes them in sequential order. +# We will introduce a more sophisticated tuning scheduler in the future. +# +# Given that the tuning will be done on Pynq FPGA boards, make sure that +# the ```TARGET`` entry in the ``vta_config.json`` file is set to ``pynq``. + +# You can skip the implementation of this function for this tutorial. +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True): + + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'xgb_knob': + tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)), + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + + + +######################################################################## +# Register VTA-specific tuning tasks + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
+ +def tune_and_evaluate(tuning_opt): + + if env.TARGET != "sim": + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) + # Reconfigure the JIT runtime and FPGA. + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + else: + # In simulation mode, host the RPC server locally. + remote = rpc.LocalSession() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Perform task extraction on Relay program + print("Extract tasks...") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) + tasks = autotvm.task.extract_from_program(func=relay_prog, + params=params, + ops=(tvm.relay.op.nn.conv2d,), + target=target, + target_host=env.target_host) + + # We should have extracted 10 convolution tasks + assert len(tasks) == 10 + print("Extracted {} conv2d tasks:".format(len(tasks))) + for tsk in tasks: + print("\t{}".format(tsk)) + + # We do not run the tuning in our webpage server since it takes too long. + # Comment the following line to run it by yourself. + return + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[log_file]): + # Compile network + print("Compile...") + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(): + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + + # Export library + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Generate the graph runtime + m = graph_runtime.create(graph, lib, ctx) + + # upload parameters to device + ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + image = tvm.nd.array( + (np.random.uniform(size=(1, 3, 224, 224))).astype('float32')) + m.set_input(**params) + m.set_input('data', image) + + # evaluate + timer = m.module.time_evaluator("run", ctx, number=1, repeat=10) + tcost = timer() + prof_res = np.array(tcost.results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + +# Run the tuning and evaluate the results +tune_and_evaluate(tuning_option) + +###################################################################### +# Sample Output +# ------------- +# The tuning needs to compile many programs and extract feature from them. +# So a high performance CPU is recommended. +# One sample output is listed below. +# It takes about 2 hours on a 16T CPU, and 6 Pynq boards. +# +# .. code-block:: bash +# +# Extract tasks... +# Tuning... +# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done. +# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done. +# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done. +# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done. +# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done. +# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done. +# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done. 
+#   [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#   [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#   [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#   [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#   [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
+#   Compile...
+#   Upload...
+#   Evaluate inference time cost...
+#   Mean inference time (std dev): 162.59 ms (0.06 ms)
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto-tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then something has gone wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines at the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai

From ff20dc5e6de772fcccf2d0641f7b1fa3c6156d53 Mon Sep 17 00:00:00 2001
From: Thierry Moreau
Date: Thu, 20 Jun 2019 09:54:04 -0700
Subject: [PATCH 100/126] bug fix and tweaking output

---
 vta/tutorials/autotvm/tune_relay_vta.py | 40 ++++++++++++++++---------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index 891c23d6d105..1f52bb4f62c1 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -384,22 +384,24 @@ def tune_and_evaluate(tuning_opt):
                         params=params, target_host=env.target_host)

         # Export library
+        print("Upload...")
         temp = util.tempdir()
         lib.save(temp.relpath("graphlib.o"))
         remote.upload(temp.relpath("graphlib.o"))
         lib = remote.load_module("graphlib.o")

         # Generate the graph runtime
+        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
         m = graph_runtime.create(graph, lib, ctx)

         # upload parameters to device
-        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
         image = tvm.nd.array(
             (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
         m.set_input(**params)
         m.set_input('data', image)

         # evaluate
+        print("Evaluate inference time cost...")
         timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
         tcost = timer()
         prof_res = np.array(tcost.results) * 1000  # convert to millisecond
@@ -420,23 +422,33 @@ def tune_and_evaluate(tuning_opt):
 # .. code-block:: bash
 #
 #   Extract tasks...
+# [Warning] Invalid shape during AutoTVM task creation +# Extracted 10 conv2d tasks: +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (4, 4, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (4, 4, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (8, 8, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (8, 8, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (16, 16, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (16, 16, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 7, 7, 1, 16), 'int8'), ('TENSOR', (32, 32, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 32, 7, 7, 1, 16, 'int8'), (32, 32, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) +# Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32')) # Tuning... -# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done. -# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done. -# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done. 
-# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done. -# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done. -# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done. -# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done. -# [Task 8/12] Current/Best: 1.13/ 17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done. -# [Task 9/12] Current/Best: 14.45/ 22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done. -# [Task 10/12] Current/Best: 3.22/ 15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done. -# [Task 11/12] Current/Best: 11.03/ 32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done. -# [Task 12/12] Current/Best: 8.00/ 21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done. +# [Task 1/10] Current/Best: 0.72/ 23.24 GFLOPS | Progress: (480/1000) | 640.31 s Done. +# [Task 2/10] Current/Best: 0.00/ 27.69 GFLOPS | Progress: (576/1000) | 810.09 s Done. +# [Task 3/10] Current/Best: 0.00/ 22.97 GFLOPS | Progress: (1000/1000) | 1125.37 s Done. +# [Task 4/10] Current/Best: 0.00/ 31.26 GFLOPS | Progress: (1000/1000) | 1025.52 s Done. +# [Task 5/10] Current/Best: 0.00/ 15.15 GFLOPS | Progress: (1000/1000) | 1236.58 s Done. +# [Task 6/10] Current/Best: 0.00/ 22.74 GFLOPS | Progress: (1000/1000) | 906.60 s Done. +# [Task 7/10] Current/Best: 0.00/ 15.27 GFLOPS | Progress: (1000/1000) | 1056.25 s Done. +# [Task 8/10] Current/Best: 0.00/ 2.18 GFLOPS | Progress: (1000/1000) | 2275.29 s Done. +# [Task 9/10] Current/Best: 2.23/ 3.99 GFLOPS | Progress: (1000/1000) | 2527.25 s Done. +# [Task 10/10] Current/Best: 1.56/ 6.32 GFLOPS | Progress: (480/1000) | 1304.84 s Done. # Compile... # Upload... # Evaluate inference time cost... -# Mean inference time (std dev): 162.59 ms (0.06 ms) +# Mean inference time (std dev): 621.79 ms (0.14 ms) ###################################################################### # From 772a83745d81ff7693db25746e71ef6887efe6db Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 21 Jun 2019 02:12:20 -0700 Subject: [PATCH 101/126] addressing reviews --- python/tvm/autotvm/task/relay_integration.py | 1 + python/tvm/autotvm/task/topi_integration.py | 2 +- python/tvm/relay/quantize/quantize.py | 14 ++++++++------ vta/python/vta/build_module.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index e71c076e26d0..e5359c2f5d75 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -31,6 +31,7 @@ logger = logging.getLogger('autotvm') +# TODO(moreau89) find a more elegant way to build for VTAs def _build(func, target, target_host, diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 7ff8ec73e16e..bc434719c36a 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -107,7 +107,7 @@ def __init__(self, allow_duplicate=False): topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } - # support reflection for tracing + # function reflection for tracing self.func_to_reflection = { topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 4f3ff60a8c06..b08c8650f917 100644 --- a/python/tvm/relay/quantize/quantize.py +++ 
b/python/tvm/relay/quantize/quantize.py @@ -124,7 +124,9 @@ def current_qconfig(): """Get the current quantization configuration.""" return _quantize._GetCurrentQConfig() - +# TODO(tmoreau89, ZihengJiang) the skip parameters are +# hacky - we should explore a more future-proof way to +# skip operators based on pattern matching def qconfig(**kwargs): """Configure the quantization behavior by setting config variables. @@ -200,18 +202,18 @@ def annotate_context(): return AnnotateContext.Current -DENSE_COUNTER = 0 +_DENSE_COUNTER = 0 -def _dense_counter(): +def _dense_counter_(): """Get the global counter for dense.""" - return DENSE_COUNTER + return _DENSE_COUNTER def _set_dense_counter(n): """Set the value of the global dense counter.""" - global DENSE_COUNTER - DENSE_COUNTER = n + global _DENSE_COUNTER + _DENSE_COUNTER = n def calibrate(graph, mod=None, ctx=None): diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 854dd4daf14a..183a2f4a500d 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -119,7 +119,7 @@ def build(*args, **kwargs): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs) - +# TODO(tmoreau89) unify the build with the rest of the build modules def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): """Custom build func for VTA. Used for autotvm""" From a692d2f73508fc42892ec64050d23d7fe669b114 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 21 Jun 2019 02:16:57 -0700 Subject: [PATCH 102/126] fix --- python/tvm/relay/quantize/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index b08c8650f917..12b1a1be728b 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -205,7 +205,7 @@ def annotate_context(): _DENSE_COUNTER = 0 -def _dense_counter_(): +def _dense_counter(): """Get the global counter for dense.""" return _DENSE_COUNTER From b2d060a22e6857a64e4f53a5f49c7604d0da603d Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Sat, 22 Jun 2019 20:29:21 -0700 Subject: [PATCH 103/126] Update. --- python/tvm/relay/quantize/quantize.py | 5 +---- src/relay/pass/quantize.h | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 12b1a1be728b..3c2a8a10026f 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -360,8 +360,6 @@ def quantize(graph, params=None, dataset=None): if params: graph = _bind_params(graph, params) - cfg = current_qconfig() - mod = _module.Module.from_expr(graph) # Perform "SimplifyInference", "FoldScaleAxis", "FoldConstant", and # "CanonicalizeOps" optimization before quantization. @@ -378,8 +376,7 @@ def quantize(graph, params=None, dataset=None): calibrate_pass, realize(), _transform.FoldConstant()] - # Add rewrite_for_vta() pass if target is VTA - if cfg.target_vta: + if current_qconfig().store_lowbit_output: quant_passes = [rewrite_for_vta()] + quant_passes quantize_seq = _transform.Sequential(quant_passes) with _transform.PassContext(opt_level=3, diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index 2699ccd09e57..c20f0b606dea 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -72,9 +72,12 @@ class QAnnotateExprNode : public TempExprNode { RELAY_DEFINE_NODE_REF(QAnnotateExpr, QAnnotateExprNode, TempExpr); +/*! 
+ * \brief TempExpr used to insert `force_cast` for VTA. + */ class QVTAExpr; /*! - * \brief TempExprNode used during annotate forward rewrite. + * \brief TempExprNode used to insert `force_cast` for VTA. */ class QVTAExprNode : public TempExprNode { public: From 5215d62db1a3f4691f9e81edb9a622f1ae7af6b3 Mon Sep 17 00:00:00 2001 From: ZihengJiang Date: Sat, 22 Jun 2019 20:36:23 -0700 Subject: [PATCH 104/126] Update. --- vta/tutorials/frontend/deploy_resnet_on_vta.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vta/tutorials/frontend/deploy_resnet_on_vta.py b/vta/tutorials/frontend/deploy_resnet_on_vta.py index 7b5e6b2e730e..271630e69558 100644 --- a/vta/tutorials/frontend/deploy_resnet_on_vta.py +++ b/vta/tutorials/frontend/deploy_resnet_on_vta.py @@ -159,9 +159,7 @@ # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target @@ -261,4 +259,4 @@ for k in top_categories[-5:]: if "cat" in synset[k]: cat_detected = True -assert(cat_detected) \ No newline at end of file +assert(cat_detected) From 3cb83a615c599c013c82560fcbcb0bda080edb10 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 00:37:37 -0700 Subject: [PATCH 105/126] addressing comments --- python/tvm/relay/quantize/_annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 0a549f568b85..657fce54b42d 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -367,7 +367,7 @@ def concatenate_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) -# register for vta stop fusion +# Graph rewrite function registration for VTA target def register_vta_rewrite(op_name, frewrite=None, level=10): def _register(func): return _op.op._Register(op_name, "FQVTARewrite", func, level) From 42a447cb2fee260afc8cea7daa9e164ef664cb36 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 00:40:54 -0700 Subject: [PATCH 106/126] addressing more comments --- python/tvm/autotvm/task/relay_integration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index e5359c2f5d75..29190fa43324 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,12 +41,11 @@ def _build(func, from tvm import relay - if "vta" in str(target): + if target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name == "vta": - import vta - with vta.build_config(): - return relay.build(func, target, target_host, params) + import vta + with vta.build_config(): + return relay.build(func, target, target_host, params) # default case return relay.build(func, target, target_host, params) @@ -125,7 +124,7 @@ def extract_from_program(func, params, ops, target, target_host=None): template_key='direct') tasks.append(tsk) except topi.InvalidShapeError: - print("[Warning] Invalid shape during AutoTVM task creation") + warnings.warn("Invalid shape during AutoTVM task creation") return tasks From 1c52ed18eb7469dc48cfd7fe162ef938ac227b87 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 
00:45:32 -0700 Subject: [PATCH 107/126] clean up --- python/tvm/autotvm/task/nnvm_integration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index e785394a7da3..9161822d173c 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -105,8 +105,6 @@ def extract_from_graph(graph, shape, dtype, target, symbols, params=None, target tasks = [] for task_name, args in env.get_tasks(): try: - print(task_name) - print(args) tsk = create(task_name, args, target=target, target_host=target_host, template_key='direct') From 3e7aed38f0ed000b5ce6d4abc344d8e0207106cc Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 01:15:14 -0700 Subject: [PATCH 108/126] comment --- python/tvm/autotvm/measure/measure_methods.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7ddc6cd9ea5f..fd8cbe0e2a0d 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,9 +86,7 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) - else: - self.build_func = build_func + self.build_func = _wrap_build_func(build_func) self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From 6f9037f1853451d99a8c43bc9528391b07ceb237 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 01:39:50 -0700 Subject: [PATCH 109/126] adding comment --- python/tvm/autotvm/measure/measure_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index fd8cbe0e2a0d..65f0c515bff2 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,7 +86,10 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) + self.build_func = _wrap_build_func(build_func) + else: + # If build_func is callable, bypass wrapper + self.build_func = build_func self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() From b0d09c1c0f78cac588a544c3972e927e9d97d32b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 02:25:51 -0700 Subject: [PATCH 110/126] unify the AutoTVM builder --- python/tvm/autotvm/measure/measure_methods.py | 14 +++-- vta/python/vta/__init__.py | 2 +- vta/python/vta/build_module.py | 57 ------------------- vta/tutorials/autotvm/tune_relay_vta.py | 2 +- 4 files changed, 10 insertions(+), 65 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 65f0c515bff2..b2cf73f7dee1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -86,10 +86,7 @@ def __init__(self, timeout=10, n_parallel=None, build_func='default'): build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = _wrap_build_func(build_func) - else: - # If build_func is callable, bypass wrapper - self.build_func = build_func + 
self.build_func = _wrap_build_func(build_func) self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() @@ -362,8 +359,13 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti if cuda_arch: set_cuda_target_arch(cuda_arch) - with build_config(**opts): - func = build(s, args, target_host=task.target_host) + if measure_input.target.device_name == 'vta': + # if target is vta, we need to use vta build + import vta + func = vta.build(s, args, target_host=task.target_host) + else: + with build_config(**opts): + func = build(s, args, target_host=task.target_host) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py index 75ecdbad4bc7..926d73649b31 100644 --- a/vta/python/vta/__init__.py +++ b/vta/python/vta/__init__.py @@ -18,5 +18,5 @@ # to maintain minimum dependency on the board if sys.argv[0] not in ("-c", "-m"): from . import top - from .build_module import build_config, lower, build, vta_autotvm_build_func + from .build_module import build_config, lower, build from . import graph diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 183a2f4a500d..a291c42e592b 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -118,60 +118,3 @@ def build(*args, **kwargs): with build_config(): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs) - -# TODO(tmoreau89) unify the build with the rest of the build modules -def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): - """Custom build func for VTA. Used for autotvm""" - - import time - import os - from random import getrandbits - from tvm.autotvm.util import get_const_tuple - from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError - - tic = time.time() - try: - filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - target, task, config = measure_input - - with target: - s, args = task.instantiate(config) - if not config.valid(): - raise InstantiationError(config.errors) - - func = build(s, args, target_host=task.target_host) - sim = build(s, args) - - arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) - func.export_library(filename) - - # When targeting VTA test the schedule on simulator first - # in order to catch runtime errors - if measure_input.target.device_name == 'vta': - from vta import reconfig_runtime - # Note: if you're not running the RPC locally, you cannot benefit - # from rumtime recompilation... 
- local_rpc_port = int(os.environ.get("VTA_LOCAL_SIM_RPC_PORT", "0")) - if local_rpc_port: - remote = rpc.connect("localhost", local_rpc_port) - reconfig_runtime(remote) - else: - remote = rpc.LocalSession() - sim_path = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - sim.export_library(sim_path) - remote.upload(sim_path) - f = remote.load_module(os.path.split(sim_path)[1]) - ctx = remote.context(str(measure_input.target), 0) - args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - # Skip execution just to verify correctness - simulator.debug_mode(simulator.DEBUG_SKIP_EXEC) - f(*args) - - # check by local simulator - ctx = tvm.context(str(target)) - args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] - sim(*args) - - except Exception as exc: # pylint: disable=broad-except - return BuildResult(None, None, exc, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 1f52bb4f62c1..9b91bdb13ba2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -209,7 +209,7 @@ def compile_network(env, target, model, start_pack, stop_pack): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + builder=autotvm.LocalBuilder(), runner=autotvm.RPCRunner( env.TARGET, host=tracker_host, port=tracker_port, number=5, From aa028590db3b3bfdf050ba7910eb413112bba958 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 02:29:06 -0700 Subject: [PATCH 111/126] lint fix --- vta/python/vta/build_module.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index a291c42e592b..dbd2e4b45fd6 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs import tvm -from tvm import rpc from . 
import ir_pass from .environment import get_env -from .testing import simulator def lift_coproc_scope(x): From 7afb87e8da4dade8503f8e4ee99c6beab3df6b62 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 12:23:04 -0700 Subject: [PATCH 112/126] bug fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 29190fa43324..79e521a3a9e5 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name == "vta": + if "vta" in str(target): with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From aee8f05bb35fb3f429958e5d25b5eaa593df7177 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 21:54:24 -0700 Subject: [PATCH 113/126] reflecting update on qconfig --- vta/scripts/tune_resnet.py | 4 +--- vta/tutorials/autotvm/tune_relay_vta.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 1a7c74bee3f7..21aa96cd350f 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -126,9 +126,7 @@ def compile_network(opt, env, target): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 9b91bdb13ba2..bdeb6c5d03e2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -90,9 +90,7 @@ def compile_network(env, target, model, start_pack, stop_pack): # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, - skip_k_conv=1, - skip_k_dense=1, - target_vta=True): + skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params) # Perform graph packing and constant folding for VTA target From a25bcbf66a797215ba9767cb1b372b3ff18d47c1 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:07:20 -0700 Subject: [PATCH 114/126] fixing incorrect target initialization --- tests/python/unittest/test_graph_tuner_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 240da7f88628..6e06b44b3ea9 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -117,7 +117,7 @@ def _create_data(target, dshape, dtype, layout): def test_graph_tuner_layout_transform(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = "llvm" + target = tvm.target.arm_cpu() dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" @@ -152,7 +152,7 @@ def test_graph_tuner_layout_transform(): def test_DPTuner_run(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = "llvm" + target = tvm.target.arm_cpu() dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) @@ -201,7 +201,7 @@ def test_DPTuner_run(): def test_PBQPTuner_run(): - target = "llvm" + target = tvm.target.arm_cpu() dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) From 
a69250af6b4b2021aff078188118ef2275c88d0e Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:07:39 -0700 Subject: [PATCH 115/126] proper checking --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 79e521a3a9e5..29190fa43324 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if "vta" in str(target): + if target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From d5ba66ef2de3755c46f4ed97e4fc4be9cc33fe51 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:33:21 -0700 Subject: [PATCH 116/126] unused arg --- python/tvm/relay/quantize/quantize.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 3c2a8a10026f..f209590f7ef7 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -158,9 +158,6 @@ def qconfig(**kwargs): is None, which means will try to call all operartors' annotate rewrite function. - target_vta: boolean - Whether we are performing quantization for VTA. - Returns ------- config: QConfig From 39a9d6212648c0e83b8209e30c42fab8c78e3110 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Mon, 24 Jun 2019 22:39:10 -0700 Subject: [PATCH 117/126] adding a TODO to address later, bug fix --- python/tvm/relay/quantize/_annotate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 657fce54b42d..4bb345ee47dd 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -191,7 +191,8 @@ def check_to_skip(): return False -@register_annotate_function("nn.dense") +# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant +# @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of dense will be quantized to weight field. 
Output would be in activation field.""" @@ -203,13 +204,14 @@ def dense_rewrite(ref_call, new_args, ctx): lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) - if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT: + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) assert rhs_kind is None rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) From 3f60022a9086c4644a3b225c833b8b1a4a5803c4 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:06:33 -0700 Subject: [PATCH 118/126] merge fix --- src/relay/pass/quantize.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 2f23c7659b02..1d08e7b7915c 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,6 +388,7 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { + static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; From 8df123a6af8d4c07e9716abe55b1eeb113db1697 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:10:53 -0700 Subject: [PATCH 119/126] merge fix --- src/relay/pass/quantize.cc | 2 -- src/relay/pass/quantize.h | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 1d08e7b7915c..75897b75831d 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -648,8 +648,6 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "nbit_activation=" << op->nbit_activation << ", "; p->stream << "global_scale=" << op->global_scale << ", "; p->stream << "skip_conv_layers==" << op->skip_conv_layers << ", "; - p->stream << "skip_k_dense==" << op->skip_k_dense << ", "; - p->stream << "skip_dense_layers==" << op->skip_dense_layers << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index c20f0b606dea..262d420acf97 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -153,8 +153,6 @@ class QConfigNode : public Node { DataType dtype_activation = Int(32); double global_scale = 8.0; Array skip_conv_layers = Array(NodePtr(nullptr)); - int skip_k_dense = 0; - Array skip_dense_layers = Array(NodePtr(nullptr)); bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); @@ -168,8 +166,6 @@ class QConfigNode : public Node { v->Visit("dtype_activation", &dtype_activation); v->Visit("global_scale", &global_scale); v->Visit("skip_conv_layers", &skip_conv_layers); - v->Visit("skip_k_dense", &skip_k_dense); - v->Visit("skip_dense_layers", &skip_dense_layers); v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); From 6c2e142b05e41ebcaf01d346491bd07099436d93 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:17:03 -0700 Subject: [PATCH 120/126] merge fixes --- python/tvm/relay/quantize/_annotate.py | 20 ++++++++++++-------- python/tvm/relay/quantize/quantize.py | 14 
-------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 4bb345ee47dd..90bb2d08a8ed 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -199,8 +199,6 @@ def dense_rewrite(ref_call, new_args, ctx): if check_to_skip(): return None - _set_dense_counter(cnt + 1) - lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) @@ -226,6 +224,7 @@ def multiply_rewrite(ref_call, new_args, ctx): if lhs_kind is None and rhs_kind is None: return None + if lhs_kind in [QAnnotateKind.ACTIVATION, QAnnotateKind.INPUT] and rhs_kind is None: # quantize lhs to INPUT field if lhs_kind == QAnnotateKind.ACTIVATION: @@ -234,6 +233,7 @@ def multiply_rewrite(ref_call, new_args, ctx): rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + raise ValueError @@ -325,6 +325,7 @@ def pool2d_rewrite(ref_call, new_args, ctx): return None if x_kind == QAnnotateKind.ACTIVATION: expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT) + expr = _forward_op(ref_call, [expr]) return QAnnotateExpr(expr, QAnnotateKind.INPUT) @@ -335,8 +336,9 @@ def pool2d_rewrite(ref_call, new_args, ctx): @register_annotate_function("annotation.force_cast") def force_cast_rewrite(ref_call, new_args, ctx): """Rewrite function to force cast""" - if _conv_counter() <= current_qconfig().skip_k_conv: + if check_to_skip(): return None + expr, x_kind = _get_expr_kind(new_args[0]) if x_kind is None: @@ -395,11 +397,13 @@ def vta_expr_check(expr): @register_vta_rewrite("nn.conv2d") def conv2d_vta_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d for VTA target""" - cnt = _conv_counter() - if cnt < current_qconfig().skip_k_conv: - _set_conv_counter(cnt + 1) - return None - _set_conv_counter(cnt + 1) + actx = annotate_context() + if current_qconfig().skip_conv_layers is not None: + skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] + if actx.conv2d_counter() in skipped_indices: + actx.count_conv2d() + return None + actx.count_conv2d() data_cond, data = vta_expr_check(new_args[0]) kernel_cond, kernel = vta_expr_check(new_args[1]) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index f209590f7ef7..c24cb153fba5 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -199,20 +199,6 @@ def annotate_context(): return AnnotateContext.Current -_DENSE_COUNTER = 0 - - -def _dense_counter(): - """Get the global counter for dense.""" - return _DENSE_COUNTER - - -def _set_dense_counter(n): - """Set the value of the global dense counter.""" - global _DENSE_COUNTER - _DENSE_COUNTER = n - - def calibrate(graph, mod=None, ctx=None): """The calibrate procedure will try to calculate the content of dom_scale, nbit, clip_min, clip_max for every `simulated_quantize` From 288883af5a16f0477b96462faec8d6ff10f4b6d0 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:21:32 -0700 Subject: [PATCH 121/126] merge fix --- src/relay/pass/quantize.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index 75897b75831d..1503d67feaf1 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -388,7 +388,6 @@ Array UnifyDTypeScale(const Array& ref_args, const 
Array& args, DataType* dtype_ptr, Expr* scale_ptr) { - static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; @@ -413,20 +412,6 @@ Array UnifyDTypeScale(const Array& ref_args, LOG(FATAL) << "should not touch here."; } - for (size_t i = 0; i < ret.size(); ++i) { - auto ref_arg = ref_args[i].as(); - if (nptrs[i]->dtype != dtype) { - ret.Set(i, Cast(ret[i], dtype)); - } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && - ref_arg->attrs.as()->kind == kQInput) { - auto new_arg = Cast(ret[i], cfg->dtype_input); - if (cfg->store_lowbit_output) { - new_arg = StopFusion(new_arg); - } - ret.Set(i, Cast(new_arg, dtype)); - } - } - // unify the dom_scale float s = ChooseDomScale(nptrs); Expr dom_scale = MakeConstantScalar(Float(32), s); From 4a61b1feecd7ecc3f09de265e7fe2e10c9fe3337 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:47:31 -0700 Subject: [PATCH 122/126] guard to avoid errors when target is set as string --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 29190fa43324..62d4b27fa303 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name == "vta": + if target.device_name and target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From bf6df6927fb7ab2077d0efbb59bef330d2a22077 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:48:44 -0700 Subject: [PATCH 123/126] reverting fix --- tests/python/unittest/test_graph_tuner_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 6e06b44b3ea9..240da7f88628 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -117,7 +117,7 @@ def _create_data(target, dshape, dtype, layout): def test_graph_tuner_layout_transform(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = tvm.target.arm_cpu() + target = "llvm" dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" @@ -152,7 +152,7 @@ def test_graph_tuner_layout_transform(): def test_DPTuner_run(): log_file = "%s/test_tuner.log" % (os.getcwd()) - target = tvm.target.arm_cpu() + target = "llvm" dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) @@ -201,7 +201,7 @@ def test_DPTuner_run(): def test_PBQPTuner_run(): - target = tvm.target.arm_cpu() + target = "llvm" dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) From 0a5b599d8a1299cf6aaffe123979e871e0ebd692 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 10:50:16 -0700 Subject: [PATCH 124/126] fix --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 62d4b27fa303..d80443a208d6 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -41,7 +41,7 @@ def _build(func, from tvm import relay - if target.device_name and target.device_name == "vta": + if hasattr(target, 'device_name') and target.device_name == "vta": 
with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): From f8e629f7a40c32f169bb0fa890cbe9a9ac2b4bbe Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Tue, 25 Jun 2019 21:35:46 -0700 Subject: [PATCH 125/126] removing unused comment --- python/tvm/relay/quantize/quantize.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index c24cb153fba5..fa70e1954467 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -142,10 +142,6 @@ def qconfig(**kwargs): Specifying which layers to be skipped. Provide a list of indices that indicate which conv2d layers to leave untouched. - skip_dense_layers: list - Different way of specifying which dense layers to avoid. - Provide a list of indices that indicate which conv2d layers to leave untouched. - round_for_shift: boolean Whether to add bias for rounding during shift. From 867ebdf002d0d9eab7489630028c08dd6f200b23 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Wed, 26 Jun 2019 19:51:56 -0700 Subject: [PATCH 126/126] guarding against improperly initialized TVM targets --- python/tvm/autotvm/measure/measure_methods.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b2cf73f7dee1..36efc881958e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -359,8 +359,9 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti if cuda_arch: set_cuda_target_arch(cuda_arch) - if measure_input.target.device_name == 'vta': - # if target is vta, we need to use vta build + # if target is vta, we need to use vta build + if hasattr(measure_input.target, 'device_name') and \ + measure_input.target.device_name == 'vta': import vta func = vta.build(s, args, target_host=task.target_host) else: @@ -457,7 +458,8 @@ def run_through_rpc(measure_input, build_result, # upload built module remote = request_remote(*remote_args) # Program the FPGA every single time when targeting VTA - if measure_input.target.device_name == 'vta': + if hasattr(measure_input.target, 'device_name') and \ + measure_input.target.device_name == 'vta': from vta import program_fpga, reconfig_runtime program_fpga(remote, None) reconfig_runtime(remote)
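
The guards added in the final patches of this series exist because AutoTVM can be handed a target that was
supplied as a plain string rather than as a ``tvm.target.Target`` object, and only the latter carries a
``device_name`` attribute. A minimal sketch of the check, assuming the ``tvm.target.vta()`` helper used by the
conv2d tuning template earlier in this section::

    import tvm

    def needs_vta_build(target):
        # `target` may be a tvm.target.Target or a raw string such as "llvm".
        return hasattr(target, 'device_name') and target.device_name == 'vta'

    assert not needs_vta_build("llvm")                      # plain string: no device_name attribute
    assert not needs_vta_build(tvm.target.create("llvm"))   # generic LLVM target
    assert needs_vta_build(tvm.target.vta())                # VTA target dispatches to vta.build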