From ab7f5cdc6b882a9da9597385a05d7273d23573db Mon Sep 17 00:00:00 2001
From: Logan Weber <weberlo@cs.washington.edu>
Date: Tue, 1 Oct 2019 16:25:10 -0700
Subject: [PATCH 01/11] Prototype for micro TVM.

---
 python/tvm/autotvm/measure/local_executor.py  |   1 +
 python/tvm/autotvm/measure/measure_methods.py |   6 +-
 python/tvm/autotvm/task/space.py              |   6 +-
 python/tvm/autotvm/tuner/model_based_tuner.py |   1 +
 python/tvm/autotvm/tuner/tuner.py             |   8 +-
 python/tvm/contrib/binutil.py                 | 164 +++++--
 python/tvm/contrib/debugger/debug_runtime.py  |   1 +
 python/tvm/exec/rpc_server.py                 |  39 +-
 python/tvm/micro/__init__.py                  |   5 +-
 python/tvm/micro/base.py                      | 222 ++++++---
 python/tvm/micro/device/__init__.py           |   2 +-
 python/tvm/micro/device/arm/stm32f746xx.py    | 120 +++--
 python/tvm/micro/device/base.py               | 142 ++++--
 python/tvm/micro/device/host.py               | 101 ++--
 python/tvm/micro/device/riscv_spike.py        |  92 ++--
 python/tvm/relay/_parser.py                   |  15 +-
 python/tvm/relay/build_module.py              |  22 +-
 python/tvm/relay/frontend/keras.py            |   2 +-
 python/tvm/rpc/client.py                      |  10 +
 python/tvm/rpc/server.py                      |  11 +-
 python/tvm/rpc/tracker.py                     |   3 +
 python/tvm/runtime/module.py                  |   2 +-
 src/ir/error.cc                               |  19 +-
 .../micro/device/arm/stm32f746xx/utvm_init.s  |   1 +
 .../micro/device/arm/stm32f746xx/utvm_timer.c |  60 ++-
 src/runtime/micro/device/host/utvm_init.c     |   1 +
 src/runtime/micro/device/host/utvm_timer.c    |  32 +-
 .../micro/device/riscv_spike/utvm_init.s      |  29 ++
 .../micro/device/riscv_spike/utvm_timer.c     |  43 ++
 .../host_driven/utvm_device_dylib_redirect.c  |  40 +-
 src/runtime/micro/host_driven/utvm_runtime.h  |  45 +-
 src/runtime/micro/micro_common.h              |  41 +-
 src/runtime/micro/micro_device_api.cc         |  17 +-
 src/runtime/micro/micro_module.cc             |   4 +-
 src/runtime/micro/micro_section_allocator.h   |   9 +-
 src/runtime/micro/micro_session.cc            | 447 +++++++++++++-----
 src/runtime/micro/micro_session.h             |  89 +++-
 .../micro/target_data_layout_encoder.h        |  31 +-
 src/runtime/micro/tcl_socket.cc               |   6 +-
 src/runtime/rpc/rpc_session.cc                |  85 +++-
 src/runtime/rpc/rpc_session.h                 |  48 +-
 src/target/source/codegen_c.cc                |   6 +
 src/target/source/codegen_c_host.cc           |   5 +-
 src/target/source/codegen_c_host.h            |   2 +
 tests/python/unittest/test_runtime_micro.py   | 192 ++++++--
 .../topi/arm_cpu/conv2d_spatial_pack.py       |  21 +-
 topi/python/topi/arm_cpu/injective.py         |   3 +
 .../python/topi/testing/conv2d_nhwc_python.py |   6 +-
 48 files changed, 1642 insertions(+), 615 deletions(-)
 create mode 100644 src/runtime/micro/device/riscv_spike/utvm_init.s
 create mode 100644 src/runtime/micro/device/riscv_spike/utvm_timer.c

diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index cf81e2b50e50..d838a92daa01 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -145,6 +145,7 @@ def submit(self, func, *args, **kwargs):
         if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))
 
+        # TODO why they choose a queue size of 2? add a comment
         queue = Queue(2)
         process = Process(target=call_with_timeout,
                           args=(queue, self.timeout, func, args, kwargs))
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 6533e75eef93..00ecd2e98a06 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -249,6 +249,7 @@ def get_build_kwargs(self):
         return kwargs
 
     def run(self, measure_inputs, build_results):
+        print('[RPCRunner.run]')
         results = []
         remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
 
@@ -273,7 +274,9 @@ def run(self, measure_inputs, build_results):
                 if isinstance(res, Exception):   # executor error or timeout
                     results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
                                                  self.timeout, time.time()))
+                    #raise Exception(f'encountered exception during measurement: {results}')
                 else:
+                    print(f'  got a result: {res}')
                     results.append(res)
 
         return results
@@ -508,7 +511,8 @@ def run_through_rpc(measure_input, build_result,
             msg = msg[:msg.index("Stack trace returned")]
         if "CUDA Source" in msg:
             msg = msg[:msg.index("CUDA Source")]
-        costs = (RuntimeError(msg[:1024]),)
+        #costs = (RuntimeError(msg[:1024]),)
+        costs = (RuntimeError(msg),)
         errno = MeasureErrorNo.RUNTIME_DEVICE
     tstamp = time.time()
     time.sleep(cooldown_interval)
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index fbf474fc4df7..f8de1fbf32d5 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -544,8 +544,10 @@ def apply(self, sch, op, axes, axis_lens=None,
                 if ann == 'none':
                     pass
                 elif ann == 'unroll':
-                    if max_unroll and axis_lens[i] > max_unroll:
-                        cfg.raise_error("Too large factor for unrolling")
+                    #if max_unroll and axis_lens[i] > max_unroll:
+                    #    cfg.raise_error("Too large factor for unrolling")
+                    #if max_unroll and axis_lens[i] < max_unroll:
+                    #    cfg.raise_error("Too large factor for unrolling")
                     sch[op].unroll(axes[i])
                 elif ann == 'vec':
                     if vec_size and axis_lens[i] not in vec_size:
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index 432f7070c349..56fe5b4f3f72 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -263,6 +263,7 @@ def update(self, inputs, results):
         # if we have enough new training samples
         if len(self.xs) >= self.plan_size * (self.train_ct + 1) \
                 and self.flops_max > 1e-6:
+            import pdb; pdb.set_trace()
             self.cost_model.fit(self.xs, self.ys, self.plan_size)
             if self.diversity_filter_ratio:
                 candidate = self.model_optimizer.find_maximums(
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 2441a4ae642f..52d745104e63 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -150,7 +150,13 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr
                              i + k + 1, si_prefix, format_si_prefix(flops, si_prefix),
                              format_si_prefix(self.best_flops, si_prefix), res, config)
 
-            i += len(results)
+            for result in results:
+                if isinstance(result.costs[0], float):
+                    i += 1
+                else:
+                    print('[Tuner.tune]')
+                    print('  not counting failure towards trial count')
+            #i += len(results)
             self.ttl = min(early_stopping + self.best_iter, n_trial) - i
 
             self.update(inputs, results)
diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py
index 521e0885548c..9d6d469ba7cd 100644
--- a/python/tvm/contrib/binutil.py
+++ b/python/tvm/contrib/binutil.py
@@ -21,6 +21,82 @@
 import tvm._ffi
 from . import util
 
+# TODO does this file still belong in `contrib`. is it too µTVM-specific?
+
+# TODO shouldn't need so many `ALIGN` directives
+RELOCATION_LD_SCRIPT_TEMPLATE = """
+/* linker symbol for use in UTVMInit */
+_utvm_stack_pointer_init = 0x{stack_pointer_init:x};
+
+SECTIONS
+{{
+  . = 0x{text_start:x};
+  . = ALIGN({word_size});
+  .text :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.text))
+    KEEP(*(.text*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{rodata_start:x};
+  . = ALIGN({word_size});
+  .rodata :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.rodata))
+    KEEP(*(.rodata*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{data_start:x};
+  . = ALIGN({word_size});
+  .data :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.data))
+    KEEP(*(.data*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{bss_start:x};
+  . = ALIGN({word_size});
+  .bss :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.bss))
+    KEEP(*(.bss*))
+    . = ALIGN({word_size});
+  }}
+}}
+"""
+
+def run_cmd(cmd):
+    """Runs `cmd` in a subprocess and awaits its completion.
+
+    Parameters
+    ----------
+    cmd : List[str]
+        list of command-line arguments
+
+    Returns
+    -------
+    output : str
+        resulting stdout capture from the subprocess
+    """
+    proc = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT)
+    (output, _) = proc.communicate()
+    output = output.decode('utf-8')
+    if proc.returncode != 0:
+        cmd_str = ' '.join(cmd)
+        msg = f'error while running command \"{cmd_str}\":\n{output}'
+        raise RuntimeError(msg)
+    return output
+
 
 RELOCATION_LD_SCRIPT_TEMPLATE = """
 /* linker symbol for use in UTVMInit */
@@ -118,18 +194,18 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
         size of the section in bytes
     """
     if not os.path.isfile(binary_path):
-        raise RuntimeError("no such file \"{}\"".format(binary_path))
+        raise RuntimeError('no such file \"{}\"'.format(binary_path))
     # We use the "-A" flag here to get the ".rodata" section's size, which is
     # not included by default.
-    size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path])
+    size_output = run_cmd(['{}size'.format(toolchain_prefix), '-A', binary_path])
 
     # TODO(weberlo): Refactor this method and `*relocate_binary` so they are
     # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss".
     section_mapping = {
-        ".text": [".text"],
-        ".rodata": [".rodata"],
-        ".data": [".data", ".sdata"],
-        ".bss": [".bss", ".sbss"],
+        '.text': ['.text'],
+        '.rodata': ['.rodata'],
+        '.data': ['.data', '.sdata'],
+        '.bss': ['.bss', '.sbss'],
     }
     sections_to_sum = section_mapping["." + section_name]
     section_size = 0
@@ -148,7 +224,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
     # NOTE: For some reason, the size of the BSS section on the RISC-V
     # GCC is sometimes reported to be smaller than it is, so we need to adjust
     # for this.
-    if "riscv" in toolchain_prefix and section_name == "bss":
+    if 'riscv' in toolchain_prefix and section_name == 'bss':
         # TODO(weberlo): Figure out why 32 is the minimum constant that works.
         #
         # The current hypothesis is that the last symbols in the ".bss" and
@@ -160,7 +236,11 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
         # padding for most cases, but symbols can be arbitrarily large, so this
         # isn't bulletproof.
         return section_size + 32
-    return section_size
+    # TODO remove this arbitrary addition once we figure out why section sizes
+    # are being undercalculated.
+    # maybe stop relying on `*size` to give us the size and instead read the
+    # section with `*objcopy` and count the bytes.
+    return section_size + 8
 
 
 @tvm._ffi.register_func("tvm_callback_relocate_binary")
@@ -206,11 +286,13 @@ def tvm_callback_relocate_binary(
     rel_bin : bytearray
         the relocated binary
     """
+    assert text_start < rodata_start < data_start < bss_start < stack_end
     stack_pointer_init = stack_end - word_size
-    ld_script_contents = ""
+    ld_script_contents = ''
     # TODO(weberlo): There should be a better way to configure this for different archs.
-    if "riscv" in toolchain_prefix:
-        ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n"
+    # TODO is this line even necessary?
+    if 'riscv' in toolchain_prefix:
+        ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n'
     ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format(
         word_size=word_size,
         text_start=text_start,
@@ -220,17 +302,31 @@ def tvm_callback_relocate_binary(
         stack_pointer_init=stack_pointer_init)
 
     tmp_dir = util.tempdir()
-    rel_obj_path = tmp_dir.relpath("relocated.obj")
-    rel_ld_script_path = tmp_dir.relpath("relocated.lds")
-    with open(rel_ld_script_path, "w") as f:
+    rel_obj_path = tmp_dir.relpath('relocated.obj')
+    rel_ld_script_path = tmp_dir.relpath('relocate.lds')
+    with open(rel_ld_script_path, 'w') as f:
         f.write(ld_script_contents)
     run_cmd([
-        "{}ld".format(toolchain_prefix),
+        '{}ld'.format(toolchain_prefix),
         binary_path,
-        "-T", rel_ld_script_path,
-        "-o", rel_obj_path])
-    with open(rel_obj_path, "rb") as f:
+        '-T', rel_ld_script_path,
+        '-o', rel_obj_path])
+
+    with open(rel_obj_path, 'rb') as f:
         rel_bin = bytearray(f.read())
+
+    gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR']
+    gdb_init_path = f'{gdb_init_dir}/.gdbinit'
+    with open(gdb_init_path, 'r') as f:
+        gdbinit_contents = f.read().split('\n')
+    new_contents = []
+    for line in gdbinit_contents:
+        new_contents.append(line)
+        if line.startswith('target'):
+            new_contents.append(f'add-symbol-file {rel_obj_path}')
+    with open(gdb_init_path, 'w') as f:
+        f.write('\n'.join(new_contents))
+
     return rel_bin
 
 
@@ -255,22 +351,22 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix):
         contents of the read section
     """
     tmp_dir = util.tempdir()
-    tmp_bin = tmp_dir.relpath("temp.bin")
-    tmp_section = tmp_dir.relpath("tmp_section.bin")
-    with open(tmp_bin, "wb") as out_file:
+    tmp_bin = tmp_dir.relpath('temp.bin')
+    tmp_section = tmp_dir.relpath('tmp_section.bin')
+    with open(tmp_bin, 'wb') as out_file:
         out_file.write(bytes(binary))
     run_cmd([
-        "{}objcopy".format(toolchain_prefix),
-        "--dump-section",
-        ".{}={}".format(section, tmp_section),
+        '{}objcopy'.format(toolchain_prefix),
+        '--dump-section',
+        '.{}={}'.format(section, tmp_section),
         tmp_bin])
     if os.path.isfile(tmp_section):
         # Get section content if it exists.
-        with open(tmp_section, "rb") as f:
+        with open(tmp_section, 'rb') as f:
             section_bin = bytearray(f.read())
     else:
         # Return empty bytearray if the section does not exist.
-        section_bin = bytearray("", "utf-8")
+        section_bin = bytearray('', 'utf-8')
     return section_bin
 
 
@@ -293,18 +389,18 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix):
         alternating newline-separated keys and values
     """
     tmp_dir = util.tempdir()
-    tmp_obj = tmp_dir.relpath("tmp_obj.bin")
-    with open(tmp_obj, "wb") as out_file:
+    tmp_obj = tmp_dir.relpath('tmp_obj.bin')
+    with open(tmp_obj, 'wb') as out_file:
         out_file.write(bytes(binary))
     nm_output = run_cmd([
-        "{}nm".format(toolchain_prefix),
-        "-C",
-        "--defined-only",
+        '{}nm'.format(toolchain_prefix),
+        '-C',
+        '--defined-only',
         tmp_obj])
     nm_output = nm_output.splitlines()
-    map_str = ""
+    map_str = ''
     for line in nm_output:
         line = line.split()
-        map_str += line[2] + "\n"
-        map_str += line[0] + "\n"
+        map_str += line[2] + '\n'
+        map_str += line[0] + '\n'
     return map_str
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index 848d7f57d1de..fd838c407617 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -181,6 +181,7 @@ def _run_debug(self):
         """
         self.debug_datum._time_list = [
             [float(t) * 1e-6] for t in self.run_individual(10, 1, 1)
+            #[float(t) * 1e-6] for t in self.run_individual(1, 1, 1)
         ]
         for i, node in enumerate(self.debug_datum.get_graph_nodes()):
             num_outputs = self.debug_datum.get_graph_node_output_num(node)
diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py
index dbb690267e2a..a3c43583f44d 100644
--- a/python/tvm/exec/rpc_server.py
+++ b/python/tvm/exec/rpc_server.py
@@ -20,6 +20,7 @@
 
 import argparse
 import ast
+import json
 import multiprocessing
 import sys
 import logging
@@ -75,8 +76,8 @@ def init_utvm(args):
             dev_config = json.load(dev_conf_file)
     else:
         dev_config_args = ast.literal_eval(args.utvm_dev_config_args)
-        default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config']
-        dev_config = default_config_func(*dev_config_args)
+        generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['generate_config']
+        dev_config = generate_config_func(*dev_config_args)
 
     if args.utvm_dev_config or args.utvm_dev_id:
         # add MicroTVM overrides
@@ -100,27 +101,33 @@ def server_shutdown():
     parser.add_argument('--port-end', type=int, default=9199,
                         help='The end search port of the RPC')
     parser.add_argument('--tracker', type=str,
-                        help="The address of RPC tracker in host:port format. "
-                             "e.g. (10.77.1.234:9190)")
+                        help=('The address of RPC tracker in host:port format. '
+                              'e.g. (10.77.1.234:9190)'))
     parser.add_argument('--key', type=str, default="",
-                        help="The key used to identify the device type in tracker.")
+                        help='The key used to identify the device type in tracker.')
     parser.add_argument('--silent', action='store_true',
-                        help="Whether run in silent mode.")
+                        help='Whether run in silent mode.')
     parser.add_argument('--load-library', type=str,
-                        help="Additional library to load")
+                        help='Additional library to load')
     parser.add_argument('--no-fork', dest='fork', action='store_false',
-                        help="Use spawn mode to avoid fork. This option \
-                         is able to avoid potential fork problems with Metal, OpenCL \
-                         and ROCM compilers.")
+                        help=('Use spawn mode to avoid fork. This option '
+                              'is able to avoid potential fork problems with Metal, OpenCL '
+                              'and ROCM compilers.'))
     parser.add_argument('--custom-addr', type=str,
-                        help="Custom IP Address to Report to RPC Tracker")
+                        help='Custom IP Address to Report to RPC Tracker')
     parser.add_argument('--utvm-dev-config', type=str,
-                        help='JSON config file for the target device (if using MicroTVM)')
-    parser.add_argument('--utvm-dev-id', type=str,
-                        help='Unique ID for the target device (if using MicroTVM)')
+                        help=('JSON config file for the target device (if using MicroTVM). '
+                              'This file should contain serialized output similar to that returned '
+                              "from the device module's generate_config. Can't be specified when "
+                              '--utvm-dev-config-args is specified.'))
     parser.add_argument('--utvm-dev-config-args', type=str,
-                        help=('Python list of literals required to generate a default'
-                              ' MicroTVM config (if --utvm-dev-id is specified)'))
+                        help=("Arguments to the device module's generate_config function. "
+                              'Must be a python literal parseable by literal_eval. If specified, the '
+                              "device configuration is generated using the device module's generate_config. "
+                              "Can't be specified when --utvm-dev-config is specified."))
+    parser.add_argument('--utvm-dev-id', type=str,
+                        help=('Unique ID for the target device (if using MicroTVM). Should '
+                              'match the name of a module underneath tvm.micro.device).'))
 
     parser.set_defaults(fork=True)
     args = parser.parse_args()
diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py
index 9e984c08fe2c..7c1389cc4eef 100644
--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -17,6 +17,7 @@
 """MicroTVM module for bare-metal backends"""
 
 from ..contrib import binutil
-from .base import Session, create_micro_mod, cross_compiler
-from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir
+from .base import DEVICE_SECTIONS
+from .base import Session, create_micro_mod, cross_compiler, LibType
+from .base import get_micro_host_driven_dir, get_micro_device_dir
 from . import device
diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py
index 9f50f9855303..5a1b71846630 100644
--- a/python/tvm/micro/base.py
+++ b/python/tvm/micro/base.py
@@ -25,9 +25,22 @@
 import tvm
 import tvm._ffi
 
+import tvm
 from tvm.contrib import util as _util
 from tvm.contrib import cc as _cc
 
+# all sections that comprise a device's memory layout, in order from lowest
+# starting address to highest
+DEVICE_SECTIONS = [
+    'text',
+    'rodata',
+    'data',
+    'bss',
+    'args',
+    'heap',
+    'workspace',
+    'stack',
+]
 
 class LibType(Enum):
     """Enumeration of library types that can be compiled and loaded onto a device"""
@@ -51,9 +64,9 @@ class Session:
     .. code-block:: python
 
       c_mod = ...  # some module generated with "c" as the target
-      dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666)
+      dev_config = micro.device.arm.stm32f746xx.default_config('127.0.0.1', 6666)
       with tvm.micro.Session(dev_config) as sess:
-          micro_mod = create_micro_mod(c_mod, dev_config)
+          micro_mod = sess.create_micro_mod(c_mod)
     """
 
     def __init__(self, config):
@@ -61,69 +74,72 @@ def __init__(self, config):
         # TODO(weberlo): add config validation
 
         # grab a binutil instance from the ID in the config
-        dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"])
-        self.create_micro_lib = dev_funcs["create_micro_lib"]
-        self.toolchain_prefix = config["toolchain_prefix"]
-        self.mem_layout = config["mem_layout"]
-        self.word_size = config["word_size"]
-        self.thumb_mode = config["thumb_mode"]
-        self.comms_method = config["comms_method"]
+        dev_funcs = tvm.micro.device.get_device_funcs(config['device_id'])
+        self.toolchain_prefix = config['toolchain_prefix']
+        self.mem_layout = config['mem_layout']
+        self.word_size = config['word_size']
+        self.thumb_mode = config['thumb_mode']
+        self.use_device_timer = config['use_device_timer']
+        self.comms_method = config['comms_method']
 
         # First, find and compile runtime library.
-        runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c")
+        runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c')
         tmp_dir = _util.tempdir()
-        runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj")
-        self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME)
-        #input(f"check {runtime_obj_path}: ")
-
-        comms_method = config["comms_method"]
-        if comms_method == "openocd":
-            server_addr = config["server_addr"]
-            server_port = config["server_port"]
-        elif comms_method == "host":
-            server_addr = ""
+        runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj')
+        dev_funcs['create_micro_lib'](runtime_obj_path, runtime_src_path, LibType.RUNTIME)
+
+        comms_method = config['comms_method']
+        if comms_method == 'openocd':
+            server_addr = config['server_addr']
+            server_port = config['server_port']
+        elif comms_method == 'host':
+            server_addr = ''
             server_port = 0
         else:
-            raise RuntimeError(f"unknown communication method: f{self.comms_method}")
+            raise RuntimeError(f'unknown communication method: f{self.comms_method}')
 
+        assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), 'not all sections have an assigned memory layout'
         self.module = _CreateSession(
             comms_method,
             runtime_obj_path,
             self.toolchain_prefix,
-            self.mem_layout["text"].get("start", 0),
-            self.mem_layout["text"]["size"],
-            self.mem_layout["rodata"].get("start", 0),
-            self.mem_layout["rodata"]["size"],
-            self.mem_layout["data"].get("start", 0),
-            self.mem_layout["data"]["size"],
-            self.mem_layout["bss"].get("start", 0),
-            self.mem_layout["bss"]["size"],
-            self.mem_layout["args"].get("start", 0),
-            self.mem_layout["args"]["size"],
-            self.mem_layout["heap"].get("start", 0),
-            self.mem_layout["heap"]["size"],
-            self.mem_layout["workspace"].get("start", 0),
-            self.mem_layout["workspace"]["size"],
-            self.mem_layout["stack"].get("start", 0),
-            self.mem_layout["stack"]["size"],
+            self.mem_layout['text'].get('start', 0),
+            self.mem_layout['text']['size'],
+            self.mem_layout['rodata'].get('start', 0),
+            self.mem_layout['rodata']['size'],
+            self.mem_layout['data'].get('start', 0),
+            self.mem_layout['data']['size'],
+            self.mem_layout['bss'].get('start', 0),
+            self.mem_layout['bss']['size'],
+            self.mem_layout['args'].get('start', 0),
+            self.mem_layout['args']['size'],
+            self.mem_layout['heap'].get('start', 0),
+            self.mem_layout['heap']['size'],
+            self.mem_layout['workspace'].get('start', 0),
+            self.mem_layout['workspace']['size'],
+            self.mem_layout['stack'].get('start', 0),
+            self.mem_layout['stack']['size'],
             self.word_size,
             self.thumb_mode,
+            self.use_device_timer,
             server_addr,
             server_port)
-        self._enter = self.module["enter"]
-        self._exit = self.module["exit"]
+        self._enter = self.module['enter']
+        self._exit = self.module['exit']
+        self.get_last_batch_time = self.module['get_last_batch_time']
+        self.get_last_batch_cycles = self.module['get_last_batch_cycles']
 
     def _check_system(self):
         """Check if the user's system is supported by MicroTVM.
 
         Raises error if not supported.
         """
-        if not sys.platform.startswith("linux"):
-            raise RuntimeError("MicroTVM is currently only supported on Linux hosts")
+        if not sys.platform.startswith('linux'):
+            raise RuntimeError('MicroTVM is currently only supported on Linux')
         # TODO(weberlo): Add 32-bit support.
         # It's primarily the compilation pipeline that isn't compatible.
         if sys.maxsize <= 2**32:
-            raise RuntimeError("MicroTVM is currently only supported on 64-bit host platforms")
+            raise RuntimeError('MicroTVM is currently only supported on 64-bit platforms')
 
     def __enter__(self):
         self._enter()
@@ -133,44 +149,90 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
         self._exit()
 
 
-def create_micro_mod(c_mod, dev_config):
+def _calc_max_workspace_usage(src):
+    # TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size)
+    import re
+    alloc_re = re.compile(r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*')
+    free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*')
+    max_usage = 0
+    alloc_map = {}
+    for line in src.split('\n'):
+        if line.strip().startswith('//'):
+            continue
+        match = alloc_re.match(line)
+        if match is not None:
+            alloc_map[match.group(1)] = int(match.group(3))
+            max_usage = max(max_usage, sum(alloc_map.values()))
+        else:
+            match = free_re.match(line)
+            if match is not None:
+                print(alloc_map)
+                del alloc_map[match.group(2)]
+    return max_usage
+
+
+def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, lib_include_paths=None):
     """Produces a micro module from a given module.
 
     Parameters
     ----------
-    c_mod : tvm.runtime.Module
+    c_mod : tvm.module.Module
         module with "c" as its target backend
 
-    dev_config : Dict[str, Any]
-        MicroTVM config dict for the target device
+    lib_src_paths: TODO
+        TODO
+
+    lib_headers: TODO
+        TODO
+
+    lib_include_paths: TODO
+        TODO
 
     Return
     ------
-    micro_mod : tvm.runtim.Module
+    micro_mod : tvm.module.Module
         micro module for the target device
     """
+    print('[create_micro_mod]')
     temp_dir = _util.tempdir()
-    lib_obj_path = temp_dir.relpath("dev_lib.obj")
+    lib_obj_path = temp_dir.relpath('dev_lib.obj')
+    # TODO use dev config to dispatch on the type of C codegen to run through
+    # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV)
     c_mod.export_library(
-        lib_obj_path,
-        fcompile=cross_compiler(dev_config, LibType.OPERATOR))
-    micro_mod = tvm.runtime.load_module(lib_obj_path)
+            lib_obj_path,
+            fcompile=cross_compiler(
+                dev_config,
+                LibType.OPERATOR,
+                lib_src_paths=lib_src_paths,
+                lib_headers=lib_headers,
+                lib_include_paths=lib_include_paths))
+    micro_mod = tvm.module.load(lib_obj_path)
     return micro_mod
 
 
-def cross_compiler(dev_config, lib_type):
-    """Create a cross-compile function that wraps `create_lib` for a `Binutil` instance.
+def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, lib_include_paths=None):
+    """Create a cross compile function that wraps `create_lib` for a `Binutil` instance.
 
     For use in `tvm.runtime.Module.export_library`.
 
     Parameters
     ----------
-    dev_config : Dict[str, Any]
-        MicroTVM config dict for the target device
+    create_micro_lib : func
+        function for creating MicroTVM libraries for a specific device (e.g.,
+        `tvm.micro.device.get_device_funcs('arm.stm32f746xx')['create_micro_lib']`)
 
     lib_type : micro.LibType
         whether to compile a MicroTVM runtime or operator library
 
+    lib_src_paths: TODO
+        TODO
+
+    lib_headers: TODO
+        e.g., `['cmsis_gcc.h', 'arm_math.h']`
+
+    lib_include_paths: TODO
+        TODO
+
     Return
     ------
     func : Callable[[str, str, Optional[str]], None]
@@ -183,17 +245,47 @@ def cross_compiler(dev_config, lib_type):
 
       c_mod = ...  # some module generated with "c" as the target
       fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR)
-      c_mod.export_library("dev_lib.obj", fcompile=fcompile)
+      c_mod.export_library('dev_lib.obj', fcompile=fcompile)
     """
-    dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id'])
-    create_micro_lib = dev_funcs['create_micro_lib']
+    assert (lib_headers is None) == (lib_include_paths is None), 'must specify both `lib_headers` and `lib_include_paths` or neither'
+
+    if lib_src_paths is None:
+        lib_src_paths = []
+    if lib_include_paths is None:
+        lib_include_paths = []
+    include_options = []
+    for include_path in lib_include_paths:
+        include_options.append('-I')
+        include_options.append(include_path)
+    create_micro_lib = tvm.micro.device.get_device_funcs(dev_config['device_id'])['create_micro_lib']
+    mem_layout = dev_config['mem_layout']
+
     def compile_func(obj_path, src_path, **kwargs):
         if isinstance(obj_path, list):
             obj_path = obj_path[0]
         if isinstance(src_path, list):
             src_path = src_path[0]
-        create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None))
-    return _cc.cross_compiler(compile_func, output_format="obj")
+        options = kwargs.get('options', [])
+        options += include_options
+
+        # check that workspace allocations don't exceed available workspace memory
+        with open(src_path) as f:
+            src_contents = f.read()
+            max_ws_usage = _calc_max_workspace_usage(src_contents)
+            available_mem = mem_layout['workspace']['size']
+            if max_ws_usage > available_mem:
+                raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) exceed available memory ({available_mem})')
+        # inject headers into new source path, if requested
+        if lib_headers:
+            headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n'
+            new_src_contents = headers_to_inject + src_contents
+            tmp_dir = _util.tempdir()
+            src_path = tmp_dir.relpath(os.path.basename(src_path))
+            with open(src_path, 'w') as f:
+                f.write(new_src_contents)
+
+        create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths)
+    return _cc.cross_compiler(compile_func, output_format='obj')
 
 
 def get_micro_host_driven_dir():
@@ -205,8 +297,8 @@ def get_micro_host_driven_dir():
         directory path
     """
     micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    micro_host_driven_dir = os.path.join(micro_dir, "..", "..", "..",
-                                         "src", "runtime", "micro", "host_driven")
+    micro_host_driven_dir = os.path.join(micro_dir, '..', '..', '..',
+                                         'src', 'runtime', 'micro', 'host_driven')
     return micro_host_driven_dir
 
 
@@ -219,9 +311,9 @@ def get_micro_device_dir():
         directory path
     """
     micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    micro_device_dir = os.path.join(micro_dir, "..", "..", "..",
-                                    "src", "runtime", "micro", "device")
+    micro_device_dir = os.path.join(micro_dir, '..', '..', '..',
+                                    'src', 'runtime', 'micro', 'device')
     return micro_device_dir
 
 
-tvm._ffi._init_api("tvm.micro", "tvm.micro.base")
+tvm._ffi._init_api('tvm.micro', 'tvm.micro.base')
diff --git a/python/tvm/micro/device/__init__.py b/python/tvm/micro/device/__init__.py
index 1ccd6847edd8..3d2291c6a052 100644
--- a/python/tvm/micro/device/__init__.py
+++ b/python/tvm/micro/device/__init__.py
@@ -16,7 +16,7 @@
 # under the License.
 """Device-specific configuration for MicroTVM"""
 
-from .base import register_device, get_device_funcs, create_micro_lib_base
+from .base import create_micro_lib_base, gen_mem_layout, MemConstraint, register_device, get_device_funcs
 from . import host
 from . import arm
 from . import riscv_spike
diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py
index 31b44cf9d36b..8cd354738fe7 100644
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
@@ -14,13 +14,31 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Compilation and config definitions for ARM STM32F746XX devices"""
-from .. import create_micro_lib_base, register_device
+"""Compilation and config definitions for Arm STM32F746XX devices"""
+from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = "arm.stm32f746xx"
-TOOLCHAIN_PREFIX = "arm-none-eabi-"
+DEVICE_ID = 'arm.stm32f746xx'
+TOOLCHAIN_PREFIX = 'arm-none-eabi-'
+WORD_SIZE = 4
+#
+# [Device Memory Layout]
+#   RAM   (rwx) : START = 0x20000000, LENGTH = 320K
+#   Flash (rx)  : START = 0x8000000,  LENGTH = 1024K
+#
+BASE_ADDR = 0x20000000
+AVAILABLE_MEM = 320000
+DEFAULT_SECTION_CONSTRAINTS = {
+    'text': (18000, MemConstraint.ABSOLUTE_BYTES),
+    'rodata': (100, MemConstraint.ABSOLUTE_BYTES),
+    'data': (100, MemConstraint.ABSOLUTE_BYTES),
+    'bss': (600, MemConstraint.ABSOLUTE_BYTES),
+    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
+    'heap': (100.0, MemConstraint.WEIGHT),
+    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
+    'stack': (32, MemConstraint.ABSOLUTE_BYTES),
+}
 
-def create_micro_lib(obj_path, src_path, lib_type, options=None):
+def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
     """Wrapper over `create_micro_lib_base` to add device-specific options
 
     Parameters
@@ -36,23 +54,33 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
 
     options : Optional[List[str]]
         additional options to pass to GCC
+
+    lib_src_paths : Optional[List[str]]
+        TODO
     """
     if options is None:
         options = []
     options += [
-        "-mcpu=cortex-m7",
-        "-mlittle-endian",
-        "-mfloat-abi=hard",
-        "-mfpu=fpv5-sp-d16",
-        "-mthumb",
-        "-gdwarf-5",
+        '-march=armv7e-m',
+        '-mcpu=cortex-m7',
+        '-mlittle-endian',
+        '-mfloat-abi=hard',
+        # TODO try this one?
+        #'-mfpu=fpv5-d16',
+        '-mfpu=fpv5-sp-d16',
+        '-mthumb',
+        '-ffast-math',
+        '-gdwarf-5',
+        '-DARM_MATH_CM7',
+        '-D__FPU_PRESENT=1U',
+        '-DARM_MATH_DSP',
         ]
     create_micro_lib_base(
-        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths)
 
 
-def default_config(server_addr, server_port):
-    """Generates a default configuration for ARM STM32F746XX devices
+def generate_config(server_addr, server_port, section_constraints=None):
+    """Generates a configuration for Arm STM32F746XX devices
 
     Parameters
     ----------
@@ -62,62 +90,30 @@ def default_config(server_addr, server_port):
     server_port : int
         port of OpenOCD server to connect to
 
+    section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]]
+        TODO correct type annotation?
+
     Return
     ------
     config : Dict[str, Any]
         MicroTVM config dict for this device
     """
+    if section_constraints is None:
+        section_constraints = DEFAULT_SECTION_CONSTRAINTS
     return {
-        "device_id": DEVICE_ID,
-        "toolchain_prefix": TOOLCHAIN_PREFIX,
-        #
-        # [Device Memory Layout]
-        #   RAM   (rwx) : START = 0x20000000, LENGTH = 320K
-        #   FLASH (rx)  : START = 0x8000000,  LENGTH = 1024K
-        #
-        "mem_layout": {
-            "text": {
-                "start": 0x20000180,
-                "size": 20480,
-            },
-            "rodata": {
-                "start": 0x20005180,
-                "size": 20480,
-            },
-            "data": {
-                "start": 0x2000a180,
-                "size": 768,
-            },
-            "bss": {
-                "start": 0x2000a480,
-                "size": 768,
-            },
-            "args": {
-                "start": 0x2000a780,
-                "size": 1280,
-            },
-            "heap": {
-                "start": 0x2000ac80,
-                "size": 262144,
-            },
-            "workspace": {
-                "start": 0x2004ac80,
-                "size": 20480,
-            },
-            "stack": {
-                "start": 0x2004fc80,
-                "size": 80,
-            },
-        },
-        "word_size": 4,
-        "thumb_mode": True,
-        "comms_method": "openocd",
-        "server_addr": server_addr,
-        "server_port": server_port,
+        'device_id': DEVICE_ID,
+        'toolchain_prefix': TOOLCHAIN_PREFIX,
+        'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints),
+        'word_size': WORD_SIZE,
+        'thumb_mode': True,
+        'use_device_timer': False,
+        'comms_method': 'openocd',
+        'server_addr': server_addr,
+        'server_port': server_port,
     }
 
 
 register_device(DEVICE_ID, {
-    "create_micro_lib": create_micro_lib,
-    "default_config": default_config,
+    'create_micro_lib': create_micro_lib,
+    'generate_config': generate_config,
 })
diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py
index ae53b9cc539f..41d1a3ed1a95 100644
--- a/python/tvm/micro/device/base.py
+++ b/python/tvm/micro/device/base.py
@@ -17,12 +17,14 @@
 """Base definitions for MicroTVM config"""
 import glob
 import os
-from pathlib import Path
+import enum
+import pathlib
+import operator
 
 from tvm.contrib import util as _util
 from tvm.contrib.binutil import run_cmd
 from tvm._ffi.libinfo import find_include_path
-from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir
+from tvm.micro import DEVICE_SECTIONS, LibType, get_micro_host_driven_dir, get_micro_device_dir
 
 _DEVICE_REGISTRY = {}
 
@@ -38,7 +40,7 @@ def register_device(device_id, device_funcs):
         dictionary with compilation and config generation functions as values
     """
     if device_id in _DEVICE_REGISTRY:
-        raise RuntimeError(f"\"{device_id}\" already exists in the device registry")
+        raise RuntimeError(f'"{device_id}" already exists in the device registry')
     _DEVICE_REGISTRY[device_id] = device_funcs
 
 
@@ -56,7 +58,7 @@ def get_device_funcs(device_id):
         dictionary with compilation and config generation functions as values
     """
     if device_id not in _DEVICE_REGISTRY:
-        raise RuntimeError(f"\"{device_id}\" does not exist in the binutil registry")
+        raise RuntimeError(f'"{device_id}" does not exist in the binutil registry')
     device_funcs = _DEVICE_REGISTRY[device_id]
     return device_funcs
 
@@ -67,7 +69,9 @@ def create_micro_lib_base(
         toolchain_prefix,
         device_id,
         lib_type,
-        options=None):
+        options=None,
+        lib_src_paths=None,
+        ):
     """Compiles code into a binary for the target micro device.
 
     Parameters
@@ -92,21 +96,35 @@ def create_micro_lib_base(
 
     options : List[str]
         additional options to pass to GCC
+
+    lib_src_paths : Optional[List[str]]
+        TODO
     """
+    print('[MicroBinutil.create_lib]')
+    print('  EXTENDED OPTIONS')
+    print(f'    {out_obj_path}')
+    print(f'    {in_src_path}')
+    print(f'    {lib_type}')
+    print(f'    {options}')
+    # look at these (specifically `strip`):
+    #   https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size
     base_compile_cmd = [
-        f"{toolchain_prefix}gcc",
-        "-std=c11",
-        "-Wall",
-        "-Wextra",
-        "--pedantic",
-        "-c",
-        "-O0",
-        "-g",
-        "-nostartfiles",
-        "-nodefaultlibs",
-        "-nostdlib",
-        "-fdata-sections",
-        "-ffunction-sections",
+        f'{toolchain_prefix}gcc',
+        '-std=c11',
+        '-Wall',
+        '-Wextra',
+        '--pedantic',
+        '-c',
+        # TODO(weberlo): make a debug flag
+        '-O0',
+        # '-O2',
+        # '-Os',
+        '-g',
+        '-nostartfiles',
+        '-nodefaultlibs',
+        '-nostdlib',
+        '-fdata-sections',
+        '-ffunction-sections',
         ]
     if options is not None:
         base_compile_cmd += options
@@ -114,49 +132,101 @@ def create_micro_lib_base(
     src_paths = []
     include_paths = find_include_path() + [get_micro_host_driven_dir()]
     tmp_dir = _util.tempdir()
-    # we might transform the src path in one of the branches below
+    # we need to create a new src file in the operator branch
     new_in_src_path = in_src_path
     if lib_type == LibType.RUNTIME:
         dev_dir = _get_device_source_dir(device_id)
-        dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
+
+        dev_src_paths = glob.glob(f'{dev_dir}/*.[csS]')
         # there needs to at least be a utvm_timer.c file
         assert dev_src_paths
-        assert "utvm_timer.c" in map(os.path.basename, dev_src_paths)
+        assert 'utvm_timer.c' in map(os.path.basename, dev_src_paths)
+
         src_paths += dev_src_paths
     elif lib_type == LibType.OPERATOR:
-        # create a temporary copy of the source, so we can inject the dev lib
+        # create a temporary copy of the operator source, so we can inject the dev lib
         # header without modifying the original.
-        temp_src_path = tmp_dir.relpath("temp.c")
-        with open(in_src_path, "r") as f:
+        temp_src_path = tmp_dir.relpath('temp.c')
+        with open(in_src_path, 'r') as f:
             src_lines = f.read().splitlines()
-        src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"")
-        with open(temp_src_path, "w") as f:
-            f.write("\n".join(src_lines))
+        src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"')
+        with open(temp_src_path, 'w') as f:
+            f.write('\n'.join(src_lines))
         new_in_src_path = temp_src_path
-        base_compile_cmd += ["-c"]
     else:
-        raise RuntimeError("unknown lib type")
+        raise RuntimeError('unknown lib type')
 
     src_paths += [new_in_src_path]
 
+    # add any src paths required by the operator
+    if lib_src_paths is not None:
+        src_paths += lib_src_paths
+
+    print(f'include paths: {include_paths}')
     for path in include_paths:
-        base_compile_cmd += ["-I", path]
+        base_compile_cmd += ['-I', path]
 
     prereq_obj_paths = []
+    print(src_paths)
     for src_path in src_paths:
-        curr_obj_path = Path(src_path).with_suffix(".o").name
+        curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name)
         assert curr_obj_path not in prereq_obj_paths
         prereq_obj_paths.append(curr_obj_path)
-        curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
+        curr_compile_cmd = base_compile_cmd + [src_path, '-o', curr_obj_path]
+        # TODO(weberlo): make compilation fail if there are any warnings
         run_cmd(curr_compile_cmd)
 
-    ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
+    ld_cmd = [f'{toolchain_prefix}ld', '-relocatable']
     ld_cmd += prereq_obj_paths
-    ld_cmd += ["-o", out_obj_path]
+    ld_cmd += ['-o', out_obj_path]
     run_cmd(ld_cmd)
 
 
+# TODO we shouldn't need an enum for this. too much bureaucracy.
+class MemConstraint(enum.Enum):
+    """Represents a constraint on the device's memory layout"""
+    ABSOLUTE_BYTES = 0
+    WEIGHT = 1
+
+
+def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
+    print('[gen_mem_layout]')
+    byte_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.ABSOLUTE_BYTES, section_constraints.values())))
+    weight_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.WEIGHT, section_constraints.values())))
+    assert byte_sum <= available_mem
+    available_weight_mem = available_mem - byte_sum
+
+    res = {}
+    curr_addr = base_addr
+    for section in DEVICE_SECTIONS:
+        (val, cons_type) = section_constraints[section]
+        if cons_type == MemConstraint.ABSOLUTE_BYTES:
+            assert val % word_size == 0, f'constraint {val} for {section} section is not word-aligned'
+            size = val
+            res[section] = {
+                'start': curr_addr,
+                'size': size,
+            }
+        else:
+            size = int((val / weight_sum) * available_weight_mem)
+            size = (size // word_size) * word_size
+            res[section] = {
+                'start': curr_addr,
+                'size': size,
+            }
+        curr_addr += size
+
+    print('  result mem layout:')
+    for section in DEVICE_SECTIONS:
+        start = res[section]['start']
+        size = res[section]['size']
+        print(f'    {section}: start={start:x}, size={size}')
+    # import pprint
+    # pprint.pprint(res)
+    return res
+
+
 def _get_device_source_dir(device_id):
     """Grabs the source directory for device-specific uTVM files"""
-    dev_subdir = "/".join(device_id.split("."))
-    return get_micro_device_dir() + "/" + dev_subdir
+    dev_subdir = '/'.join(device_id.split('.'))
+    return get_micro_device_dir() + '/' + dev_subdir
diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py
index a5495b60cf99..737026dfd51a 100644
--- a/python/tvm/micro/device/host.py
+++ b/python/tvm/micro/device/host.py
@@ -17,12 +17,26 @@
 """Compilation and config definitions for the host emulated device"""
 import sys
 
-from . import create_micro_lib_base, register_device
+from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = "host"
-TOOLCHAIN_PREFIX = ""
+DEVICE_ID = 'host'
+TOOLCHAIN_PREFIX = ''
+WORD_SIZE = 8 if sys.maxsize > 2**32 else 4
 
-def create_micro_lib(obj_path, src_path, lib_type, options=None):
+# we pretend we only have 320kb in the default case, so we can use `gen_mem_layout`
+DEFAULT_AVAILABLE_MEM = 3200000
+DEFAULT_SECTION_CONSTRAINTS = {
+    'text': (20480, MemConstraint.ABSOLUTE_BYTES),
+    'rodata': (20480, MemConstraint.ABSOLUTE_BYTES),
+    'data': (768, MemConstraint.ABSOLUTE_BYTES),
+    'bss': (4096, MemConstraint.ABSOLUTE_BYTES),
+    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
+    'heap': (262144, MemConstraint.ABSOLUTE_BYTES),
+    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
+    'stack': (80, MemConstraint.ABSOLUTE_BYTES),
+}
+
+def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
     """Wrapper over `create_micro_lib_base` to add device-specific options
 
     Parameters
@@ -38,59 +52,62 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
 
     options : Optional[List[str]]
         additional options to pass to GCC
+
+    lib_src_paths : Optional[List[str]]
+        TODO
     """
     if options is None:
         options = []
-    if sys.maxsize > 2**32 and sys.platform.startswith("linux"):
-        options += ["-mcmodel=large"]
+    if sys.maxsize > 2**32 and sys.platform.startswith('linux'):
+        options += ['-mcmodel=large']
     create_micro_lib_base(
-        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths)
 
 
-def default_config():
-    """Generates a default configuration for the host emulated device
+def generate_config(available_mem=None, section_constraints=None):
+    """Generates a configuration for the host emulated device
+
+    Parameters
+    ----------
+    TODO correct type annotation?
+    available_mem: int
+        TODO
+
+    TODO correct type annotation?
+    section_constraints: Optional[Dict[str, Dict[str, Number]]]
+        TODO
 
     Return
     ------
     config : Dict[str, Any]
         MicroTVM config dict for this device
     """
+    if available_mem is None:
+        available_mem = DEFAULT_AVAILABLE_MEM
+    if section_constraints is None:
+        section_constraints = DEFAULT_SECTION_CONSTRAINTS
+    mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE, section_constraints)
+    # TODO the host emulated device is an outlier, since we don't know how what
+    # its base address will be until we've created it in the C++. is there any
+    # way to change the infrastructure around this so it's not so much of an
+    # outlier?
+
+    # need to zero out all start addresses, because they don't make sense for a
+    # host device (the memory region is allocated in the backend)
+    for section in mem_layout:
+        mem_layout[section]['start'] = 0
     return {
-        "device_id": DEVICE_ID,
-        "toolchain_prefix": TOOLCHAIN_PREFIX,
-        "mem_layout": {
-            "text": {
-                "size": 20480,
-            },
-            "rodata": {
-                "size": 20480,
-            },
-            "data": {
-                "size": 768,
-            },
-            "bss": {
-                "size": 768,
-            },
-            "args": {
-                "size": 1280,
-            },
-            "heap": {
-                "size": 262144,
-            },
-            "workspace": {
-                "size": 20480,
-            },
-            "stack": {
-                "size": 80,
-            },
-        },
-        "word_size": 8 if sys.maxsize > 2**32 else 4,
-        "thumb_mode": False,
-        "comms_method": "host",
+        'device_id': DEVICE_ID,
+        'toolchain_prefix': TOOLCHAIN_PREFIX,
+        'mem_layout': mem_layout,
+        'word_size': WORD_SIZE,
+        'thumb_mode': False,
+        'use_device_timer': False,
+        'comms_method': 'host',
     }
 
 
 register_device(DEVICE_ID, {
-    "create_micro_lib": create_micro_lib,
-    "default_config": default_config,
+    'create_micro_lib': create_micro_lib,
+    'generate_config': generate_config,
 })
diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py
index 923e5dfb23a2..b7beff3c5cb6 100644
--- a/python/tvm/micro/device/riscv_spike.py
+++ b/python/tvm/micro/device/riscv_spike.py
@@ -17,12 +17,24 @@
 """Compilation and config definitions for Spike, a RISC-V functional ISA simulator"""
 from collections import OrderedDict
 
-from . import create_micro_lib_base, register_device
+from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = "riscv_spike"
-TOOLCHAIN_PREFIX = "riscv64-unknown-elf-"
+DEVICE_ID = 'riscv_spike'
+TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-'
+WORD_SIZE = 8
 
-def create_micro_lib(obj_path, src_path, lib_type, options=None):
+DEFAULT_SECTION_CONSTRAINTS = {
+    'text': (18000, MemConstraint.ABSOLUTE_BYTES),
+    'rodata': (128, MemConstraint.ABSOLUTE_BYTES),
+    'data': (128, MemConstraint.ABSOLUTE_BYTES),
+    'bss': (2048, MemConstraint.ABSOLUTE_BYTES),
+    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
+    'heap': (100.0, MemConstraint.WEIGHT),
+    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
+    'stack': (32, MemConstraint.ABSOLUTE_BYTES),
+}
+
+def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
     """Wrapper over `create_micro_lib_base` to add device-specific options
 
     Parameters
@@ -38,6 +50,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
 
     options : Optional[List[str]]
         additional options to pass to GCC
+
+    lib_src_paths : Optional[List[str]]
+        TODO
     """
     create_micro_lib_base(
         obj_path,
@@ -45,11 +60,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
         TOOLCHAIN_PREFIX,
         DEVICE_ID,
         lib_type,
-        options=options)
+        options=options,
+        lib_src_paths=lib_src_paths
+        )
 
 
-def default_config(base_addr, server_addr, server_port):
-    """Generates a default configuration for Spike
+def generate_config(base_addr, available_mem, server_addr, server_port, section_constraints=None):
+    """Generates a configuration for Spike
 
     Parameters
     ----------
@@ -62,56 +79,31 @@ def default_config(base_addr, server_addr, server_port):
     server_port : int
         port of OpenOCD server to connect to
 
+    TODO correct type annotation?
+    section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]]
+        TODO
+
     Return
     ------
     config : Dict[str, Any]
         MicroTVM config dict for this device
     """
-    res = {
-        "device_id": DEVICE_ID,
-        "toolchain_prefix": TOOLCHAIN_PREFIX,
-        "mem_layout": OrderedDict([
-            ("text", {
-                "size": 20480,
-            }),
-            ("rodata", {
-                "size": 20480,
-            }),
-            ("data", {
-                "size": 768,
-            }),
-            ("bss", {
-                "size": 768,
-            }),
-            ("args", {
-                "size": 1280,
-            }),
-            ("heap", {
-                "size": 262144,
-            }),
-            ("workspace", {
-                "size": 20480,
-            }),
-            ("stack", {
-                "size": 80,
-            }),
-        ]),
-        "word_size": 4,
-        "thumb_mode": True,
-        "comms_method": "openocd",
-        "server_addr": server_addr,
-        "server_port": server_port,
+    if section_constraints is None:
+        section_constraints = DEFAULT_SECTION_CONSTRAINTS
+    return {
+        'device_id': DEVICE_ID,
+        'toolchain_prefix': TOOLCHAIN_PREFIX,
+        'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE, section_constraints),
+        'word_size': WORD_SIZE,
+        'thumb_mode': False,
+        'use_device_timer': False,
+        'comms_method': 'openocd',
+        'server_addr': server_addr,
+        'server_port': server_port,
     }
-    # generate section start addresses from the given `base_addr`
-    curr_offset = 0
-    mem_layout = res["mem_layout"]
-    for region_dict in mem_layout.values():
-        region_dict["start"] = base_addr + curr_offset
-        curr_offset += region_dict["size"]
-    return res
 
 
 register_device(DEVICE_ID, {
-    "create_micro_lib": create_micro_lib,
-    "default_config": default_config,
+    'create_micro_lib': create_micro_lib,
+    'generate_config': generate_config,
 })
diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py
index 4a73e572f924..42ced795f6ae 100644
--- a/python/tvm/relay/_parser.py
+++ b/python/tvm/relay/_parser.py
@@ -151,7 +151,9 @@ def __call__(self, args, attrs, type_args):
     "nn.dropout": op.nn.dropout_raw,
     "zeros": op.zeros,
     "split": op.split,
-    "cast": op.cast
+    "cast": op.cast,
+    "clip": op.clip,
+    "right_shift": op.right_shift,
 }
 
 TYPE_PREFIXES = [
@@ -340,7 +342,10 @@ def visitLocalVar(self, ctx):
         return local_var
 
     def visitGraphVar(self, ctx):
-        return self.graph_expr[int(ctx.NAT().getText())]
+        graph_var_idx = int(ctx.NAT().getText())
+        if graph_var_idx >= len(self.graph_expr):
+            raise ParseError(f"graph var `%{graph_var_idx}` is unbound")
+        return self.graph_expr[graph_var_idx]
 
     def visit_list(self, ctx_list) -> List[Any]:
         """"Visit a list of contexts."""
@@ -625,6 +630,12 @@ def visitCallWithAttr(self, ctx: RelayParser.CallWithAttrContext):
 
     def call(self, func, args, attrs, type_args):
         if isinstance(func, OpWrapper):
+            #if hasattr(func.operator, '__name__') and func.operator.__name__ == 'clip':
+            #    # TODO(wbelrlo) this big fucking hack yes
+            #    import copy
+            #    args = copy.deepcopy(args)
+            #    args[1] = float(args[1].data.asnumpy())
+            #    args[2] = float(args[2].data.asnumpy())
             return func(args, attrs, type_args)
         if isinstance(func, adt.Constructor):
             return func(*args)
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 30c5971e32b9..459a4a588324 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -239,16 +239,26 @@ def build(mod, target=None, target_host=None, params=None):
         raise ValueError("target host must be the type of str, " +
                          "tvm.target.Target, or None")
 
-    # If current dispatch context is fallback context (the default root context),
-    # then load pre-tuned parameters from TopHub
+    # # If current dispatch context is fallback context (the default root context),
+    # # then load pre-tuned parameters from TopHub
+    # if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
+    #     tophub_context = autotvm.tophub.context(list(target.values()))
+    # else:
+    #     tophub_context = autotvm.util.EmptyContext()
+
+    # with tophub_context:
+    #     bld_mod = BuildModule()
+    #     graph_json, mod, params = bld_mod.build(func, target, target_host, params)
+
     if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
         tophub_context = autotvm.tophub.context(list(target.values()))
+        with tophub_context:
+            bld_mod = BuildModule()
+            graph_json, mod, params = bld_mod.build(func, target, target_host, params)
     else:
-        tophub_context = autotvm.util.EmptyContext()
-
-    with tophub_context:
         bld_mod = BuildModule()
-        graph_json, mod, params = bld_mod.build(mod, target, target_host, params)
+        graph_json, mod, params = bld_mod.build(func, target, target_host, params)
+
     return graph_json, mod, params
 
 
diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index 43065bef838a..d455f2229439 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -85,7 +85,7 @@ def _convert_activation(inexpr, keras_layer, _):
         return _op.sigmoid(inexpr)
     if act_type == 'tanh':
         return _op.tanh(inexpr)
-    if act_type == 'relu':
+    if act_type in ('relu', 'swish'):
         return _op.nn.relu(inexpr)
     if act_type == 'softplus':
         return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32')))
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index ed57e0d4276d..f5a16941083b 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -70,6 +70,11 @@ def context(self, dev_type, dev_id=0):
         ctx: TVMContext
             The corresponding encoded remote context.
         """
+        print('[RPCSession.context]')
+        print(f'  dev_type: {dev_type}')
+        print(f'  dev_id: {dev_id}')
+        if '-device=micro_dev' in dev_type:
+            dev_type = 'micro_dev'
         ctx = nd.context(dev_type, dev_id)
         encode = (self._tbl_index + 1) * base.RPC_SESS_MASK
         ctx.device_type += encode
@@ -403,9 +408,14 @@ def connect(url, port, key="", session_timeout=0):
         The connected session.
     """
     try:
+        print('[client.connect]')
+        #session_timeout = 0
+        print('  hardcoding timeout to 0 (always keep alive)!')
         if session_timeout:
             key += " -timeout=%s" % str(session_timeout)
+        print(f'  connecting to RPC server with {url}, {port}, {key}')
         sess = base._Connect(url, port, key)
+        print(f'  finished connecting!')
     except NameError:
         raise RuntimeError("Please compile with USE_RPC=1")
     return RPCSession(sess)
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 627d67a0a835..74296cc2ceaa 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -64,7 +64,9 @@ def get_workpath(path):
     def load_module(file_name):
         """Load module from remote side."""
         path = temp.relpath(file_name)
+        print('[rpc.server] ABOUT TO LOAD MOD')
         m = _load_module(path)
+        print('[rpc.server] DONE LOADING MOD')
         logger.info("load_module %s", path)
         return m
 
@@ -325,7 +327,10 @@ def __init__(self,
                  key="",
                  load_library=None,
                  custom_addr=None,
-                 silent=False):
+                 silent=False,
+                 utvm_dev_id=None,
+                 utvm_dev_config_args=None,
+                 ):
         try:
             if base._ServerLoop is None:
                 raise RuntimeError("Please compile with USE_RPC=1")
@@ -355,6 +360,10 @@ def __init__(self,
                 cmd += ["--custom-addr", custom_addr]
             if silent:
                 cmd += ["--silent"]
+            if utvm_dev_id is not None:
+                assert utvm_dev_config_args is not None
+                cmd += [f"--utvm-dev-id={utvm_dev_id}"]
+                cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"]
 
             # prexec_fn is not thread safe and may result in deadlock.
             # python 3.2 introduced the start_new_session parameter as
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index e3346b162aaf..001c76c76fef 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -297,8 +297,11 @@ def _event_handler(_, events):
     def _on_event(self, _):
         while True:
             try:
+                print('waiting for connection!')
                 conn, addr = self._sock.accept()
+                print(f'got new conn: {conn}, {addr}')
                 TCPEventHandler(self, conn, addr)
+                print(f'made event handler')
             except socket.error as err:
                 if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
                     break
diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
index 7845a26bfca2..45dd79e50019 100644
--- a/python/tvm/runtime/module.py
+++ b/python/tvm/runtime/module.py
@@ -109,7 +109,6 @@ def __call__(self, *args):
         # pylint: disable=not-callable
         return self.entry_func(*args)
 
-
     def __repr__(self):
         return "Module(%s, %x)" % (self.type_key, self.handle.value)
 
@@ -212,6 +211,7 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
 
             def evaluator(*args):
                 """Internal wrapped evaluator."""
+                print('[Module.time_evaluator.evaluator]')
                 # Wrap feval so we can add more stats in future.
                 blob = feval(*args)
                 fmt = "@" + ("d" * repeat)
diff --git a/src/ir/error.cc b/src/ir/error.cc
index 9d498288d2ba..e700691f3e07 100644
--- a/src/ir/error.cc
+++ b/src/ir/error.cc
@@ -62,16 +62,23 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) {
 
     CHECK(has_errs != this->node_to_error_.end());
 
-    const auto& error_indicies = has_errs->second;
+    const auto& error_indices = has_errs->second;
 
     std::stringstream err_msg;
 
-    err_msg << rang::fg::red;
-    err_msg << " ";
-    for (auto index : error_indicies) {
-      err_msg << this->errors_[index].what() << "; ";
+    if (error_indices.size() != 0) {
+      err_msg << rang::fg::red;
+      err_msg << " ";
+      // TODO should fix this reverse problem further upstream (in the error reporter).
+      //
+      // the errors are in reverse order, so print them with a reversed iteration
+      err_msg << this->errors_[error_indices[error_indices.size()-1]].what();
+      for (int i = error_indices.size() - 2; i >= 0; i--) {
+        size_t err_idx = error_indices[i];
+        err_msg << "; " << this->errors_[err_idx].what();
+      }
+      err_msg << rang::fg::reset;
     }
-    err_msg << rang::fg::reset;
 
     // Setup error map.
     auto it = error_maps.find(global);
diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
index 300deb8079a0..5861c0326dbf 100644
--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2019 by Contributors
  * \file utvm_init.s
  * \brief uTVM init definition for STM32F746XX-series boards
  */
diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
index 1b8376150fce..a5a12cf86ebd 100644
--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2019 by Contributors
  * \file utvm_timer.c
  * \brief uTVM timer API definitions for STM32F746XX-series boards
  */
@@ -51,34 +52,31 @@ extern "C" {
 #define SYST_CALIB_NOREF  31
 #define SYST_CALIB_SKEW   30
 
-uint32_t start_time = 0;
-uint32_t stop_time = 0;
+volatile uint32_t start_time = 0;
+volatile uint32_t stop_time = 0;
 
 int32_t UTVMTimerStart() {
+  SYST_CSR = 0;
+  // maximum reload value (24-bit)
+  SYST_RVR = (~((uint32_t) 0)) >> 8;
+  SYST_CVR = 0;
+
   SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE);
   // wait until timer starts
   while (SYST_CVR == 0) {}
   start_time = SYST_CVR;
-  return 0;
+  return UTVM_ERR_OK;
 }
 
-void UTVMTimerStop() {
-  SYST_CSR = 0;
+uint32_t UTVMTimerStop(int32_t *err) {
+  SYST_CSR &= ~((uint32_t) 1);
   stop_time = SYST_CVR;
-}
-
-void UTVMTimerReset() {
-  SYST_CSR = 0;
-  // maximum reload value (24-bit)
-  SYST_RVR = (~((uint32_t) 0)) >> 8;
-  SYST_CVR = 0;
-}
-
-uint32_t UTVMTimerRead() {
-  if (SYST_CSR & SYST_COUNTFLAG) {
+  if (SYST_CSR & (1 << SYST_COUNTFLAG)) {
     TVMAPISetLastError("timer overflowed");
-    return -1;
+    *err = UTVM_ERR_TIMER_OVERFLOW;
+    return 0;
   } else {
+    *err = UTVM_ERR_OK;
     return start_time - stop_time;
   }
 }
@@ -91,33 +89,33 @@ uint32_t UTVMTimerRead() {
 #define DWT_CTRL_NOCYCCNT   25
 #define DWT_CTRL_CYCCNTENA  0
 
-uint32_t start_time = 0;
-uint32_t stop_time = 0;
+volatile uint32_t start_time = 0;
+volatile uint32_t stop_time = 0;
 
-void UTVMTimerReset() {
+int32_t UTVMTimerStart() {
+  DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
   DWT_CYCCNT = 0;
-}
 
-int32_t UTVMTimerStart() {
-  if (DWT_CTRL & DWT_CTRL_NOCYCCNT) {
+  if (DWT_CTRL & (1 << DWT_CTRL_NOCYCCNT)) {
     TVMAPISetLastError("cycle counter not implemented on device");
-    return -1;
+    return UTVM_ERR_TIMER_NOT_IMPLEMENTED;
   }
   start_time = DWT_CYCCNT;
   DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA);
+  return UTVM_ERR_OK;
 }
 
-void UTVMTimerStop() {
+uint32_t UTVMTimerStop(int32_t* err) {
   stop_time = DWT_CYCCNT;
   DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
-}
-
-int32_t UTVMTimerRead() {
-  if (stop_time > stop_time) {
+  // even with this check, we can't know for sure if the timer has overflowed
+  // (it may have overflowed and gone past `start_time`).
+  if (stop_time > start_time) {
+    *err = UTVM_ERR_OK;
     return stop_time - start_time;
   } else {
-    uint32_t largest = ~0;
-    return (largest - start_time) + stop_time;
+    *err = UTVM_ERR_TIMER_OVERFLOW;
+    return 0;
   }
 }
 
diff --git a/src/runtime/micro/device/host/utvm_init.c b/src/runtime/micro/device/host/utvm_init.c
index 4fb43c11d20e..02888f04c48e 100644
--- a/src/runtime/micro/device/host/utvm_init.c
+++ b/src/runtime/micro/device/host/utvm_init.c
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2019 by Contributors
  * \file utvm_init.c
  * \brief uTVM init definition for the host emulated device
  */
diff --git a/src/runtime/micro/device/host/utvm_timer.c b/src/runtime/micro/device/host/utvm_timer.c
index 56a36ebae86d..7b24aab473d1 100644
--- a/src/runtime/micro/device/host/utvm_timer.c
+++ b/src/runtime/micro/device/host/utvm_timer.c
@@ -20,28 +20,34 @@
 /*!
  * \file utvm_timer.c
  * \brief uTVM timer API stubs for the host emulated device
+ *  Copyright (c) 2019 by Contributors
  */
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "utvm_runtime.h"
+#include <stdint.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/c_backend_api.h>
 
 // TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c
 
 int32_t UTVMTimerStart() {
+  return UTVM_ERR_OK;
+}
+
+uint32_t UTVMTimerStop(int32_t* err) {
+  *err = UTVM_ERR_OK;
   return 0;
 }
 
-void UTVMTimerStop() { }
+extern void UTVMInit();
 
-void UTVMTimerReset() { }
+extern void UTVMTimerReset();
 
-uint32_t UTVMTimerRead() {
-  return 1;
-}
+extern int32_t UTVMTimerStart();
+
+extern void UTVMTimerStop();
+
+extern uint32_t UTVMTimerRead();
+
+void UTVMMain();
 
-#ifdef __cplusplus
-}  // TVM_EXTERN_C
-#endif
+void UTVMDone();
diff --git a/src/runtime/micro/device/riscv_spike/utvm_init.s b/src/runtime/micro/device/riscv_spike/utvm_init.s
new file mode 100644
index 000000000000..a73641249165
--- /dev/null
+++ b/src/runtime/micro/device/riscv_spike/utvm_init.s
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file utvm_init.s
+ * \brief uTVM init definition for Spike
+ */
+
+UTVMInit:
+  /* set stack pointer */
+  la sp, _utvm_stack_pointer_init
+  call UTVMMain
diff --git a/src/runtime/micro/device/riscv_spike/utvm_timer.c b/src/runtime/micro/device/riscv_spike/utvm_timer.c
new file mode 100644
index 000000000000..c4e0af2b230b
--- /dev/null
+++ b/src/runtime/micro/device/riscv_spike/utvm_timer.c
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file utvm_timer.c
+ * \brief uTVM timer API stubs for Spike
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "utvm_runtime.h"
+
+int32_t UTVMTimerStart() {
+  return UTVM_ERR_OK;
+}
+
+uint32_t UTVMTimerStop(int32_t* err) {
+  *err = UTVM_ERR_OK;
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // TVM_EXTERN_C
+#endif
diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
index a8c600ed347b..970eb27a1ef3 100644
--- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
+++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
@@ -32,10 +32,11 @@ extern "C" {
 #include <stdint.h>
 #include <stddef.h>
 
-void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) =
+// TODO compiler errors say volatile qualifier is discarded. should we just get rid of em?
+volatile void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) =
     (void *(*)(int, int, uint64_t, int, int)) NULL;
-int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL;
-void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL;
+volatile int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL;
+volatile void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL;
 
 void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
     int dtype_code_hint, int dtype_bits_hint) {
@@ -51,6 +52,39 @@ void TVMAPISetLastError(const char* msg) {
   (*TVMAPISetLastError_)(msg);
 }
 
+void *memset(void *s, int c, size_t n) {
+  char *p = (char*) s;
+  while (n > 0) {
+    *p = (char) c;
+    p++;
+    n--;
+  }
+  return s;
+}
+
+void *memmove(void *to, const void *from, size_t n) {
+  // TODO will need to factor memmove calls into workspace size calculation
+  char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8);
+  if (temp == NULL) {
+    return NULL;
+  }
+
+  const char *from_pp = (char*) from;
+  for (size_t i = 0; i < n; i++) {
+    temp[i] = from_pp[i];
+  }
+  char *to_pp = (char*) to;
+  for (size_t i = 0; i < n; i++) {
+    to_pp[i] = temp[i];
+  }
+
+  if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) {
+    return NULL;
+  }
+
+  return to;
+}
+
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif
diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h
index c364ecf40792..b39309a784f5 100644
--- a/src/runtime/micro/host_driven/utvm_runtime.h
+++ b/src/runtime/micro/host_driven/utvm_runtime.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2019 by Contributors
  * \file utvm_runtime.h
  * \brief uTVM runtime headers
  */
@@ -32,6 +33,22 @@ extern "C" {
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/c_backend_api.h>
 
+/*!
+ * \brief TODO
+ */
+enum UTVMReturnCode {
+  UTVM_ERR_OK = 0,
+  UTVM_ERR_NOT_FINISHED = -1,
+  UTVM_ERR_TIMER_NOT_IMPLEMENTED = -2,
+  UTVM_ERR_TIMER_OVERFLOW = -3,
+  UTVM_ERR_WS_DOUBLE_FREE = -4,
+  UTVM_ERR_WS_OUT_OF_SPACE = -5,
+  UTVM_ERR_WS_TOO_MANY_ALLOCS = -6,
+  UTVM_ERR_WS_ZERO_SIZE_ALLOC = -7,
+  UTVM_ERR_WS_UNALIGNED_START = -8,
+  UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE = -9,
+};
+
 /*!
  * \brief Task structure for uTVM
  */
@@ -46,20 +63,38 @@ typedef struct {
   int32_t num_args;
 } UTVMTask;
 
+/*!
+ * \brief TODO
+ */
 extern void UTVMInit();
 
-extern void UTVMTimerReset();
-
+/*!
+ * \brief TODO
+ */
 extern int32_t UTVMTimerStart();
 
-extern void UTVMTimerStop();
-
-extern uint32_t UTVMTimerRead();
+/*!
+ * \brief TODO
+ */
+extern uint32_t UTVMTimerStop(int32_t* err);
 
+/*!
+ * \brief TODO
+ */
 void UTVMMain();
 
+/*!
+ * \brief TODO
+ */
 void UTVMDone();
 
+// GCC -O3 begins to inject memset and memmove calls, so we provide impls in
+// the runtime for this case and for general usage.
+
+void *memset(void *s, int c, size_t n);
+
+void *memmove(void *to, const void *from, size_t n);
+
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif
diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h
index 4a0189b3e89e..e6696f56f06c 100644
--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -60,20 +60,21 @@ union TargetVal {
   uint64_t val64;
 };
 
+// TODO just get rid of `DevPtr`.
 /*! \brief absolute device address */
-class DevPtr {
+class TargetPtr {
  public:
   /*! \brief construct a device address with value `value` */
-  explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {}
+  explicit TargetPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {}
 
   /*! \brief default constructor */
-  DevPtr() : value_(TargetVal { .val64 = 0 }) {}
+  TargetPtr() : value_(TargetVal { .val64 = 0 }) {}
 
   /*! \brief construct a null address */
-  explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {}
+  explicit TargetPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {}
 
   /*! \brief destructor */
-  ~DevPtr() {}
+  ~TargetPtr() {}
 
   /*!
    * \brief get value of pointer
@@ -95,23 +96,23 @@ class DevPtr {
   bool operator!=(std::nullptr_t) const { return value_.val64 != 0; }
 
   /*! \brief add an integer to this absolute address to get a larger absolute address */
-  DevPtr operator+(size_t n) const {
-    return DevPtr(value_.val64 + n);
+  TargetPtr operator+(size_t n) const {
+    return TargetPtr(value_.val64 + n);
   }
 
   /*! \brief mutably add an integer to this absolute address */
-  DevPtr& operator+=(size_t n) {
+  TargetPtr& operator+=(size_t n) {
     value_.val64 += n;
     return *this;
   }
 
   /*! \brief subtract an integer from this absolute address to get a smaller absolute address */
-  DevPtr operator-(size_t n) const {
-    return DevPtr(value_.val64 - n);
+  TargetPtr operator-(size_t n) const {
+    return TargetPtr(value_.val64 - n);
   }
 
   /*! \brief mutably subtract an integer from this absolute address */
-  DevPtr& operator-=(size_t n) {
+  TargetPtr& operator-=(size_t n) {
     value_.val64 -= n;
     return *this;
   }
@@ -152,7 +153,7 @@ class SymbolMap {
     stream >> name;
     stream >> std::hex >> addr;
     while (stream) {
-      map_[name] = DevPtr(addr);
+      map_[name] = TargetPtr(addr);
       stream >> name;
       stream >> std::hex >> addr;
     }
@@ -163,7 +164,7 @@ class SymbolMap {
    * \param name name of the symbol
    * \return on-device offset of the symbol
    */
-  DevPtr operator[](const std::string& name) const {
+  TargetPtr operator[](const std::string& name) const {
     auto result = map_.find(name);
     CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map";
     return result->second;
@@ -175,13 +176,13 @@ class SymbolMap {
 
  private:
   /*! \brief backing map */
-  std::unordered_map<std::string, DevPtr> map_;
+  std::unordered_map<std::string, TargetPtr> map_;
 };
 
 /*! \brief struct containing start and size of a device memory region */
 struct DevMemRegion {
   /*! \brief section start offset */
-  DevPtr start;
+  TargetPtr start;
   /*! \brief size of section */
   size_t size;
 };
@@ -240,11 +241,11 @@ const char* SectionToString(SectionKind section);
 std::string RelocateBinarySections(
     const std::string& binary_path,
     size_t word_size,
-    DevPtr text_start,
-    DevPtr rodata_start,
-    DevPtr data_start,
-    DevPtr bss_start,
-    DevPtr stack_end,
+    TargetPtr text_start,
+    TargetPtr rodata_start,
+    TargetPtr data_start,
+    TargetPtr bss_start,
+    TargetPtr stack_end,
     const std::string& toolchain_prefix);
 
 /*!
diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc
index 3d0a6889c4f7..fbaef4af5b07 100644
--- a/src/runtime/micro/micro_device_api.cc
+++ b/src/runtime/micro/micro_device_api.cc
@@ -74,8 +74,10 @@ class MicroDeviceAPI final : public DeviceAPI {
                       TVMContext ctx_to,
                       DLDataType type_hint,
                       TVMStreamHandle stream) final {
+    std::cout << "[MicroDeviceAPI::CopyDataFromTo]" << std::endl;
     std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type);
     if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) {
+      std::cout << "  device to device" << std::endl;
       // Copying from the device to the device.
 
       MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
@@ -87,6 +89,8 @@ class MicroDeviceAPI final : public DeviceAPI {
       CHECK(ctx_from.device_id == ctx_to.device_id)
         << "can only copy between the same micro device";
       ObjectPtr<MicroSession>& session = from_space->session;
+      // flush all pending tasks to ensure data is consistent
+      session->FlushTaskQueue();
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
       DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
@@ -96,20 +100,27 @@ class MicroDeviceAPI final : public DeviceAPI {
       lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size);
       lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size);
     } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) {
+      std::cout << "  reading from device" << std::endl;
+      std::cout << "    num_bytes: " << size << std::endl;
       // Reading from the device.
 
       MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
       ObjectPtr<MicroSession>& session = from_space->session;
+      // flush all pending tasks to ensure data is consistent
+      session->FlushTaskQueue();
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
       DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
-      void* to_host_ptr = GetHostLoc(to, to_offset);
       lld->Read(from_dev_addr, to_host_ptr, size);
     } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) {
+      std::cout << "  writing to device" << std::endl;
+      std::cout << "    num_bytes: " << size << std::endl;
       // Writing to the device.
 
       MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
       ObjectPtr<MicroSession>& session = to_space->session;
+      // flush all pending tasks to ensure data is consistent
+      session->FlushTaskQueue();
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
       void* from_host_ptr = GetHostLoc(from, from_offset);
@@ -121,9 +132,12 @@ class MicroDeviceAPI final : public DeviceAPI {
   }
 
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
+    std::cout << "[MicroDeviceAPI::StreamSync]" << std::endl;
+    MicroSession::Current()->FlushTaskQueue();
   }
 
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final {
+    CHECK(false) << "the on-device workspace allocator isn't aware of this function";
     ObjectPtr<MicroSession>& session = MicroSession::Current();
 
     void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to<void*>();
@@ -135,6 +149,7 @@ class MicroDeviceAPI final : public DeviceAPI {
   }
 
   void FreeWorkspace(TVMContext ctx, void* data) final {
+    CHECK(false) << "the on-device workspace allocator isn't aware of this function";
     MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
     ObjectPtr<MicroSession>& session = dev_space->session;
     session->FreeInSection(SectionKind::kWorkspace,
diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc
index 50cee34be4a6..b27bd923c638 100644
--- a/src/runtime/micro/micro_module.cc
+++ b/src/runtime/micro/micro_module.cc
@@ -54,6 +54,8 @@ class MicroModuleNode final : public ModuleNode {
    * \param binary_path path of the binary to be loaded
    */
   void InitMicroModule(const std::string& binary_path) {
+    std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl;
+    std::cout << "  start" << std::endl;
     session_ = MicroSession::Current();
     symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map;
   }
@@ -73,7 +75,7 @@ class MicroWrappedFunc {
   }
 
   void operator()(TVMArgs args, TVMRetValue* rv) const {
-    *rv = session_->PushToExecQueue(func_ptr_, args);
+    *rv = session_->PushToTaskQueue(func_ptr_, args);
   }
 
  private:
diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h
index 5c75f92737ab..4e8f7201ad7a 100644
--- a/src/runtime/micro/micro_section_allocator.h
+++ b/src/runtime/micro/micro_section_allocator.h
@@ -38,8 +38,9 @@ class MicroSectionAllocator {
    * \brief constructor that specifies section boundaries
    * \param region location and size of the section on the device
    */
-  explicit MicroSectionAllocator(DevMemRegion region, size_t word_size)
-    : start_addr_(region.start),
+  explicit MicroSectionAllocator(std::string section_name, DevMemRegion region, size_t word_size)
+    : section_name_(section_name),
+      start_addr_(region.start),
       size_(0),
       capacity_(region.size),
       word_size_(word_size) {
@@ -56,7 +57,7 @@ class MicroSectionAllocator {
 
   /*!
    * \brief memory allocator
-   * \param size size of allocated memory in bytes
+   * \param alloc_size size of allocated memory in bytes
    * \return pointer to allocated memory region in section, nullptr if out of space
    */
   DevPtr Allocate(size_t size) {
@@ -110,6 +111,8 @@ class MicroSectionAllocator {
   size_t capacity() const { return capacity_; }
 
  private:
+  /*! \brief name of the section (for debugging) */
+  std::string section_name_;
   /*! \brief start address of the section */
   DevPtr start_addr_;
   /*! \brief current size of the section */
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 4bdc8ed69797..e6cdea5d1c88 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -23,6 +23,7 @@
 
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
+#include <chrono>
 #include <memory>
 #include <stack>
 #include <tuple>
@@ -79,11 +80,14 @@ MicroSession::MicroSession(
     size_t stack_size,
     size_t word_size,
     bool thumb_mode,
+    bool use_device_timer,
     const std::string& server_addr,
     int port)
-    : toolchain_prefix_(toolchain_prefix)
-    , word_size_(word_size)
-    , thumb_mode_(thumb_mode) {
+    : toolchain_prefix_(toolchain_prefix),
+      word_size_(word_size),
+      thumb_mode_(thumb_mode),
+      use_device_timer_(use_device_timer),
+      batch_args_encoder_(args_size, word_size) {
   CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_;
   if (comms_method == "host") {
     // TODO(weberlo): move checks to python
@@ -105,84 +109,129 @@ MicroSession::MicroSession(
       << "base address not aligned to " << word_size_ << " bytes";
     DevPtr curr_addr = DevPtr(reinterpret_cast<std::uintptr_t>(base_addr));
 
-    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
+      "text",
+      DevMemRegion {
       .start = curr_addr,
       .size = text_size,
-    }, word_size_);
+      }, word_size_);
     curr_addr += text_size;
-    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = rodata_size,
-    }, word_size_);
+    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
+      "rodata",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = rodata_size,
+      }, word_size_);
     curr_addr += rodata_size;
-    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = data_size,
-    }, word_size_);
+    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
+      "data",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = data_size,
+      }, word_size_);
     curr_addr += data_size;
-    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = bss_size,
-    }, word_size_);
+    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
+      "bss",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = bss_size,
+      }, word_size_);
     curr_addr += bss_size;
-    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = args_size,
-    }, word_size_);
+    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
+      "args",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = args_size,
+      }, word_size_);
     curr_addr += args_size;
-    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = heap_size,
-    }, word_size_);
+    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
+      "heap",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = heap_size,
+      }, word_size_);
     curr_addr += heap_size;
-    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = workspace_size,
-    }, word_size_);
+    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
+      "workspace",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = workspace_size,
+      }, word_size_);
     curr_addr += workspace_size;
-    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_addr,
-      .size = stack_size,
-    }, word_size_);
+    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
+      "stack",
+      DevMemRegion {
+        .start = curr_addr,
+        .size = stack_size,
+      }, word_size_);
     curr_addr += stack_size;
   } else if (comms_method == "openocd") {
     low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port);
-    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(text_start),
-      .size = text_size,
-    }, word_size_);
-    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(rodata_start),
-      .size = rodata_size,
-    }, word_size_);
-    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(data_start),
-      .size = data_size,
-    }, word_size_);
-    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(bss_start),
-      .size = bss_size,
-    }, word_size_);
-    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(args_start),
-      .size = args_size,
-    }, word_size_);
-    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(heap_start),
-      .size = heap_size,
-    }, word_size_);
-    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(workspace_start),
-      .size = workspace_size,
-    }, word_size_);
-    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = DevPtr(stack_start),
-      .size = stack_size,
-    }, word_size_);
+    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
+      "text",
+      DevMemRegion {
+        .start = DevPtr(text_start),
+        .size = text_size,
+      }, word_size_);
+    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
+      "rodata",
+      DevMemRegion {
+        .start = DevPtr(rodata_start),
+        .size = rodata_size,
+      }, word_size_);
+    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
+      "data",
+      DevMemRegion {
+        .start = DevPtr(data_start),
+        .size = data_size,
+      }, word_size_);
+    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
+      "bss",
+      DevMemRegion {
+        .start = DevPtr(bss_start),
+        .size = bss_size,
+      }, word_size_);
+    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
+      "args",
+      DevMemRegion {
+        .start = DevPtr(args_start),
+        .size = args_size,
+      }, word_size_);
+    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
+      "heap",
+      DevMemRegion {
+        .start = DevPtr(heap_start),
+        .size = heap_size,
+      }, word_size_);
+    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
+      "workspace",
+      DevMemRegion {
+        .start = DevPtr(workspace_start),
+        .size = workspace_size,
+      }, word_size_);
+    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
+      "stack",
+      DevMemRegion {
+        .start = DevPtr(stack_start),
+        .size = stack_size,
+      }, word_size_);
   } else {
     LOG(FATAL) << "unsupported micro low-level device";
   }
 
+  std::cout << "[Memory Layout]" << std::endl;
+  std::cout << "  text (size = " << (section_allocators_[0]->capacity() / 1000.0) << " KB): " << section_allocators_[0]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  rodata (size = " << (section_allocators_[1]->capacity() / 1000.0) << " KB): " << section_allocators_[1]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  data (size = " << (section_allocators_[2]->capacity() / 1000.0) << " KB): " << section_allocators_[2]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  bss (size = " << (section_allocators_[3]->capacity() / 1000.0) << " KB): " << section_allocators_[3]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  args (size = " << (section_allocators_[4]->capacity() / 1000.0) << " KB): " << section_allocators_[4]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  heap (size = " << (section_allocators_[5]->capacity() / 1000.0) << " KB): " << section_allocators_[5]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  workspace (size = " << (section_allocators_[6]->capacity() / 1000.0) << " KB): " << section_allocators_[6]->start_addr().cast_to<void*>() << std::endl;
+  std::cout << "  stack (size = " << (section_allocators_[7]->capacity() / 1000.0) << " KB): " << section_allocators_[7]->start_addr().cast_to<void*>() << std::endl;
+
+  DevPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr();
+  batch_args_encoder_.set_start_addr(args_start_addr);
+
   runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map;
 
   // Patch pointers to define the bounds of the workspace section and the word
@@ -209,54 +258,80 @@ MicroSession::~MicroSession() {
   low_level_device_ = nullptr;
 }
 
-double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) {
+void MicroSession::PushToTaskQueue(DevPtr func_ptr, const TVMArgs& args) {
+  std::cout << "[MicroSession::PushToTaskQueue]" << std::endl;
+  std::cout << "  pushed func ptr: " << func_ptr.cast_to<void*>() << std::endl;
   if (thumb_mode_) {
-    func_ptr += 1;
+    func_ptr |= 1;
   }
+  DevVal func_dev_addr = func_ptr.value();
+
+  std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
+  DevVal arg_values_dev_addr = { .val64 = std::get<0>(arg_field_addrs).value() };
+  DevVal arg_type_codes_dev_addr = { .val64 = std::get<1>(arg_field_addrs).value() };
+
+  task_queue_.push_back(
+      DevTask {
+        .func = func_dev_addr,
+        .arg_values = arg_values_dev_addr,
+        .arg_type_codes = arg_type_codes_dev_addr,
+        .num_args = args.num_args
+      });
+
+  if (task_queue_.size() == MicroSession::kTaskQueueCapacity) {
+    FlushTaskQueue();
+  }
+}
 
-  // Create an allocator stream for the memory region after the most recent
-  // allocation in the args section.
-  DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr();
-  TargetDataLayoutEncoder encoder(args_addr, word_size_);
-
-  std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&encoder, args);
-
-  // Flush `stream` to device memory.
-  DevPtr stream_dev_addr =
-      GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size());
-  low_level_device()->Write(stream_dev_addr,
-                            reinterpret_cast<void*>(encoder.data()),
-                            encoder.buf_size());
-
-  TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value();
-  TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value();
+void MicroSession::FlushTaskQueue() {
+  if (task_queue_.size() == 0) {
+    // nothing to run
+    return;
+  }
   if (word_size_ == 4) {
-    UTVMTask32 task = {
-      .func = func_ptr.value().val32,
-      .arg_values = arg_values_dev_addr.val32,
-      .arg_type_codes = arg_type_codes_dev_addr.val32,
-      .num_args = args.num_args,
-    };
-    // Write the task.
-    DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
+    FlushTaskQueuePriv<UTVMTask32>();
   } else if (word_size_ == 8) {
-    UTVMTask64 task = {
-      .func = func_ptr.value().val64,
-      .arg_values = arg_values_dev_addr.val64,
-      .arg_type_codes = arg_type_codes_dev_addr.val64,
-      .num_args = args.num_args,
-    };
-    // Write the task.
-    DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
+    FlushTaskQueuePriv<UTVMTask64>();
+  }
+}
+
+template <typename T>
+void MicroSession::FlushTaskQueuePriv() {
+  std::cout << "[MicroSession::FlushTaskQueue]" << std::endl;
+  std::vector<T> prepped_tasks;
+  for (const auto& task : task_queue_) {
+    prepped_tasks.push_back(T(task));
   }
 
+  // Flush `args` to device memory.
+  low_level_device()->Write(
+      batch_args_encoder_.start_addr(),
+      reinterpret_cast<void*>(batch_args_encoder_.data()),
+      batch_args_encoder_.buf_size());
+
+  // Flush `tasks` to device memory.
+  DevPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
+  low_level_device()->Write(
+      dev_tasks_addr,
+      reinterpret_cast<void*>(prepped_tasks.data()),
+      prepped_tasks.size() * sizeof(T));
+  DevSymbolWrite<uint32_t>(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size());
+
   DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
   DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
   if (thumb_mode_) {
-    utvm_init_addr += 1;
+    utvm_init_addr |= 1;
   }
 
+  std::chrono::time_point<
+    std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend;
+  tbegin = std::chrono::high_resolution_clock::now();
+  // std::cout << "  do execution things: ";
+  // char tmp;
+  // std::cin >> tmp;
   low_level_device()->Execute(utvm_init_addr, utvm_done_addr);
+  tend = std::chrono::high_resolution_clock::now();
+
   // Check if there was an error during execution.  If so, log it.
   CheckDeviceError();
   uint32_t task_time = DevSymbolRead<uint32_t>(runtime_symbol_map_, "utvm_task_time");
@@ -314,6 +389,88 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
     PatchImplHole(symbol_map, "TVMAPISetLastError");
   }
 
+  if (use_device_timer_) {
+    uint64_t sum = 0;
+    std::vector<uint32_t> times;
+    times.resize(task_queue_.size());
+    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t));
+    for (uint32_t time : times) {
+      sum += time;
+    }
+    last_batch_time_ += static_cast<double>(sum);
+  } else {
+    last_batch_time_ += std::chrono::duration_cast<std::chrono::duration<double> >
+        (tend - tbegin).count() * 1000;
+    // TODO fukn hack
+    uint64_t sum = 0;
+    std::vector<uint32_t> times;
+    times.resize(task_queue_.size());
+    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t));
+    for (uint32_t time : times) {
+      sum += time;
+    }
+    last_batch_cycles_ += static_cast<double>(sum);
+  }
+
+  batch_args_encoder_.Clear();
+  task_queue_.clear();
+}
+
+BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
+  std::cout << "[MicroSession::LoadBinary]" << std::endl;
+  DevMemRegion text_section;
+  DevMemRegion rodata_section;
+  DevMemRegion data_section;
+  DevMemRegion bss_section;
+
+  text_section.size = GetSectionSize(
+      binary_path, SectionKind::kText, toolchain_prefix_, word_size_);
+  rodata_section.size = GetSectionSize(
+      binary_path, SectionKind::kRodata, toolchain_prefix_, word_size_);
+  data_section.size = GetSectionSize(
+      binary_path, SectionKind::kData, toolchain_prefix_, word_size_);
+  bss_section.size = GetSectionSize(
+      binary_path, SectionKind::kBss, toolchain_prefix_, word_size_);
+  std::cout << "  text_section.size: " << text_section.size << std::endl;
+  std::cout << "  rodata_section.size: " << rodata_section.size << std::endl;
+  std::cout << "  data_section.size: " << data_section.size << std::endl;
+  std::cout << "  bss_section.size: " << bss_section.size << std::endl;
+
+  text_section.start = AllocateInSection(SectionKind::kText, text_section.size);
+  rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
+  data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
+  bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
+  CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
+        data_section.start != nullptr && bss_section.start != nullptr)
+      << "not enough space to load module on device";
+
+  std::string relocated_bin = RelocateBinarySections(
+      binary_path,
+      word_size_,
+      text_section.start,
+      rodata_section.start,
+      data_section.start,
+      bss_section.start,
+      GetAllocator(SectionKind::kStack)->max_addr(),
+      toolchain_prefix_);
+  std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_);
+  std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_);
+  std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_);
+  std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_);
+
+  low_level_device_->Write(text_section.start, &text_contents[0], text_section.size);
+  low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
+  low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
+  low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
+  SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
+
+  if (patch_dylib_pointers) {
+    // Patch device lib pointers.
+    PatchImplHole(symbol_map, "TVMBackendAllocWorkspace");
+    PatchImplHole(symbol_map, "TVMBackendFreeWorkspace");
+    PatchImplHole(symbol_map, "TVMAPISetLastError");
+  }
+
   return BinaryInfo {
       .text_section = text_section,
       .rodata_section = rodata_section,
@@ -325,6 +482,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
 
 std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
     TargetDataLayoutEncoder* encoder, const TVMArgs& args) {
+  std::cout << "[MicroSession::EncoderAppend(TVMArgs)]" << std::endl;
   const int* type_codes = args.type_codes;
   int num_args = args.num_args;
 
@@ -404,21 +562,56 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
   return tvm_arr_slot.start_addr();
 }
 
+// TODO(weberlo): switch over entirely to error codes that expand to error
+// messages on the host side.
 void MicroSession::CheckDeviceError() {
-  int32_t return_code = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_return_code");
-
-  if (return_code) {
-    std::uintptr_t last_error =
-        DevSymbolRead<std::uintptr_t>(runtime_symbol_map_, "utvm_last_error");
-    std::string last_error_str;
-    if (last_error) {
-      DevPtr last_err_addr = DevPtr(last_error);
-      last_error_str = ReadString(last_err_addr);
+  int32_t last_error = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_last_error");
+
+  if (last_error) {
+    if (!use_device_timer_ &&
+        (last_error == UTVM_ERR_TIMER_OVERFLOW ||
+         last_error == UTVM_ERR_TIMER_NOT_IMPLEMENTED)) {
+      // these errors don't matter if we're not using the on-device timer
+      return;
+    }
+    std::string err_msg;
+    switch(last_error) {
+      case UTVM_ERR_NOT_FINISHED:
+        err_msg = "execution timed out";
+        break;
+      case UTVM_ERR_TIMER_NOT_IMPLEMENTED:
+        err_msg = "timer is not implemented for the target device";
+        break;
+      case UTVM_ERR_TIMER_OVERFLOW:
+        // TODO this should be remedied by using interrupts to accumulate the
+        // timer into a larger datatype (ARM timers are only 24 bits)
+        err_msg = "timer overflowed during execution";
+        break;
+      case UTVM_ERR_WS_DOUBLE_FREE:
+        err_msg = "free called with no active workspace allocations";
+        break;
+      case UTVM_ERR_WS_OUT_OF_SPACE:
+        err_msg = "ran out of space in workspace section";
+        break;
+      case UTVM_ERR_WS_TOO_MANY_ALLOCS:
+        err_msg = "exceeded number of allocs the runtime can keep track of";
+        break;
+      case UTVM_ERR_WS_ZERO_SIZE_ALLOC:
+        err_msg = "attempt to allocate scratchpad of size zero";
+        break;
+      case UTVM_ERR_WS_UNALIGNED_START:
+        err_msg = "start of workspace section is not word-aligned";
+        break;
+      case UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE:
+        err_msg = "scratchpad allocation size is not a multiple of the word size";
+        break;
+      default:
+        err_msg = "unknown error code";
+        break;
     }
     LOG(FATAL) << "error during micro function execution:\n"
-               << "  return code: " << std::dec << return_code << "\n"
-               << "  dev str addr: 0x" << std::hex << last_error << "\n"
-               << "  dev str data: " << last_error_str << std::endl;
+               << "  error ID: " << std::dec << last_error << std::endl
+               << "  error message: " << err_msg;
   }
 }
 
@@ -455,10 +648,22 @@ std::string MicroSession::ReadString(DevPtr str_addr) {
 }
 
 DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
-  return GetAllocator(type)->Allocate(size);
+  if (type == SectionKind::kHeap) {
+    std::cout << "[MicroSession::AllocateInSection(Heap)]" << std::endl;
+    std::cout << "  allocating " << std::dec << size << " hex=" << (void*) size << " bytes" << std::endl;
+  }
+  DevPtr result = GetAllocator(type)->Allocate(size);
+  if (type == SectionKind::kHeap) {
+    std::cout << "  allocated at addr " << result.cast_to<void*>() << std::endl;
+  }
+  return result;
 }
 
 void MicroSession::FreeInSection(SectionKind type, DevPtr addr) {
+  if (type == SectionKind::kHeap) {
+    std::cout << "[MicroSession::FreeInSection]" << std::endl;
+    std::cout << "  freeing alloc at addr " << addr.cast_to<void*>() << std::endl;
+  }
   return GetAllocator(type)->Free(addr);
 }
 
@@ -489,6 +694,16 @@ PackedFunc MicroSession::GetFunction(
     return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) {
       MicroSession::ExitWithScope();
     });
+    // TODO add a `clear_batch_timer` func
+  } else if (name == "get_last_batch_time") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->GetLastBatchTime();
+    });
+    // TODO remove this func
+  } else if (name == "get_last_batch_cycles") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->GetLastBatchCycles();
+    });
   } else {
     return PackedFunc();
   }
@@ -518,8 +733,9 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
     size_t stack_size = args[18];
     size_t word_size = args[19];
     bool thumb_mode = args[20];
-    const std::string& server_addr = args[21];
-    int port = args[22];
+    bool use_device_timer = args[21];
+    const std::string& server_addr = args[22];
+    int port = args[23];
     ObjectPtr<MicroSession> session = make_object<MicroSession>(
         comms_method,
         binary_path,
@@ -542,6 +758,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
         stack_size,
         word_size,
         thumb_mode,
+        use_device_timer,
         server_addr,
         port);
     *rv = Module(session);
diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h
index 9e844e8b2140..00e5bcdd7e01 100644
--- a/src/runtime/micro/micro_session.h
+++ b/src/runtime/micro/micro_session.h
@@ -52,6 +52,8 @@
 namespace tvm {
 namespace runtime {
 
+struct DevTask;
+
 /*!
  * \brief session for facilitating micro device interaction
  */
@@ -66,6 +68,9 @@ class MicroSession : public ModuleNode {
   virtual PackedFunc GetFunction(const std::string& name,
                                  const ObjectPtr<Object>& sptr_to_self);
 
+  // todo having this decoupled from the value in utvm_runtime.c gives me stress dreams
+  static const size_t kTaskQueueCapacity = 20;
+
   /*!
    * \return The type key of the executor.
    */
@@ -121,6 +126,7 @@ class MicroSession : public ModuleNode {
       size_t stack_size,
       size_t word_size,
       bool thumb_mode,
+      bool use_device_timer,
       const std::string& server_addr,
       int port);
 
@@ -137,7 +143,19 @@ class MicroSession : public ModuleNode {
    * \param args args to the packed function
    * \return elapsed time during function execution on the device
    */
-  double PushToExecQueue(DevPtr func, const TVMArgs& args);
+  double PushToTaskQueue(DevPtr func, const TVMArgs& args);
+
+  /*!
+   * \brief serialize runtime metadata to the device for enqueued tasks and execute
+   * \return elapsed time during function execution on the device
+   */
+  void FlushTaskQueue();
+
+  /*!
+   * \brief TODO
+   */
+  template <typename T>
+  void FlushTaskQueuePriv();
 
   /*!
    * \brief loads binary onto device
@@ -196,6 +214,18 @@ class MicroSession : public ModuleNode {
     return low_level_device_;
   }
 
+  const double GetLastBatchTime() {
+    double result = last_batch_time_;
+    last_batch_time_ = 0.0;
+    return result;
+  }
+
+  const double GetLastBatchCycles() {
+    double result = last_batch_cycles_;
+    last_batch_cycles_ = 0.0;
+    return result;
+  }
+
  private:
   /*! \brief low-level device pointer */
   std::shared_ptr<LowLevelDevice> low_level_device_;
@@ -204,6 +234,8 @@ class MicroSession : public ModuleNode {
   /*! \brief array of memory allocators for each on-device section */
   std::shared_ptr<MicroSectionAllocator>
       section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
+  /*! \brief total number of bytes of usable device memory for this session */
+  size_t memory_size_;
   /*! \brief number of bytes in a word on the target device */
   size_t word_size_;
   /*! \brief whether the target device requires a thumb-mode bit on function addresses
@@ -213,8 +245,20 @@ class MicroSession : public ModuleNode {
    * results in more compact binaries.
    */
   bool thumb_mode_;
+  /*! \brief TODO */
+  bool use_device_timer_;
   /*! \brief symbol map for the device runtime */
   SymbolMap runtime_symbol_map_;
+  /*! \brief TODO */
+  std::vector<DevTask> task_queue_;
+  // TODO(weberlo): we don't even need an allocator mechanism for the args
+  // section. there's only ever one allocation.
+  /*! \brief TODO fukn hack */
+  TargetDataLayoutEncoder batch_args_encoder_;
+  /*! \brief TODO fukn hack */
+  double last_batch_time_;
+  /*! \brief TODO fukn hack */
+  double last_batch_cycles_;
 
   /*!
    * \brief patches a function pointer in this module to an implementation
@@ -237,7 +281,7 @@ class MicroSession : public ModuleNode {
    * \return device address of the allocated `DLTensor`
    */
   template <typename T>
-  DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr);
+  TargetPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr);
 
   /*!
    * \brief checks and logs if there was an error during the device's most recent execution
@@ -302,7 +346,11 @@ struct TVMArray32 {
       byte_offset(byte_offset.val32),
       pad2(0) { }
 
-  /*! \brief opaque pointer to the allocated data */
+  /*!
+   * \brief The opaque data pointer points to the allocated data.
+   *  This will be CUDA device pointer or cl_mem handle in OpenCL.
+   *  This pointer is always aligns to 256 bytes as in CUDA.
+   */
   uint32_t data;
   /*! \brief The device context of the tensor */
   DLContext ctx;
@@ -345,8 +393,11 @@ struct TVMArray64 {
       shape(shape.val64),
       strides(strides.val64),
       byte_offset(byte_offset.val64) { }
-
-  /*! \brief opaque pointer to the allocated data */
+  /*!
+   * \brief The opaque data pointer points to the allocated data.
+   *  This will be CUDA device pointer or cl_mem handle in OpenCL.
+   *  This pointer is always aligns to 256 bytes as in CUDA.
+   */
   uint64_t data;
   /*! \brief The device context of the tensor */
   DLContext ctx;
@@ -367,8 +418,26 @@ struct TVMArray64 {
   uint64_t byte_offset;
 };
 
+/*! \brief MicroTVM task to store in task queue before specializing to word size */
+struct DevTask {
+  /*! \brief Pointer to function to call for this task */
+  DevVal func;
+  /*! \brief Array of argument values */
+  DevVal arg_values;
+  /*! \brief Array of type codes for each argument value */
+  DevVal arg_type_codes;
+  /*! \brief Number of arguments */
+  int32_t num_args;
+};
+
 /*! \brief MicroTVM task for serialization to 32-bit devices */
 typedef struct StructUTVMTask32 {
+  StructUTVMTask32(DevTask task)
+    : func(task.func.val32),
+      arg_values(task.arg_values.val32),
+      arg_type_codes(task.arg_type_codes.val32),
+      num_args(task.num_args) { }
+
   /*! \brief Pointer to function to call for this task */
   uint32_t func;
   /*! \brief Array of argument values */
@@ -377,10 +446,16 @@ typedef struct StructUTVMTask32 {
   uint32_t arg_type_codes;
   /*! \brief Number of arguments */
   int32_t num_args;
-} UTVMTask32;
+} StructUTVMTask32;
 
 /*! \brief MicroTVM task for serialization to 64-bit devices */
 typedef struct StructUTVMTask64 {
+  StructUTVMTask64(DevTask task)
+    : func(task.func.val64),
+      arg_values(task.arg_values.val64),
+      arg_type_codes(task.arg_type_codes.val64),
+      num_args(task.num_args) { }
+
   /*! \brief Pointer to function to call for this task */
   uint64_t func;
   /*! \brief Array of argument values */
@@ -389,7 +464,7 @@ typedef struct StructUTVMTask64 {
   uint64_t arg_type_codes;
   /*! \brief Number of arguments */
   int32_t num_args;
-} UTVMTask64;
+} StructUTVMTask64;
 
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h
index e0275165e774..0abd269eedd4 100644
--- a/src/runtime/micro/target_data_layout_encoder.h
+++ b/src/runtime/micro/target_data_layout_encoder.h
@@ -96,9 +96,8 @@ class TargetDataLayoutEncoder {
    * \brief constructor
    * \param start_addr start address of the encoder in device memory
    */
-  explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size)
-      : buf_(std::vector<uint8_t>()), curr_offset_(0), word_size_(word_size) {
-    start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_));
+  explicit TargetDataLayoutEncoder(size_t capacity, size_t word_size)
+      : buf_(std::vector<uint8_t>()), curr_offset_(0), capacity_(capacity), word_size_(word_size) {
   }
 
   /*!
@@ -113,9 +112,15 @@ class TargetDataLayoutEncoder {
     if (curr_offset_ + size > buf_.size()) {
       buf_.resize(curr_offset_ + size);
     }
+    CHECK(buf_.size() < capacity_) << "out of space in data encoder";
     size_t slot_start_offset = curr_offset_;
     curr_offset_ += size;
-    return Slot<T>(this, slot_start_offset, size, start_addr_ + slot_start_offset);
+    return Slot<T>(this, slot_start_offset, size, start_addr() + slot_start_offset);
+  }
+
+  void Clear() {
+    buf_.clear();
+    curr_offset_ = 0;
   }
 
   /*!
@@ -130,10 +135,23 @@ class TargetDataLayoutEncoder {
    * \brief returns current size of the encoder's buffer
    * \return buffer size
    */
-  size_t buf_size() {
+  size_t buf_size() const {
     return buf_.size();
   }
 
+  /*!
+   * \brief TODO
+   */
+  DevPtr start_addr() const {
+    CHECK_NE(start_addr_.value().val64, 0) << "start addr uninitialized";
+    return start_addr_;
+  }
+
+  void set_start_addr(DevPtr start_addr) {
+    CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty";
+    start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_));
+  }
+
  private:
   /*! \brief in-memory backing buffer */
   std::vector<uint8_t> buf_;
@@ -141,6 +159,8 @@ class TargetDataLayoutEncoder {
   size_t curr_offset_;
   /*! \brief start address of the encoder in device memory */
   DevPtr start_addr_;
+  /*! \brief TODO */
+  size_t capacity_;
   /*! \brief number of bytes in a word on the target device */
   size_t word_size_;
 };
@@ -158,6 +178,7 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
 
 template <typename T>
 TargetDataLayoutEncoder::Slot<T>::~Slot() {
+  // TODO this can mask the exception thrown by slot allocation... even though that doesn't make sense.
   CHECK(curr_offset_ == size_) << "unwritten space in slot";
 }
 
diff --git a/src/runtime/micro/tcl_socket.cc b/src/runtime/micro/tcl_socket.cc
index 64dfbf218388..d0bb6007f815 100644
--- a/src/runtime/micro/tcl_socket.cc
+++ b/src/runtime/micro/tcl_socket.cc
@@ -42,9 +42,11 @@ void TclSocket::Connect(tvm::support::SockAddr addr) {
 }
 
 void TclSocket::SendCommand() {
-  const char terminate_token = kCommandTerminateToken;
-  cmd_builder_ << terminate_token;
+  //std::cout << "[TclSocket::SendCommand]" << std::endl;
+  //std::cout << "  cmd: " << cmd_builder_.str() << std::endl;
+  cmd_builder_ << kCommandTerminateToken;
   std::string full_cmd = cmd_builder_.str();
+
   CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1)
     << "failed to send command";
   cmd_builder_.str(std::string());
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 43ca630f9496..7556c4915b9c 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -36,12 +36,43 @@
 #include <algorithm>
 #include "rpc_session.h"
 #include "../object_internal.h"
-#include "../../support/ring_buffer.h"
-#include "../../support/socket.h"
+#include "../../common/ring_buffer.h"
+#include "../../common/socket.h"
+#include "../micro/micro_session.h"
 
 namespace tvm {
 namespace runtime {
 
+std::string RPCCodeToString(RPCCode code) {
+  switch (code) {
+    case RPCCode::kNone: return "None";
+    case RPCCode::kCallFunc: return "CallFunc";
+    case RPCCode::kReturn: return "Return";
+    case RPCCode::kException: return "Exception";
+    case RPCCode::kShutdown: return "Shutdown";
+    case RPCCode::kCopyFromRemote: return "CopyFromRemote";
+    case RPCCode::kCopyToRemote: return "CopyToRemote";
+    case RPCCode::kCopyAck: return "CopyAck";
+    case RPCCode::kSystemFuncStart: return "SystemFuncStart";
+    case RPCCode::kGetGlobalFunc: return "GetGlobalFunc";
+    case RPCCode::kGetTimeEvaluator: return "GetTimeEvaluator";
+    case RPCCode::kFreeFunc: return "FreeFunc";
+    case RPCCode::kDevSetDevice: return "DevSetDevice";
+    case RPCCode::kDevGetAttr: return "DevGetAttr";
+    case RPCCode::kDevAllocData: return "DevAllocData";
+    case RPCCode::kDevFreeData: return "DevFreeData";
+    case RPCCode::kDevStreamSync: return "DevStreamSync";
+    case RPCCode::kCopyAmongRemote: return "CopyAmongRemote";
+    case RPCCode::kModuleLoad: return "ModuleLoad";
+    case RPCCode::kModuleImport: return "ModuleImport";
+    case RPCCode::kModuleFree: return "ModuleFree";
+    case RPCCode::kModuleGetFunc: return "ModuleGetFunc";
+    case RPCCode::kModuleGetSource: return "ModuleGetSource";
+    case RPCCode::kNDArrayFree: return "NDArrayFree";
+    default: CHECK(false) << "invalid RPC code";
+  }
+}
+
 // Temp buffer for data array
 struct RPCByteArrayBuffer {
   TVMByteArray arr;
@@ -898,6 +929,12 @@ void RPCSession::Init() {
       &reader_, &writer_, table_index_, name_, &remote_key_);
   // Quick function to call remote.
   call_remote_ = PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
+      std::cout << "[RPCSession::call_remote_]" << std::endl;
+      if (args.type_codes[0] == kTVMContext) {
+        const TVMContext ctx = args[0];
+        std::cout << "  ctx.device_type: " << ctx.device_type << std::endl;
+        std::cout << "  ctx.device_id: " << ctx.device_id << std::endl;
+      }
       handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true);
       RPCCode code = HandleUntilReturnEvent(rv, true, nullptr);
       CHECK(code == RPCCode::kReturn) << "code=" << static_cast<int>(code);
@@ -1080,7 +1117,10 @@ void RPCDevSetDevice(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
+  std::cout << "[RPCDevGetAttr]" << std::endl;
   TVMContext ctx = args[0];
+  std::cout <<  "  ctx.device_type: " << ctx.device_type << std::endl;
+  std::cout <<  "  ctx.device_id: " << ctx.device_id << std::endl;
   DeviceAttrKind kind = static_cast<DeviceAttrKind>(args[1].operator int());
   if (kind == kExist) {
     DeviceAPI* api = DeviceAPI::Get(ctx, true);
@@ -1096,7 +1136,10 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) {
+  std::cout <<  "[RPCDevAllocData]" << std::endl;
   TVMContext ctx = args[0];
+  std::cout <<  "  ctx.device_type: " << ctx.device_type << std::endl;
+  std::cout <<  "  ctx.device_id: " << ctx.device_id << std::endl;
   uint64_t nbytes = args[1];
   uint64_t alignment = args[2];
   DLDataType type_hint = args[3];
@@ -1118,13 +1161,18 @@ void RPCDevStreamSync(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
+  std::cout << "[RPCCopyAmongRemote]" << std::endl;
   void* from = args[0];
   uint64_t from_offset = args[1];
   void* to = args[2];
   uint64_t to_offset = args[3];
   uint64_t size = args[4];
   TVMContext ctx_from = args[5];
+  std::cout << "  ctx_from.device_type: " << ctx_from.device_type << std::endl;
+  std::cout << "  ctx_from.device_id: " << ctx_from.device_type << std::endl;
   TVMContext ctx_to = args[6];
+  std::cout << "  ctx_to.device_type: " << ctx_to.device_type << std::endl;
+  std::cout << "  ctx_to.device_id: " << ctx_to.device_type << std::endl;
   DLDataType type_hint = args[7];
   TVMStreamHandle stream = args[8];
   TVMContext ctx = ctx_from;
@@ -1135,10 +1183,12 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
           ctx_to.device_type == ctx_from.device_type)
         << "Can not copy across different ctx types directly";
   }
+  std::cout << "  before CopyDataFromTo" << std::endl;
   DeviceAPI::Get(ctx)->CopyDataFromTo(
       from, from_offset,
       to, to_offset,
       size, ctx_from, ctx_to, type_hint, stream);
+  std::cout << "  after CopyDataFromTo" << std::endl;
 }
 
 void RPCModuleLoad(TVMArgs args, TVMRetValue *rv) {
@@ -1200,6 +1250,7 @@ void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCSession::EventHandler::HandlePackedCall() {
+  std::cout << "[RPCSession::EventHandler::HandlePackedCall]" << std::endl;
   CHECK_EQ(pending_request_bytes_, 0U);
   if (code_ == RPCCode::kReturn) {
     state_ = kReturnReceived; return;
@@ -1208,6 +1259,7 @@ void RPCSession::EventHandler::HandlePackedCall() {
   state_ = kRecvCode;
   this->RequestBytes(sizeof(RPCCode));
   // Event handler sit at clean state at this point.
+  std::cout << "  RPC code is " << static_cast<int>(code_) << "(" << RPCCodeToString(code_) << ")" << std::endl;
   switch (code_) {
     case RPCCode::kCallFunc: {
       PackedFunc* pf = reinterpret_cast<PackedFunc*>(call_handle_);
@@ -1222,6 +1274,7 @@ void RPCSession::EventHandler::HandlePackedCall() {
       std::ostringstream os;
       os << "Except caught from RPC call: " << arg_buf_->value[0].v_str;
       arg_buf_.reset();
+      std::cout << os.str() << std::endl;
       throw dmlc::Error(os.str());
       break;
     }
@@ -1250,21 +1303,24 @@ PackedFunc MicroTimeEvaluator(
     PackedFunc pf,
     TVMContext ctx,
     int number,
-    int repeat) {
-  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
+    int repeat,
+    int min_repeat_ms) {
+  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
-    // skip first time call, to activate lazy compilation components.
-    pf.CallPacked(args, &temp);
-    DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+
     for (int i = 0; i < repeat; ++i) {
-      double speed = 0.0;
-      for (int j = 0; j < number; ++j) {
+      // start timing
+      CHECK(number < MicroSession::kTaskQueueCapacity)
+        << "`number` must be less than uTVM task queue capacity";
+      for (int i = 0; i < number; ++i) {
         pf.CallPacked(args, &temp);
-        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-        speed += (temp.operator double()) / number;
       }
-      os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
+      ObjectPtr<MicroSession> session = MicroSession::Current();
+      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+      double time_per_batch = session->GetLastBatchTime() / number;
+      std::cout << "LAST AVERAGE BATCH TIME WAS " << time_per_batch << std::endl;
+      os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
     }
     std::string blob = os.str();
     TVMByteArray arr;
@@ -1281,9 +1337,12 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf,
                              int number,
                              int repeat,
                              int min_repeat_ms) {
+  std::cout << "[WrapTimeEvaluator]" << std::endl;
   if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
-    return MicroTimeEvaluator(pf, ctx, number, repeat);
+    std::cout << "  USING MICRO TIME EVAL" << std::endl;
+    return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms);
   }
+  std::cout << "  USING NORMAL TIME EVAL" << std::endl;
 
   auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index db63be4be74d..1bb75357e030 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -63,31 +63,31 @@ struct RPCArgBuffer;
 
 /*! \brief The RPC code */
 enum class RPCCode : int {
-  kNone,
-  kCallFunc,
-  kReturn,
-  kException,
-  kShutdown,
-  kCopyFromRemote,
-  kCopyToRemote,
-  kCopyAck,
+  kNone = 0,
+  kCallFunc = 1,
+  kReturn = 2,
+  kException = 3,
+  kShutdown = 4,
+  kCopyFromRemote = 5,
+  kCopyToRemote = 6,
+  kCopyAck = 7,
   // The following are code that can send over CallRemote
-  kSystemFuncStart,
-  kGetGlobalFunc,
-  kGetTimeEvaluator,
-  kFreeFunc,
-  kDevSetDevice,
-  kDevGetAttr,
-  kDevAllocData,
-  kDevFreeData,
-  kDevStreamSync,
-  kCopyAmongRemote,
-  kModuleLoad,
-  kModuleImport,
-  kModuleFree,
-  kModuleGetFunc,
-  kModuleGetSource,
-  kNDArrayFree
+  kSystemFuncStart = 8,
+  kGetGlobalFunc = 9,
+  kGetTimeEvaluator = 10,
+  kFreeFunc = 11,
+  kDevSetDevice = 12,
+  kDevGetAttr = 13,
+  kDevAllocData = 14,
+  kDevFreeData = 15,
+  kDevStreamSync = 16,
+  kCopyAmongRemote = 17,
+  kModuleLoad = 18,
+  kModuleImport = 19,
+  kModuleFree = 20,
+  kModuleGetFunc = 21,
+  kModuleGetSource = 22,
+  kNDArrayFree = 23
 };
 
 /*!
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 6461908f1c4a..0c4404f515c0 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -840,6 +840,12 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) {
     const VarNode* v = op->node.as<VarNode>();
     CHECK(v);
     volatile_buf_.insert(v);
+  } else if (op->attr_key == ir::attr::pragma_import_c) {
+    const StringImm* value = op->value.as<StringImm>();
+    CHECK(value != nullptr);
+    decl_stream << value->value;
+    //this->HandleImport(value->value);
+    //this->VisitStmt(op->body);
   }
   this->PrintStmt(op->body);
 }
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index cbdec6201742..c356ac816957 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -29,6 +29,7 @@
 namespace tvm {
 namespace codegen {
 
+// TODO rename to CodeGenCMicro?
 CodeGenCHost::CodeGenCHost() {
   module_name_ = GetUniqueName("__tvm_module_ctx");
 }
@@ -254,7 +255,7 @@ runtime::Module BuildCHost(IRModule mod) {
 
 TVM_REGISTER_GLOBAL("target.build.c")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildCHost(args[0]);
-  });
+   *rv = BuildCHost(args[0]);
+ });
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index 4f9a0a74511f..f854b7abf68c 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -36,6 +36,8 @@ class CodeGenCHost final : public CodeGenC {
  public:
   CodeGenCHost();
   void Init(bool output_ssa, bool emit_asserts);
+  void AddFunction(LoweredFunc f);
+  std::string Finish();
 
   void PrintType(DataType t, std::ostream& os) final; // NOLINT(*)
   void PrintFuncPrefix() final; // NOLINT(*)
diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py
index 28fdb11c3de4..1ab324e50228 100644
--- a/tests/python/unittest/test_runtime_micro.py
+++ b/tests/python/unittest/test_runtime_micro.py
@@ -25,8 +25,25 @@
 from tvm.micro import create_micro_mod
 from tvm.relay.testing import resnet
 
-# Use the host emulated micro device.
-DEV_CONFIG = micro.device.host.default_config()
+# # Use the host emulated micro device.
+# DEV_CONFIG_A = micro.device.host.generate_config()
+# DEV_CONFIG_B = micro.device.host.generate_config()
+# TARGET = 'c -device=micro_dev'
+
+# # TODO why do spike examples have memory that starts at 0x10000000, but you
+# # should set the base addr as 0x10010000? should somehow help the user to be
+# # aware of that.
+# # are there always 0x10000 bytes reserved at the beginning of the address space?
+# BASE_ADDR = 0x10010000
+
+# AVAILABLE_MEM = 0x200000
+# DEV_CONFIG_A = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6666)
+# DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667)
+# TARGET = 'c -device=micro_dev'
+
+DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666)
+DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667)
+TARGET = 'c -device=micro_dev'
 
 def relay_micro_build(func, dev_config, params=None):
     """Create a graph runtime module with a micro device context from a Relay function.
@@ -47,22 +64,41 @@ def relay_micro_build(func, dev_config, params=None):
     mod : tvm.runtime.Module
         graph runtime module for the target device
     """
-    with tvm.target.build_config(disable_vectorize=True):
-        graph, c_mod, params = relay.build(func, target="c", params=params)
-    micro_mod = create_micro_mod(c_mod, dev_config)
+    disable_vectorize = tvm.build_config(disable_vectorize=True)
+    disable_fusion = relay.build_config(disabled_pass={'FuseOps'})
+    with disable_vectorize, disable_fusion:
+        graph, c_mod, params = relay.build(func, target=TARGET, params=params)
+    print(c_mod.get_source())
+    micro_mod = micro.create_micro_mod(c_mod, dev_config)
     ctx = tvm.micro_dev(0)
     mod = graph_runtime.create(graph, micro_mod, ctx)
     mod.set_input(**params)
     return mod
 
 
+GDB_INIT_TEMPLATE = """
+layout asm
+target remote localhost:{gdb_port}
+set $pc = UTVMInit
+break UTVMDone
+"""
+
+def reset_gdbinit():
+    if 'server_port' not in DEV_CONFIG_A:
+        return
+    gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR']
+    with open(f'{gdb_init_dir}/.gdbinit', 'w') as f:
+        gdb_port = DEV_CONFIG_A['server_port'] - 3333
+        f.write(GDB_INIT_TEMPLATE.format(gdb_port=gdb_port))
+
+
 def test_alloc():
     """Test tensor allocation on the device."""
     if not tvm.runtime.enabled("micro_dev"):
         return
     shape = (1024,)
     dtype = "float32"
-    with micro.Session(DEV_CONFIG):
+    with micro.Session(DEV_CONFIG_A):
         ctx = tvm.micro_dev(0)
         np_tensor = np.random.uniform(size=shape).astype(dtype)
         micro_tensor = tvm.nd.array(np_tensor, ctx)
@@ -76,6 +112,8 @@ def test_add():
     shape = (1024,)
     dtype = "float32"
 
+    reset_gdbinit()
+
     # Construct TVM expression.
     tvm_shape = tvm.runtime.convert(shape)
     A = te.placeholder(tvm_shape, name="A", dtype=dtype)
@@ -86,14 +124,24 @@ def test_add():
     func_name = "fadd"
     c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)
 
-    with micro.Session(DEV_CONFIG):
-        micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
+    with micro.Session(DEV_CONFIG_A) as sess:
+        micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
         micro_func = micro_mod[func_name]
         ctx = tvm.micro_dev(0)
-        a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
+
+        a_np = np.random.uniform(size=shape).astype(dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b_np = np.random.uniform(size=shape).astype(dtype)
+        b = tvm.nd.array(b_np, ctx)
         c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         micro_func(a, b, c)
+
+        # ensure inputs weren't corrupted
+        tvm.testing.assert_allclose(
+                a.asnumpy(), a_np)
+        tvm.testing.assert_allclose(
+                b.asnumpy(), b_np)
+        # ensure output is correct
         tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + b.asnumpy())
 
@@ -105,6 +153,8 @@ def test_workspace_add():
     shape = (1024,)
     dtype = "float32"
 
+    reset_gdbinit()
+
     # Construct TVM expression.
     tvm_shape = tvm.runtime.convert(shape)
     A = te.placeholder(tvm_shape, name="A", dtype=dtype)
@@ -116,14 +166,19 @@ def test_workspace_add():
     func_name = "fadd_two_workspace"
     c_mod = tvm.build(s, [A, C], target="c", name=func_name)
 
-    with micro.Session(DEV_CONFIG):
-        micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
+    with micro.Session(DEV_CONFIG_A) as sess:
+        micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
         micro_func = micro_mod[func_name]
         ctx = tvm.micro_dev(0)
-        a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
+        a_np = np.random.uniform(size=shape).astype(dtype)
+        a = tvm.nd.array(a_np, ctx)
         c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         micro_func(a, c)
 
+        # ensure input wasn't corrupted
+        tvm.testing.assert_allclose(
+                a.asnumpy(), a_np)
+        # ensure output is correct
         tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + 2.0)
 
@@ -141,17 +196,68 @@ def test_graph_runtime():
     z = relay.add(xx, relay.const(1.0))
     func = relay.Function([x], z)
 
-    with micro.Session(DEV_CONFIG):
-        mod = relay_micro_build(func, DEV_CONFIG)
+    with micro.Session(DEV_CONFIG_A):
+        mod = relay_micro_build(func, DEV_CONFIG_A)
 
         x_in = np.random.uniform(size=shape[0]).astype(dtype)
         mod.run(x=x_in)
         result = mod.get_output(0).asnumpy()
 
+        tvm.testing.assert_allclose(
+                mod.get_input(0).asnumpy(), x_in)
         tvm.testing.assert_allclose(
                 result, x_in * x_in + 1.0)
 
 
+def test_conv2d():
+    if not tvm.module.enabled("micro_dev"):
+        return
+
+    from tvm.relay import create_executor
+    from tvm.relay import transform
+
+    dshape = (1, 4, 16, 16)
+    dtype = 'int8'
+    func_name = 'fused_nn_conv2d'
+
+    reset_gdbinit()
+
+    # Construct Relay program.
+    x = relay.var("x", shape=dshape, dtype=dtype)
+    conv_expr = relay.nn.conv2d(
+            x, relay.var("w"),
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            channels=4)
+    func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr)
+    mod = relay.Module.from_expr(func)
+    mod = transform.InferType()(mod)
+
+    x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape))
+    w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape))
+    out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape))
+
+    with tvm.build_config(disable_vectorize=True):
+        graph, c_mod, params = relay.build(mod, target="c")
+
+    with micro.Session(DEV_CONFIG_A):
+        micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
+        micro_func = micro_mod[func_name]
+        ctx = tvm.micro_dev(0)
+
+        x_data = tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx)
+        w_data = tvm.nd.array(np.random.uniform(size=w_shape).astype(dtype), ctx)
+        result = tvm.nd.array(np.zeros(shape=out_shape, dtype=dtype), ctx)
+        micro_func(x_data, w_data, result)
+
+        out_data = np.zeros(out_shape, dtype=dtype)
+        params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() }
+        intrp = create_executor('debug')
+        expected_result = intrp.evaluate(mod['main'])(x_data, w_data).data
+
+        tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy())
+
+
 def test_multiple_modules():
     """Test loading multiple modules on the device simultaneously."""
     if not tvm.runtime.enabled("micro_dev"):
@@ -168,9 +274,9 @@ def test_multiple_modules():
     ret = relay.subtract(x, relay.const(1.0))
     sub_const_func = relay.Function([x], ret)
 
-    with micro.Session(DEV_CONFIG):
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
-        sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG)
+    with micro.Session(DEV_CONFIG_A):
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
+        sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG_A)
 
         x_in = np.random.uniform(size=shape[0]).astype(dtype)
         add_const_mod.run(x=x_in)
@@ -196,8 +302,8 @@ def test_interleave_sessions():
     ret = relay.add(x, relay.const(1.0))
     add_const_func = relay.Function([x], ret)
 
-    sess_a = micro.Session(DEV_CONFIG)
-    sess_b = micro.Session(DEV_CONFIG)
+    sess_a = micro.Session(DEV_CONFIG_A)
+    sess_b = micro.Session(DEV_CONFIG_B)
     with sess_a:
         np_tensor_a = np.random.uniform(size=shape).astype(dtype)
         micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
@@ -205,13 +311,13 @@ def test_interleave_sessions():
         np_tensor_b = np.random.uniform(size=shape).astype(dtype)
         micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
     with sess_a:
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
         add_const_mod.run(x=micro_tensor_a)
         add_result = add_const_mod.get_output(0).asnumpy()
         tvm.testing.assert_allclose(
                 add_result, np_tensor_a + 1.0)
     with sess_b:
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_B)
         add_const_mod.run(x=micro_tensor_b)
         add_result = add_const_mod.get_output(0).asnumpy()
         tvm.testing.assert_allclose(
@@ -230,15 +336,15 @@ def test_nested_sessions():
     ret = relay.add(x, relay.const(1.0))
     add_const_func = relay.Function([x], ret)
 
-    sess_a = micro.Session(DEV_CONFIG)
-    sess_b = micro.Session(DEV_CONFIG)
+    sess_a = micro.Session(DEV_CONFIG_A)
+    sess_b = micro.Session(DEV_CONFIG_B)
     with sess_a:
         np_tensor_a = np.random.uniform(size=shape).astype(dtype)
         micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
         with sess_b:
             np_tensor_b = np.random.uniform(size=shape).astype(dtype)
             micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
         add_const_mod.run(x=micro_tensor_a)
         add_result = add_const_mod.get_output(0).asnumpy()
         tvm.testing.assert_allclose(
@@ -257,12 +363,12 @@ def test_inactive_session_use():
     ret = relay.add(x, relay.const(1.0))
     add_const_func = relay.Function([x], ret)
 
-    sess_a = micro.Session(DEV_CONFIG)
-    sess_b = micro.Session(DEV_CONFIG)
+    sess_a = micro.Session(DEV_CONFIG_A)
+    sess_b = micro.Session(DEV_CONFIG_B)
     with sess_a:
         np_tensor_a = np.random.uniform(size=shape).astype(dtype)
         micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
 
     with sess_b:
         # These objects belong to `sess_a`.
@@ -272,12 +378,42 @@ def test_inactive_session_use():
                 add_result, np_tensor_a + 1.0)
 
 
+# TODO add workspace alloc/free stress test
+
 if __name__ == "__main__":
     test_alloc()
+    print()
+    print('finished alloc test')
+    input('[press enter to continue]')
     test_add()
+    print()
+    print('finished add test')
+    input('[press enter to continue]')
     test_workspace_add()
+    print()
+    print('finished workspace add test')
+    input('[press enter to continue]')
     test_graph_runtime()
+    print()
+    print('finished graph runtime test')
+    input('[press enter to continue]')
+    test_conv2d()
+    print()
+    print('finished conv2d test')
+    input('[press enter to continue]')
     test_multiple_modules()
+    print()
+    print('finished multiple modules test')
+    input('[press enter to continue]')
     test_interleave_sessions()
+    print()
+    print('finished interleaved sessions test')
+    input('[press enter to continue]')
     test_nested_sessions()
+    print()
+    print('finished nested sessions test')
+    input('[press enter to continue]')
     test_inactive_session_use()
+    print()
+    print('finished use inactive session test')
+    input('[press enter to continue]')
diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
index 3bb9dc73e2db..9f9785e834d7 100644
--- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
@@ -74,7 +74,8 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation,
                            [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
-    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+    #cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll')
 
     # fallback support
     if cfg.is_fallback:
@@ -158,7 +159,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
                              axis_lens=[cfg['tile_oh'].size[-1],
                                         cfg['tile_ow'].size[-1],
                                         cfg['tile_co'].size[-1]],
-                             max_unroll=16,
+                             max_unroll=None,
                              cfg=cfg)
 
     # schedule fusion
@@ -173,18 +174,18 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
                                  axis_lens=[cfg['tile_oh'].size[-1],
                                             cfg['tile_ow'].size[-1],
                                             cfg['tile_co'].size[-1]],
-                                 max_unroll=16,
+                                 max_unroll=None,
                                  cfg=cfg)
     s[conv].compute_at(s[last], ow)
 
-    # mark parallel
-    s[last].parallel(co)
+    ## mark parallel
+    #s[last].parallel(co)
 
     if data_vec.op.name == 'data_vec_undilated':
         _, h, _, _, _, _, _, _ = s[data_vec].op.axis
     else:
         _, h, _, _, _, _ = s[data_vec].op.axis
-    s[data_vec].parallel(h)
+    #s[data_vec].parallel(h)
 
     if kernel_vec.op.name == 'kernel_vec':
         co, _, _, _, _ = s[kernel_vec].op.axis
@@ -193,10 +194,12 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
             # this part to make tuning records correct
             s[kernel_vec].pragma(co, 'debug_skip_region')
         else:
-            s[kernel_vec].parallel(co)
+            #s[kernel_vec].parallel(co)
+            pass
     elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose':  # for conv2d transpose
-        co, _, _, _, _ = s[kernel_vec].op.axis
-        s[kernel_vec].parallel(co)
+        #co, _, _, _, _ = s[kernel_vec].op.axis
+        #s[kernel_vec].parallel(co)
+        pass
 
     return s
 
diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py
index 966520088bc7..8cf3fd5f11bf 100644
--- a/topi/python/topi/arm_cpu/injective.py
+++ b/topi/python/topi/arm_cpu/injective.py
@@ -20,6 +20,7 @@
 from tvm import te
 from ..util import is_empty_shape
 
+@generic.schedule_injective_from_existing.register(["arm_cpu", "micro_dev"])
 def schedule_injective_from_existing(sch, out):
     """Schedule for injective op from existing schedule.
 
@@ -45,6 +46,7 @@ def schedule_injective_from_existing(sch, out):
         sch[out].parallel(sch[out].op.axis[0])
     return sch
 
+@generic.schedule_injective.register(["arm_cpu", "micro_dev"])
 def schedule_injective(outs):
     """ARM CPU schedule for injective op.
 
@@ -72,6 +74,7 @@ def schedule_injective(outs):
         schedule_injective_from_existing(s, x)
     return s
 
+@generic.schedule_concatenate.register(["arm_cpu", "micro_dev"])
 def schedule_concatenate(outs):
     """Schedule for concatenate op.
 
diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index d8713110056a..7c021785544c 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -35,10 +35,8 @@ def _conv2d_nhwc_python(a_np, w_np, stride, padding):
     stride : int or a list/tuple of two ints
         Stride size, or [stride_height, stride_width]
 
-    padding : int or str or a list/tuple of 2 or 4 ints
-        Padding size, or ['VALID', 'SAME'], or
-        [pad_height, pad_width] for 2 ints, or
-        [pad_top, pad_left, pad_bottom, pad_right] for 2 ints
+    padding : int or str or a list/tuple of two ints
+        Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width]
 
     Returns
     -------

From 91e132d56503b3f93a5343656274048e7e61e37c Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 7 Apr 2020 19:13:44 -0700
Subject: [PATCH 02/11] Cleanup and sync micro tvm prototype.

---
 3rdparty/dmlc-core                            |   2 +-
 Makefile                                      |   5 +-
 include/tvm/tir/stmt.h                        |   2 +
 python/tvm/autotvm/measure/measure_methods.py |  13 +-
 python/tvm/autotvm/task/relay_integration.py  |   1 +
 python/tvm/autotvm/task/space.py              |   6 +-
 python/tvm/autotvm/tuner/callback.py          |   2 +-
 python/tvm/autotvm/tuner/ga_tuner.py          |   8 +-
 python/tvm/autotvm/tuner/model_based_tuner.py |   1 -
 python/tvm/autotvm/tuner/tuner.py             |  12 +-
 python/tvm/contrib/binutil.py                 | 107 ++------
 python/tvm/exec/rpc_server.py                 |   7 +-
 python/tvm/micro/base.py                      |  47 ++--
 python/tvm/micro/device/__init__.py           |   3 +-
 python/tvm/micro/device/arm/stm32f746xx.py    |  21 +-
 python/tvm/micro/device/base.py               |  52 ++--
 python/tvm/micro/device/host.py               |  17 +-
 python/tvm/micro/device/riscv_spike.py        |   1 -
 python/tvm/relay/_parser.py                   |   6 -
 python/tvm/relay/build_module.py              |  22 +-
 python/tvm/relay/frontend/keras.py            |   2 +-
 python/tvm/relay/op/strategy/arm_cpu.py       |  36 ++-
 python/tvm/rpc/client.py                      |  10 -
 python/tvm/rpc/server.py                      |   2 -
 python/tvm/rpc/tracker.py                     |   3 -
 python/tvm/runtime/module.py                  |   1 -
 python/tvm/runtime/ndarray.py                 |   2 +-
 python/tvm/target/arm_isa.py                  |  34 +++
 src/driver/driver_api.cc                      |   2 +-
 src/ir/error.cc                               |   2 -
 .../micro/device/arm/stm32f746xx/utvm_init.s  |   6 -
 .../micro/device/arm/stm32f746xx/utvm_timer.c | 107 +++-----
 src/runtime/micro/device/host/utvm_init.c     |   1 -
 src/runtime/micro/device/host/utvm_timer.c    |  18 +-
 .../micro/device/riscv_spike/utvm_init.s      |   6 -
 .../micro/device/riscv_spike/utvm_timer.c     |   1 -
 .../host_driven/utvm_device_dylib_redirect.c  |   9 +-
 src/runtime/micro/host_driven/utvm_runtime.c  | 149 +++++++----
 src/runtime/micro/host_driven/utvm_runtime.h  |   1 -
 src/runtime/micro/host_low_level_device.cc    |   8 +-
 src/runtime/micro/low_level_device.h          |   6 +-
 src/runtime/micro/micro_common.cc             |  10 +-
 src/runtime/micro/micro_common.h              |  85 +++++--
 src/runtime/micro/micro_device_api.cc         |  30 +--
 src/runtime/micro/micro_module.cc             |  12 +-
 src/runtime/micro/micro_section_allocator.h   |  28 ++-
 src/runtime/micro/micro_session.cc            | 237 +++++++-----------
 src/runtime/micro/micro_session.h             |  47 ++--
 src/runtime/micro/openocd_low_level_device.cc |  12 +-
 .../micro/target_data_layout_encoder.h        |  29 ++-
 src/runtime/micro/tcl_socket.cc               |   5 +-
 src/runtime/rpc/rpc_session.cc                |  66 +----
 src/runtime/rpc/rpc_session.h                 |  48 ++--
 src/target/source/codegen_c.cc                |   6 +-
 src/target/source/codegen_c_host.cc           |  25 +-
 src/target/source/codegen_c_host.h            |   9 +-
 src/target/target.cc                          |   2 +-
 tests/python/unittest/test_runtime_micro.py   |  66 ++---
 topi/python/topi/arm_cpu/__init__.py          |   1 +
 topi/python/topi/arm_cpu/conv2d.py            |  13 +
 .../topi/arm_cpu/conv2d_spatial_pack.py       |  19 +-
 .../python/topi/arm_cpu/cortex_m7/__init__.py |  20 ++
 .../topi/arm_cpu/cortex_m7/conv2d/__init__.py |  19 ++
 .../topi/arm_cpu/cortex_m7/conv2d/direct.py   | 177 +++++++++++++
 .../arm_cpu/cortex_m7/conv2d/direct_simd.py   | 163 ++++++++++++
 .../cortex_m7/micro_kernel/__init__.py        |   0
 .../arm_cpu/cortex_m7/micro_kernel/gemm.py    | 221 ++++++++++++++++
 topi/python/topi/arm_cpu/injective.py         |   3 -
 topi/python/topi/generic/default.py           |   2 +-
 69 files changed, 1277 insertions(+), 819 deletions(-)
 create mode 100644 python/tvm/target/arm_isa.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/__init__.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
 create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 981b1c32f916..808f485387f9 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 981b1c32f91668e669ee376856f92f36cfd2a351
+Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28
diff --git a/Makefile b/Makefile
index 757b3300f7d5..64a127346e53 100644
--- a/Makefile
+++ b/Makefile
@@ -73,7 +73,10 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
 cpplint:
 	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
 	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
-	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \
+	# Note: exclude src/runtime/micro/host_driven becuase it contains C99 files.
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \
+	 --exclude_path=src/runtime/micro/host_driven \
+	 include src \
 	 examples/extension/src examples/graph_executor/src
 
 pylint:
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index aed8b5c77ae5..0d3cf42d8190 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope";
 constexpr const char* reduce_scope = "reduce_scope";
 /*! \brief Mark region is guarded by the pragma extension */
 constexpr const char* pragma_scope_prefix = "pragma_";
+/*! \brief Import C source or file into the final code gen module */
+constexpr const char* pragma_import_c = "pragma_import_c";
 /*! \brief Import llvm source or file into the final code gen module */
 constexpr const char* pragma_import_llvm = "pragma_import_llvm";
 /*! \brief Try to modify the AST to support Tensor Core */
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 00ecd2e98a06..185ed7d05019 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -245,11 +245,12 @@ def get_build_kwargs(self):
 
             if 'cuda' in self.task.target.keys:
                 kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+        if self.task.target.device_name == 'micro_dev':
+            kwargs.setdefault('build_option', {})['disable_vectorize'] = True
 
         return kwargs
 
     def run(self, measure_inputs, build_results):
-        print('[RPCRunner.run]')
         results = []
         remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
 
@@ -274,10 +275,9 @@ def run(self, measure_inputs, build_results):
                 if isinstance(res, Exception):   # executor error or timeout
                     results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
                                                  self.timeout, time.time()))
-                    #raise Exception(f'encountered exception during measurement: {results}')
-                else:
-                    print(f'  got a result: {res}')
-                    results.append(res)
+                    raise Exception(f'encountered exception during measurement: {results}')
+
+                results.append(res)
 
         return results
 
@@ -511,8 +511,7 @@ def run_through_rpc(measure_input, build_result,
             msg = msg[:msg.index("Stack trace returned")]
         if "CUDA Source" in msg:
             msg = msg[:msg.index("CUDA Source")]
-        #costs = (RuntimeError(msg[:1024]),)
-        costs = (RuntimeError(msg),)
+        costs = (RuntimeError(msg[:1024]),)
         errno = MeasureErrorNo.RUNTIME_DEVICE
     tstamp = time.time()
     time.sleep(cooldown_interval)
diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py
index de183db41e2c..f3edfb01dc07 100644
--- a/python/tvm/autotvm/task/relay_integration.py
+++ b/python/tvm/autotvm/task/relay_integration.py
@@ -48,6 +48,7 @@ def _lower(mod,
                 grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
                 grc.codegen(mod["main"])
                 return
+
     # default case
     # Try graph codegen first to extract autotvm tasks.
     # If failed to compile, then fallback to use VM compiler.
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index f8de1fbf32d5..fbf474fc4df7 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -544,10 +544,8 @@ def apply(self, sch, op, axes, axis_lens=None,
                 if ann == 'none':
                     pass
                 elif ann == 'unroll':
-                    #if max_unroll and axis_lens[i] > max_unroll:
-                    #    cfg.raise_error("Too large factor for unrolling")
-                    #if max_unroll and axis_lens[i] < max_unroll:
-                    #    cfg.raise_error("Too large factor for unrolling")
+                    if max_unroll and axis_lens[i] > max_unroll:
+                        cfg.raise_error("Too large factor for unrolling")
                     sch[op].unroll(axes[i])
                 elif ann == 'vec':
                     if vec_size and axis_lens[i] not in vec_size:
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index eede450eaeaf..f2e608088510 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -149,7 +149,7 @@ def _callback(tuner, inputs, results):
             if res.error_no == 0:
                 flops = inp.task.flop / np.mean(res.costs)
 
-        if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+        if logger.level > logging.DEBUG:  # only print progress bar in non-debug mode
             ctx.cur_flops = flops
             ctx.best_flops = tuner.best_flops
 
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index a4c36bcd385e..da10f73d5a53 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -50,7 +50,11 @@ def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
 
         # space info
         self.space = task.config_space
-        self.dims = [len(x) for x in self.space.space_map.values()]
+        self.dim_keys = []
+        self.dims = []
+        for k, v in self.space.space_map.items():
+            self.dim_keys.append(k)
+            self.dims.append(len(v))
 
         self.visited = set([])
 
@@ -123,7 +127,7 @@ def update(self, inputs, results):
                 if len(self.visited) < len(self.space):
                     while knob2point(tmp_gene, self.dims) in self.visited:
                         j = np.random.randint(len(self.dims))
-                        tmp_gene[j] = np.random.randint(self.dims[j])
+                        tmp_gene[j] = np.random.randint(self.dims[j])  # pylint: disable=invalid-sequence-index
                     next_genes.append(tmp_gene)
                     self.visited.add(knob2point(tmp_gene, self.dims))
                 else:
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index 56fe5b4f3f72..432f7070c349 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -263,7 +263,6 @@ def update(self, inputs, results):
         # if we have enough new training samples
         if len(self.xs) >= self.plan_size * (self.train_ct + 1) \
                 and self.flops_max > 1e-6:
-            import pdb; pdb.set_trace()
             self.cost_model.fit(self.xs, self.ys, self.plan_size)
             if self.diversity_filter_ratio:
                 candidate = self.model_optimizer.find_maximums(
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 52d745104e63..0d66d34ac316 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -150,13 +150,15 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr
                              i + k + 1, si_prefix, format_si_prefix(flops, si_prefix),
                              format_si_prefix(self.best_flops, si_prefix), res, config)
 
+            num_successes = 0
             for result in results:
                 if isinstance(result.costs[0], float):
-                    i += 1
-                else:
-                    print('[Tuner.tune]')
-                    print('  not counting failure towards trial count')
-            #i += len(results)
+                    num_successes += 1
+            if num_successes != len(results):
+                logger.debug('not counting %d failures towards trial count',
+                             len(results) - num_successes)
+            i += num_successes
+
             self.ttl = min(early_stopping + self.best_iter, n_trial) - i
 
             self.update(inputs, results)
diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py
index 9d6d469ba7cd..2b18d645dbc0 100644
--- a/python/tvm/contrib/binutil.py
+++ b/python/tvm/contrib/binutil.py
@@ -72,80 +72,6 @@
 }}
 """
 
-def run_cmd(cmd):
-    """Runs `cmd` in a subprocess and awaits its completion.
-
-    Parameters
-    ----------
-    cmd : List[str]
-        list of command-line arguments
-
-    Returns
-    -------
-    output : str
-        resulting stdout capture from the subprocess
-    """
-    proc = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT)
-    (output, _) = proc.communicate()
-    output = output.decode('utf-8')
-    if proc.returncode != 0:
-        cmd_str = ' '.join(cmd)
-        msg = f'error while running command \"{cmd_str}\":\n{output}'
-        raise RuntimeError(msg)
-    return output
-
-
-RELOCATION_LD_SCRIPT_TEMPLATE = """
-/* linker symbol for use in UTVMInit */
-_utvm_stack_pointer_init = 0x{stack_pointer_init:x};
-
-SECTIONS
-{{
-  . = 0x{text_start:x};
-  . = ALIGN({word_size});
-  .text :
-  {{
-    . = ALIGN({word_size});
-    KEEP(*(.text))
-    KEEP(*(.text*))
-    . = ALIGN({word_size});
-  }}
-
-  . = 0x{rodata_start:x};
-  . = ALIGN({word_size});
-  .rodata :
-  {{
-    . = ALIGN({word_size});
-    KEEP(*(.rodata))
-    KEEP(*(.rodata*))
-    . = ALIGN({word_size});
-  }}
-
-  . = 0x{data_start:x};
-  . = ALIGN({word_size});
-  .data :
-  {{
-    . = ALIGN({word_size});
-    KEEP(*(.data))
-    KEEP(*(.data*))
-    . = ALIGN({word_size});
-  }}
-
-  . = 0x{bss_start:x};
-  . = ALIGN({word_size});
-  .bss :
-  {{
-    . = ALIGN({word_size});
-    KEEP(*(.bss))
-    KEEP(*(.bss*))
-    . = ALIGN({word_size});
-  }}
-}}
-"""
-
 def run_cmd(cmd):
     """Runs `cmd` in a subprocess and awaits its completion.
 
@@ -236,11 +162,11 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
         # padding for most cases, but symbols can be arbitrarily large, so this
         # isn't bulletproof.
         return section_size + 32
-    # TODO remove this arbitrary addition once we figure out why section sizes
-    # are being undercalculated.
-    # maybe stop relying on `*size` to give us the size and instead read the
-    # section with `*objcopy` and count the bytes.
-    return section_size + 8
+
+    # NOTE: in the past, section_size has been wrong on x86. it may be
+    # inconsistent. TODO: maybe stop relying on `*size` to give us the size and
+    # instead read the section with `*objcopy` and count the bytes.
+    return section_size
 
 
 @tvm._ffi.register_func("tvm_callback_relocate_binary")
@@ -315,17 +241,18 @@ def tvm_callback_relocate_binary(
     with open(rel_obj_path, 'rb') as f:
         rel_bin = bytearray(f.read())
 
-    gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR']
-    gdb_init_path = f'{gdb_init_dir}/.gdbinit'
-    with open(gdb_init_path, 'r') as f:
-        gdbinit_contents = f.read().split('\n')
-    new_contents = []
-    for line in gdbinit_contents:
-        new_contents.append(line)
-        if line.startswith('target'):
-            new_contents.append(f'add-symbol-file {rel_obj_path}')
-    with open(gdb_init_path, 'w') as f:
-        f.write('\n'.join(new_contents))
+    gdb_init_dir = os.environ.get('MICRO_GDB_INIT_DIR')
+    if gdb_init_dir is not None:
+        gdb_init_path = f'{gdb_init_dir}/.gdbinit'
+        with open(gdb_init_path, 'r') as f:
+            gdbinit_contents = f.read().split('\n')
+        new_contents = []
+        for line in gdbinit_contents:
+            new_contents.append(line)
+            if line.startswith('target'):
+                new_contents.append(f'add-symbol-file {rel_obj_path}')
+        with open(gdb_init_path, 'w') as f:
+            f.write('\n'.join(new_contents))
 
     return rel_bin
 
diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py
index a3c43583f44d..dd275a3bad4d 100644
--- a/python/tvm/exec/rpc_server.py
+++ b/python/tvm/exec/rpc_server.py
@@ -122,9 +122,10 @@ def server_shutdown():
                               '--utvm-dev-config-args is specified.'))
     parser.add_argument('--utvm-dev-config-args', type=str,
                         help=("Arguments to the device module's generate_config function. "
-                              'Must be a python literal parseable by literal_eval. If specified, the '
-                              "device configuration is generated using the device module's generate_config. "
-                              "Can't be specified when --utvm-dev-config is specified."))
+                              'Must be a python literal parseable by literal_eval. If specified, '
+                              "the device configuration is generated using the device module's "
+                              "generate_config. Can't be specified when --utvm-dev-config is "
+                              "specified."))
     parser.add_argument('--utvm-dev-id', type=str,
                         help=('Unique ID for the target device (if using MicroTVM). Should '
                               'match the name of a module underneath tvm.micro.device).'))
diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py
index 5a1b71846630..d2ec7d398c46 100644
--- a/python/tvm/micro/base.py
+++ b/python/tvm/micro/base.py
@@ -19,13 +19,13 @@
 from __future__ import absolute_import
 
 import os
+import re
 import sys
 from enum import Enum
 
 import tvm
 import tvm._ffi
 
-import tvm
 from tvm.contrib import util as _util
 from tvm.contrib import cc as _cc
 
@@ -86,7 +86,9 @@ def __init__(self, config):
         runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c')
         tmp_dir = _util.tempdir()
         runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj')
-        dev_funcs['create_micro_lib'](runtime_obj_path, runtime_src_path, LibType.RUNTIME)
+        options = ['-I{}'.format(get_micro_host_driven_dir())]
+        dev_funcs['create_micro_lib'](
+            runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options)
 
         comms_method = config['comms_method']
         if comms_method == 'openocd':
@@ -98,7 +100,8 @@ def __init__(self, config):
         else:
             raise RuntimeError(f'unknown communication method: f{self.comms_method}')
 
-        assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), 'not all sections have an assigned memory layout'
+        assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \
+            'not all sections have an assigned memory layout'
         self.module = _CreateSession(
             comms_method,
             runtime_obj_path,
@@ -139,7 +142,7 @@ def _check_system(self):
         # TODO(weberlo): Add 32-bit support.
         # It's primarily the compilation pipeline that isn't compatible.
         if sys.maxsize <= 2**32:
-            raise RuntimeError('MicroTVM is currently only supported on 64-bit platforms')
+            raise RuntimeError('MicroTVM is currently only supported on 64-bit host platforms')
 
     def __enter__(self):
         self._enter()
@@ -151,8 +154,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
 
 def _calc_max_workspace_usage(src):
     # TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size)
-    import re
-    alloc_re = re.compile(r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*')
+    alloc_re = re.compile(
+        r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*')
     free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*')
     max_usage = 0
     alloc_map = {}
@@ -171,7 +174,8 @@ def _calc_max_workspace_usage(src):
     return max_usage
 
 
-def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, lib_include_paths=None):
+def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None,
+                     lib_include_paths=None):
     """Produces a micro module from a given module.
 
     Parameters
@@ -193,24 +197,24 @@ def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, li
     micro_mod : tvm.module.Module
         micro module for the target device
     """
-    print('[create_micro_mod]')
     temp_dir = _util.tempdir()
     lib_obj_path = temp_dir.relpath('dev_lib.obj')
     # TODO use dev config to dispatch on the type of C codegen to run through
     # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV)
     c_mod.export_library(
-            lib_obj_path,
-            fcompile=cross_compiler(
-                dev_config,
-                LibType.OPERATOR,
-                lib_src_paths=lib_src_paths,
-                lib_headers=lib_headers,
-                lib_include_paths=lib_include_paths))
-    micro_mod = tvm.module.load(lib_obj_path)
+        lib_obj_path,
+        fcompile=cross_compiler(
+            dev_config,
+            LibType.OPERATOR,
+            lib_src_paths=lib_src_paths,
+            lib_headers=lib_headers,
+            lib_include_paths=lib_include_paths))
+    micro_mod = tvm.runtime.load_module(lib_obj_path)
     return micro_mod
 
 
-def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, lib_include_paths=None):
+def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None,
+                   lib_include_paths=None):
     """Create a cross compile function that wraps `create_lib` for a `Binutil` instance.
 
     For use in `tvm.runtime.Module.export_library`.
@@ -247,7 +251,8 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, l
       fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR)
       c_mod.export_library('dev_lib.obj', fcompile=fcompile)
     """
-    assert (lib_headers is None) == (lib_include_paths is None), 'must specify both `lib_headers` and `lib_include_paths` or neither'
+    assert (lib_headers is None) == (lib_include_paths is None), \
+        'must specify both `lib_headers` and `lib_include_paths` or neither'
 
     if lib_src_paths is None:
         lib_src_paths = []
@@ -257,7 +262,8 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, l
     for include_path in lib_include_paths:
         include_options.append('-I')
         include_options.append(include_path)
-    create_micro_lib = tvm.micro.device.get_device_funcs(dev_config['device_id'])['create_micro_lib']
+    create_micro_lib = tvm.micro.device.get_device_funcs(
+        dev_config['device_id'])['create_micro_lib']
     mem_layout = dev_config['mem_layout']
 
     def compile_func(obj_path, src_path, **kwargs):
@@ -274,7 +280,8 @@ def compile_func(obj_path, src_path, **kwargs):
             max_ws_usage = _calc_max_workspace_usage(src_contents)
             available_mem = mem_layout['workspace']['size']
             if max_ws_usage > available_mem:
-                raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) exceed available memory ({available_mem})')
+                raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) '
+                                   f'exceed available memory ({available_mem})')
         # inject headers into new source path, if requested
         if lib_headers:
             headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n'
diff --git a/python/tvm/micro/device/__init__.py b/python/tvm/micro/device/__init__.py
index 3d2291c6a052..89731b9aa797 100644
--- a/python/tvm/micro/device/__init__.py
+++ b/python/tvm/micro/device/__init__.py
@@ -16,7 +16,8 @@
 # under the License.
 """Device-specific configuration for MicroTVM"""
 
-from .base import create_micro_lib_base, gen_mem_layout, MemConstraint, register_device, get_device_funcs
+from .base import create_micro_lib_base, gen_mem_layout
+from .base import MemConstraint, register_device, get_device_funcs
 from . import host
 from . import arm
 from . import riscv_spike
diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py
index 8cd354738fe7..f85a34e4e0a2 100644
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Compilation and config definitions for Arm STM32F746XX devices"""
+import os
 from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
 DEVICE_ID = 'arm.stm32f746xx'
@@ -60,13 +61,16 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
     """
     if options is None:
         options = []
+    else:
+        options = list(options)
+
     options += [
+        # TODO(weberlo): make a debug flag
+        '-O2',
         '-march=armv7e-m',
         '-mcpu=cortex-m7',
         '-mlittle-endian',
         '-mfloat-abi=hard',
-        # TODO try this one?
-        #'-mfpu=fpv5-d16',
         '-mfpu=fpv5-sp-d16',
         '-mthumb',
         '-ffast-math',
@@ -74,9 +78,14 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
         '-DARM_MATH_CM7',
         '-D__FPU_PRESENT=1U',
         '-DARM_MATH_DSP',
+        '-Wno-unused-variable',
+        '-Wno-unused-parameter',
+        '-I{}'.format(os.environ['CMSIS_ST_PATH']),
+        '-I{}/Core/Include'.format(os.environ['CMSIS_ST_PATH'])
         ]
     create_micro_lib_base(
-        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths)
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
+        lib_src_paths=lib_src_paths)
 
 
 def generate_config(server_addr, server_port, section_constraints=None):
@@ -90,8 +99,8 @@ def generate_config(server_addr, server_port, section_constraints=None):
     server_port : int
         port of OpenOCD server to connect to
 
-    section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]]
-        TODO correct type annotation?
+    section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
+        maps section name to the quantity of available memory
 
     Return
     ------
@@ -106,7 +115,7 @@ def generate_config(server_addr, server_port, section_constraints=None):
         'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints),
         'word_size': WORD_SIZE,
         'thumb_mode': True,
-        'use_device_timer': False,
+        'use_device_timer': True,
         'comms_method': 'openocd',
         'server_addr': server_addr,
         'server_port': server_port,
diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py
index 41d1a3ed1a95..1621c69d1a77 100644
--- a/python/tvm/micro/device/base.py
+++ b/python/tvm/micro/device/base.py
@@ -19,7 +19,6 @@
 import os
 import enum
 import pathlib
-import operator
 
 from tvm.contrib import util as _util
 from tvm.contrib.binutil import run_cmd
@@ -98,14 +97,8 @@ def create_micro_lib_base(
         additional options to pass to GCC
 
     lib_src_paths : Optional[List[str]]
-        TODO
+        paths to additional source files to be compiled into the library
     """
-    print('[MicroBinutil.create_lib]')
-    print('  EXTENDED OPTIONS')
-    print(f'    {out_obj_path}')
-    print(f'    {in_src_path}')
-    print(f'    {lib_type}')
-    print(f'    {options}')
     # look at these (specifically `strip`):
     #   https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size
     base_compile_cmd = [
@@ -115,10 +108,6 @@ def create_micro_lib_base(
         '-Wextra',
         '--pedantic',
         '-c',
-        # TODO(weberlo): make a debug flag
-        '-O0',
-        # '-O2',
-        # '-Os',
         '-g',
         '-nostartfiles',
         '-nodefaultlibs',
@@ -162,12 +151,12 @@ def create_micro_lib_base(
     if lib_src_paths is not None:
         src_paths += lib_src_paths
 
-    print(f'include paths: {include_paths}')
+    # print(f'include paths: {include_paths}')
     for path in include_paths:
         base_compile_cmd += ['-I', path]
 
     prereq_obj_paths = []
-    print(src_paths)
+    # print(src_paths)
     for src_path in src_paths:
         curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name)
         assert curr_obj_path not in prereq_obj_paths
@@ -190,9 +179,28 @@ class MemConstraint(enum.Enum):
 
 
 def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
-    print('[gen_mem_layout]')
-    byte_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.ABSOLUTE_BYTES, section_constraints.values())))
-    weight_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.WEIGHT, section_constraints.values())))
+    """Template function to generate memory layout for devices.
+
+    Parameters
+    ----------
+    base_addr: Number
+        The address where usable memory begins on this device.
+
+    available_mem: Number
+        Available memory at base_addr, given in bytes.
+
+    word_size: Number
+        Number of bytes in one word on this device.
+
+    section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
+        maps section name to the quantity of available memory
+    """
+    byte_sum = sum(x[0]
+                   for x in section_constraints.values()
+                   if x[1] == MemConstraint.ABSOLUTE_BYTES)
+    weight_sum = sum(x[0]
+                     for x in section_constraints.values()
+                     if x[1] == MemConstraint.WEIGHT)
     assert byte_sum <= available_mem
     available_weight_mem = available_mem - byte_sum
 
@@ -201,7 +209,8 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
     for section in DEVICE_SECTIONS:
         (val, cons_type) = section_constraints[section]
         if cons_type == MemConstraint.ABSOLUTE_BYTES:
-            assert val % word_size == 0, f'constraint {val} for {section} section is not word-aligned'
+            assert val % word_size == 0, \
+                f'constraint {val} for {section} section is not word-aligned'
             size = val
             res[section] = {
                 'start': curr_addr,
@@ -216,13 +225,6 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
             }
         curr_addr += size
 
-    print('  result mem layout:')
-    for section in DEVICE_SECTIONS:
-        start = res[section]['start']
-        size = res[section]['size']
-        print(f'    {section}: start={start:x}, size={size}')
-    # import pprint
-    # pprint.pprint(res)
     return res
 
 
diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py
index 737026dfd51a..f6d0dadc5f43 100644
--- a/python/tvm/micro/device/host.py
+++ b/python/tvm/micro/device/host.py
@@ -54,14 +54,19 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
         additional options to pass to GCC
 
     lib_src_paths : Optional[List[str]]
-        TODO
+        paths to additional source files to be compiled into the library
     """
     if options is None:
         options = []
+    else:
+        options = list(options)
+    # Cannot increase optimization level on host due to code loading method.
+    options.append('-O0')
     if sys.maxsize > 2**32 and sys.platform.startswith('linux'):
         options += ['-mcmodel=large']
     create_micro_lib_base(
-        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths)
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
+        lib_src_paths=lib_src_paths)
 
 
 def generate_config(available_mem=None, section_constraints=None):
@@ -69,13 +74,11 @@ def generate_config(available_mem=None, section_constraints=None):
 
     Parameters
     ----------
-    TODO correct type annotation?
     available_mem: int
-        TODO
+        number of RW bytes available for use on device
 
-    TODO correct type annotation?
-    section_constraints: Optional[Dict[str, Dict[str, Number]]]
-        TODO
+    section_constraints: Optional[Dict[str, Dict[Number, MemConstraint]]]
+        maps section name to the quantity of available memory
 
     Return
     ------
diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py
index b7beff3c5cb6..f26f04604cac 100644
--- a/python/tvm/micro/device/riscv_spike.py
+++ b/python/tvm/micro/device/riscv_spike.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """Compilation and config definitions for Spike, a RISC-V functional ISA simulator"""
-from collections import OrderedDict
 
 from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py
index 42ced795f6ae..6a1d928352f4 100644
--- a/python/tvm/relay/_parser.py
+++ b/python/tvm/relay/_parser.py
@@ -630,12 +630,6 @@ def visitCallWithAttr(self, ctx: RelayParser.CallWithAttrContext):
 
     def call(self, func, args, attrs, type_args):
         if isinstance(func, OpWrapper):
-            #if hasattr(func.operator, '__name__') and func.operator.__name__ == 'clip':
-            #    # TODO(wbelrlo) this big fucking hack yes
-            #    import copy
-            #    args = copy.deepcopy(args)
-            #    args[1] = float(args[1].data.asnumpy())
-            #    args[2] = float(args[2].data.asnumpy())
             return func(args, attrs, type_args)
         if isinstance(func, adt.Constructor):
             return func(*args)
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 459a4a588324..30c5971e32b9 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -239,26 +239,16 @@ def build(mod, target=None, target_host=None, params=None):
         raise ValueError("target host must be the type of str, " +
                          "tvm.target.Target, or None")
 
-    # # If current dispatch context is fallback context (the default root context),
-    # # then load pre-tuned parameters from TopHub
-    # if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
-    #     tophub_context = autotvm.tophub.context(list(target.values()))
-    # else:
-    #     tophub_context = autotvm.util.EmptyContext()
-
-    # with tophub_context:
-    #     bld_mod = BuildModule()
-    #     graph_json, mod, params = bld_mod.build(func, target, target_host, params)
-
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
     if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
         tophub_context = autotvm.tophub.context(list(target.values()))
-        with tophub_context:
-            bld_mod = BuildModule()
-            graph_json, mod, params = bld_mod.build(func, target, target_host, params)
     else:
-        bld_mod = BuildModule()
-        graph_json, mod, params = bld_mod.build(func, target, target_host, params)
+        tophub_context = autotvm.util.EmptyContext()
 
+    with tophub_context:
+        bld_mod = BuildModule()
+        graph_json, mod, params = bld_mod.build(mod, target, target_host, params)
     return graph_json, mod, params
 
 
diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index d455f2229439..43065bef838a 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -85,7 +85,7 @@ def _convert_activation(inexpr, keras_layer, _):
         return _op.sigmoid(inexpr)
     if act_type == 'tanh':
         return _op.tanh(inexpr)
-    if act_type in ('relu', 'swish'):
+    if act_type == 'relu':
         return _op.nn.relu(inexpr)
     if act_type == 'softplus':
         return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32')))
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 942d4c7f86af..2ad75533e807 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -20,24 +20,25 @@
 import logging
 
 import topi
+from ....target import arm_isa
 from .generic import *
 from .. import op as _op
 
 logger = logging.getLogger('strategy')
 
-@schedule_injective.register("arm_cpu")
+@schedule_injective.register(["arm_cpu", "micro_dev"])
 def schedule_injective_arm_cpu(_, outs, target):
     """schedule injective ops for arm cpu"""
     with target:
         return topi.arm_cpu.schedule_injective(outs)
 
-@schedule_concatenate.register("arm_cpu")
+@schedule_concatenate.register(["arm_cpu", "micro_dev"])
 def schedule_concatenate_arm_cpu(_, outs, target):
     """schedule concatenate for arm cpu"""
     with target:
         return topi.arm_cpu.schedule_concatenate(outs)
 
-@conv2d_strategy.register("arm_cpu")
+@conv2d_strategy.register(["arm_cpu", "micro_dev"])
 def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d arm cpu strategy"""
     strategy = _op.OpStrategy()
@@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
 
+    isa = arm_isa.IsaAnalyzer(target)
+
     if groups == 1:
         if layout == "NCHW":
             if kernel_layout == "OIHW":
@@ -102,11 +105,22 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                 name="conv2d_hwcn.generic")
         elif layout == "NHWC":
-            assert kernel_layout == "HWIO"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
-                wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
-                name="conv2d_nhwc_spatial_pack.arm_cpu")
+            channels = data.shape[3]
+            if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI":
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd),
+                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd),
+                    name='conv2d_direct_simd.micro_dev')
+            elif kernel_layout == "HWIO":
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
+                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
+                    name="conv2d_nhwc_spatial_pack.arm_cpu")
+            else:
+                raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC".
+                                   format(kernel_layout))
+
+
         else:
             raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout))
     elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
@@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out
                            format(layout))
     return strategy
 
-@conv2d_transpose_strategy.register("arm_cpu")
+@conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"])
 def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d_transpose arm cpu strategy"""
     layout = attrs.data_layout
@@ -248,7 +262,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
         name="conv2d_tranpose_nchw.arm_cpu")
     return strategy
 
-@bitserial_conv2d_strategy.register("arm_cpu")
+@bitserial_conv2d_strategy.register(["arm_cpu", "micro_dev"])
 def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_conv2d x86 strategy"""
     strategy = _op.OpStrategy()
@@ -267,7 +281,7 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
         raise ValueError("Data layout {} not supported.".format(layout))
     return strategy
 
-@bitserial_dense_strategy.register("arm_cpu")
+@bitserial_dense_strategy.register(["arm_cpu", "micro_dev"])
 def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_dense arm cpu strategy"""
     strategy = _op.OpStrategy()
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index f5a16941083b..ed57e0d4276d 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -70,11 +70,6 @@ def context(self, dev_type, dev_id=0):
         ctx: TVMContext
             The corresponding encoded remote context.
         """
-        print('[RPCSession.context]')
-        print(f'  dev_type: {dev_type}')
-        print(f'  dev_id: {dev_id}')
-        if '-device=micro_dev' in dev_type:
-            dev_type = 'micro_dev'
         ctx = nd.context(dev_type, dev_id)
         encode = (self._tbl_index + 1) * base.RPC_SESS_MASK
         ctx.device_type += encode
@@ -408,14 +403,9 @@ def connect(url, port, key="", session_timeout=0):
         The connected session.
     """
     try:
-        print('[client.connect]')
-        #session_timeout = 0
-        print('  hardcoding timeout to 0 (always keep alive)!')
         if session_timeout:
             key += " -timeout=%s" % str(session_timeout)
-        print(f'  connecting to RPC server with {url}, {port}, {key}')
         sess = base._Connect(url, port, key)
-        print(f'  finished connecting!')
     except NameError:
         raise RuntimeError("Please compile with USE_RPC=1")
     return RPCSession(sess)
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 74296cc2ceaa..03749c1c17e4 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -64,9 +64,7 @@ def get_workpath(path):
     def load_module(file_name):
         """Load module from remote side."""
         path = temp.relpath(file_name)
-        print('[rpc.server] ABOUT TO LOAD MOD')
         m = _load_module(path)
-        print('[rpc.server] DONE LOADING MOD')
         logger.info("load_module %s", path)
         return m
 
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index 001c76c76fef..e3346b162aaf 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -297,11 +297,8 @@ def _event_handler(_, events):
     def _on_event(self, _):
         while True:
             try:
-                print('waiting for connection!')
                 conn, addr = self._sock.accept()
-                print(f'got new conn: {conn}, {addr}')
                 TCPEventHandler(self, conn, addr)
-                print(f'made event handler')
             except socket.error as err:
                 if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
                     break
diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
index 45dd79e50019..716f87f33fc1 100644
--- a/python/tvm/runtime/module.py
+++ b/python/tvm/runtime/module.py
@@ -211,7 +211,6 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0):
 
             def evaluator(*args):
                 """Internal wrapped evaluator."""
-                print('[Module.time_evaluator.evaluator]')
                 # Wrap feval so we can add more stats in future.
                 blob = feval(*args)
                 fmt = "@" + ("d" * repeat)
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 10bbb6ef54c2..9b7e7c52f351 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -219,7 +219,7 @@ def context(dev_type, dev_id=0):
     """
     if isinstance(dev_type, string_types):
         if '-device=micro_dev' in dev_type:
-            dev_type = 'micro_dev'
+            dev_type = TVMContext.STR2MASK['micro_dev']
         else:
             dev_type = dev_type.split()[0]
             if dev_type not in TVMContext.STR2MASK:
diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py
new file mode 100644
index 000000000000..6b2eda8f050e
--- /dev/null
+++ b/python/tvm/target/arm_isa.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Defines functions to analyze availble opcodes in the ARM ISA."""
+
+
+ARM_ISA_MAP = {
+    'armv7e-m': ['SMLAD'],
+}
+
+
+class IsaAnalyzer(object):
+
+    def __init__(self, target):
+        self.target = target
+        # TODO: actually parse -mcpu
+        arch = 'armv7e-m'
+        self._isa_map = ARM_ISA_MAP[arch]
+
+    def __contains__(self, instruction):
+        return instruction in self._isa_map
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 849c74028d63..8231c1b79d38 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -272,7 +272,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs,
   Target target_host_val = target_host;
   if (!target_host.defined()) {
     for (const auto& it : inputs) {
-      if (it.first->device_type == kDLCPU) {
+      if (it.first->device_type == kDLCPU || it.first->device_type == kDLMicroDev) {
         target_host_val = it.first;
         break;
       }
diff --git a/src/ir/error.cc b/src/ir/error.cc
index e700691f3e07..67694342db45 100644
--- a/src/ir/error.cc
+++ b/src/ir/error.cc
@@ -69,8 +69,6 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) {
     if (error_indices.size() != 0) {
       err_msg << rang::fg::red;
       err_msg << " ";
-      // TODO should fix this reverse problem further upstream (in the error reporter).
-      //
       // the errors are in reverse order, so print them with a reversed iteration
       err_msg << this->errors_[error_indices[error_indices.size()-1]].what();
       for (int i = error_indices.size() - 2; i >= 0; i--) {
diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
index 5861c0326dbf..f5720f4d7b28 100644
--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
@@ -17,12 +17,6 @@
  * under the License.
  */
 
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file utvm_init.s
- * \brief uTVM init definition for STM32F746XX-series boards
- */
-
 .syntax unified
 .cpu cortex-m7
 .fpu softvfp
diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
index a5a12cf86ebd..0f13a7dede88 100644
--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
@@ -18,7 +18,6 @@
  */
 
 /*!
- *  Copyright (c) 2019 by Contributors
  * \file utvm_timer.c
  * \brief uTVM timer API definitions for STM32F746XX-series boards
  */
@@ -30,97 +29,51 @@ extern "C" {
 #include <stdint.h>
 
 #include "utvm_runtime.h"
+// NOTE: This expects ST CMSIS to be in your include path.
+// Download STM32CubeF7 here:
+// https://www.st.com/content/st_com/en/products/embedded-software/mcu-mpu-embedded-software/stm32-embedded-software/stm32cube-mcu-mpu-packages/stm32cubef7.html
+// and add Drivers/CMSIS to your C include path.
+#include "Device/ST/STM32F7xx/Include/stm32f746xx.h"
 
-// There are two implementations of cycle counters on the STM32F7X: SysTick and
-// CYCCNT.  SysTick is preferred, as it gives better error handling, but the
-// counter is only 24 bits wide.  If a larger timer is needed, use the CYCCNT
-// implementation, which has a 32-bit counter.
-#define USE_SYSTICK
 
-#ifdef USE_SYSTICK
-
-#define SYST_CSR    (*((volatile uint32_t *) 0xE000E010))
-#define SYST_RVR    (*((volatile uint32_t *) 0xE000E014))
-#define SYST_CVR    (*((volatile uint32_t *) 0xE000E018))
-#define SYST_CALIB  (*((volatile uint32_t *) 0xE000E01C))
-
-#define SYST_CSR_ENABLE     0
-#define SYST_CSR_TICKINT    1
-#define SYST_CSR_CLKSOURCE  2
-#define SYST_COUNTFLAG      16
-
-#define SYST_CALIB_NOREF  31
-#define SYST_CALIB_SKEW   30
-
-volatile uint32_t start_time = 0;
-volatile uint32_t stop_time = 0;
+#define utvm_SystemCoreClock 216000000UL
 
 int32_t UTVMTimerStart() {
-  SYST_CSR = 0;
-  // maximum reload value (24-bit)
-  SYST_RVR = (~((uint32_t) 0)) >> 8;
-  SYST_CVR = 0;
-
-  SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE);
-  // wait until timer starts
-  while (SYST_CVR == 0) {}
-  start_time = SYST_CVR;
+  UTVMTimerReset();
+  TIM2->CR1 =
+    TIM_CR1_CEN;  // Start counter
   return UTVM_ERR_OK;
 }
 
-uint32_t UTVMTimerStop(int32_t *err) {
-  SYST_CSR &= ~((uint32_t) 1);
-  stop_time = SYST_CVR;
-  if (SYST_CSR & (1 << SYST_COUNTFLAG)) {
-    TVMAPISetLastError("timer overflowed");
+uint32_t UTVMTimerStop(int32_t* err) {
+  TIM2->CR1 &= TIM_CR1_CEN;
+  if (TIM2->SR & TIM_SR_UIF_Msk) {
     *err = UTVM_ERR_TIMER_OVERFLOW;
     return 0;
-  } else {
-    *err = UTVM_ERR_OK;
-    return start_time - stop_time;
-  }
-}
-
-#else  // !USE_SYSTICK
-
-#define DWT_CTRL    (*((volatile uint32_t *) 0xE0001000))
-#define DWT_CYCCNT  (*((volatile uint32_t *) 0xE0001004))
-
-#define DWT_CTRL_NOCYCCNT   25
-#define DWT_CTRL_CYCCNTENA  0
-
-volatile uint32_t start_time = 0;
-volatile uint32_t stop_time = 0;
-
-int32_t UTVMTimerStart() {
-  DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
-  DWT_CYCCNT = 0;
-
-  if (DWT_CTRL & (1 << DWT_CTRL_NOCYCCNT)) {
-    TVMAPISetLastError("cycle counter not implemented on device");
-    return UTVM_ERR_TIMER_NOT_IMPLEMENTED;
   }
-  start_time = DWT_CYCCNT;
-  DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA);
-  return UTVM_ERR_OK;
+  *err = UTVM_ERR_OK;
+  uint32_t tim_cnt = TIM2->CNT;
+  uint32_t millis = tim_cnt / (utvm_SystemCoreClock / 1000);
+  uint32_t micros =
+    (tim_cnt - (millis * (utvm_SystemCoreClock / 1000))) /
+    (utvm_SystemCoreClock / 1000000);
+  return millis * 1000 + micros;
 }
 
-uint32_t UTVMTimerStop(int32_t* err) {
-  stop_time = DWT_CYCCNT;
-  DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
-  // even with this check, we can't know for sure if the timer has overflowed
-  // (it may have overflowed and gone past `start_time`).
-  if (stop_time > start_time) {
-    *err = UTVM_ERR_OK;
-    return stop_time - start_time;
-  } else {
-    *err = UTVM_ERR_TIMER_OVERFLOW;
-    return 0;
+void UTVMTimerReset() {
+  RCC->APB1RSTR |= RCC_APB1RSTR_TIM2RST;  // Hold TIM2 in reset
+  RCC->DCKCFGR1 = (RCC->DCKCFGR1 & ~RCC_DCKCFGR1_TIMPRE_Msk);  // disable 2x clock boost to TIM2
+  RCC->CFGR = (RCC->CFGR & ~RCC_CFGR_PPRE1_Msk);  // No AHB clock division to APB1 (1:1).
+  RCC->APB1ENR |= RCC_APB1ENR_TIM2EN;  // Enable TIM2 clock.
+  RCC->APB1RSTR &= ~RCC_APB1RSTR_TIM2RST;  // Exit TIM2 reset.
+
+  DBGMCU->APB1FZ |= DBGMCU_APB1_FZ_DBG_TIM2_STOP;  // stop TIM2 clock during debug halt.
+  TIM2->ARR = 0xffffffff;
+  if (TIM2->SR & TIM_SR_UIF_Msk) {
+    for (;;) ;
   }
 }
 
-#endif  // USE_SYSTICK
-
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif
diff --git a/src/runtime/micro/device/host/utvm_init.c b/src/runtime/micro/device/host/utvm_init.c
index 02888f04c48e..4fb43c11d20e 100644
--- a/src/runtime/micro/device/host/utvm_init.c
+++ b/src/runtime/micro/device/host/utvm_init.c
@@ -18,7 +18,6 @@
  */
 
 /*!
- *  Copyright (c) 2019 by Contributors
  * \file utvm_init.c
  * \brief uTVM init definition for the host emulated device
  */
diff --git a/src/runtime/micro/device/host/utvm_timer.c b/src/runtime/micro/device/host/utvm_timer.c
index 7b24aab473d1..6ab585a88f24 100644
--- a/src/runtime/micro/device/host/utvm_timer.c
+++ b/src/runtime/micro/device/host/utvm_timer.c
@@ -20,12 +20,10 @@
 /*!
  * \file utvm_timer.c
  * \brief uTVM timer API stubs for the host emulated device
- *  Copyright (c) 2019 by Contributors
  */
 
 #include <stdint.h>
-#include <tvm/runtime/c_runtime_api.h>
-#include <tvm/runtime/c_backend_api.h>
+#include "utvm_runtime.h"
 
 // TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c
 
@@ -37,17 +35,3 @@ uint32_t UTVMTimerStop(int32_t* err) {
   *err = UTVM_ERR_OK;
   return 0;
 }
-
-extern void UTVMInit();
-
-extern void UTVMTimerReset();
-
-extern int32_t UTVMTimerStart();
-
-extern void UTVMTimerStop();
-
-extern uint32_t UTVMTimerRead();
-
-void UTVMMain();
-
-void UTVMDone();
diff --git a/src/runtime/micro/device/riscv_spike/utvm_init.s b/src/runtime/micro/device/riscv_spike/utvm_init.s
index a73641249165..68662cce97e7 100644
--- a/src/runtime/micro/device/riscv_spike/utvm_init.s
+++ b/src/runtime/micro/device/riscv_spike/utvm_init.s
@@ -17,12 +17,6 @@
  * under the License.
  */
 
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file utvm_init.s
- * \brief uTVM init definition for Spike
- */
-
 UTVMInit:
   /* set stack pointer */
   la sp, _utvm_stack_pointer_init
diff --git a/src/runtime/micro/device/riscv_spike/utvm_timer.c b/src/runtime/micro/device/riscv_spike/utvm_timer.c
index c4e0af2b230b..5cf38559feab 100644
--- a/src/runtime/micro/device/riscv_spike/utvm_timer.c
+++ b/src/runtime/micro/device/riscv_spike/utvm_timer.c
@@ -18,7 +18,6 @@
  */
 
 /*!
- *  Copyright (c) 2019 by Contributors
  * \file utvm_timer.c
  * \brief uTVM timer API stubs for Spike
  */
diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
index 970eb27a1ef3..3a840e7a7861 100644
--- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
+++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
@@ -32,11 +32,10 @@ extern "C" {
 #include <stdint.h>
 #include <stddef.h>
 
-// TODO compiler errors say volatile qualifier is discarded. should we just get rid of em?
-volatile void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) =
-    (void *(*)(int, int, uint64_t, int, int)) NULL;
-volatile int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL;
-volatile void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL;
+// TODO(areusch): compiler errors say volatile qualifier is discarded. should we just get rid of em?
+void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL;
+int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL;
+void (* volatile TVMAPISetLastError_)(const char*) = NULL;
 
 void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
     int dtype_code_hint, int dtype_bits_hint) {
diff --git a/src/runtime/micro/host_driven/utvm_runtime.c b/src/runtime/micro/host_driven/utvm_runtime.c
index a4de495a185c..f05fdb5fe2c1 100644
--- a/src/runtime/micro/host_driven/utvm_runtime.c
+++ b/src/runtime/micro/host_driven/utvm_runtime.c
@@ -34,89 +34,148 @@ extern "C" {
 
 #include "utvm_runtime.h"
 
-// Task pointers must be patched before calling a function.
-UTVMTask utvm_task = {
-    .func = NULL,
-    .arg_values = NULL,
-    .arg_type_codes = NULL,
-    .num_args = 0,
-};
-
-size_t utvm_word_size = 0;  // NOLINT(*)
+// TODO(areusch): move defines into header
+#define TASK_QUEUE_SIZE 20
+volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { };
+volatile uint32_t utvm_num_tasks = 0;
+volatile uint32_t utvm_task_times[TASK_QUEUE_SIZE] = { };
 
 // These pointers are patched at load time to point to the workspace section.
-char* utvm_workspace_start = NULL;  // NOLINT(*)
-char* utvm_workspace_end = NULL;    // NOLINT(*)
-char* utvm_workspace_curr = NULL;   // NOLINT(*)
+volatile char* utvm_workspace_start = NULL;  // NOLINT(*)
+volatile char* utvm_workspace_end = NULL;    // NOLINT(*)
+volatile char* utvm_workspace_curr = NULL;   // NOLINT(*)
+#define MAX_WS_ALLOCS 10
+volatile char* utvm_alloc_ends[MAX_WS_ALLOCS] = {};  // NOLINT(*)
+volatile uint32_t utvm_alloc_idx = 0;
 // Keep track of how many active allocations there are on the workspace.
-size_t utvm_num_active_allocs = 0;
+volatile uint32_t utvm_num_active_allocs = 0;
+
+volatile uint32_t utvm_word_size = 0;
 
-const char* utvm_last_error = NULL;  // NOLINT(*)
-int32_t utvm_return_code = 0;        // NOLINT(*)
+volatile int32_t utvm_last_error = 0;  // NOLINT(*)
 
-uint32_t utvm_task_time = 0;
+volatile uint32_t utvm_done = 0;
 
 // Gets called by UTVMInit, after device-specific initialization is finished.
 void UTVMMain() {
+  utvm_done = 0;
+  // loss of precision should be fine here, since we only care about the lower bits
+  if (((uint32_t) utvm_workspace_start) % utvm_word_size) {
+    utvm_last_error = UTVM_ERR_WS_UNALIGNED_START;
+    UTVMDone();
+    return;
+  }
   utvm_workspace_curr = utvm_workspace_start;
   utvm_num_active_allocs = 0;
-  utvm_last_error = NULL;  // NOLINT(*)
-  utvm_return_code = 0;
-  utvm_task_time = 0;
-  UTVMTimerReset();
-  int32_t err = UTVMTimerStart();
-  if (err < 0) {
-    utvm_return_code = err;
-    UTVMDone();
+  utvm_alloc_idx = 0;
+  utvm_last_error = UTVM_ERR_NOT_FINISHED;
+  for (uint32_t i = 0; i < utvm_num_tasks; i++) {
+    int32_t err = UTVM_ERR_OK;
+    utvm_task_times[i] = 0;
+    err = UTVMTimerStart();
+    if (err < 0) {
+      utvm_last_error = err;
+      UTVMDone();
+      return;
+    }
+    err = utvm_tasks[i].func(
+        (void*) utvm_tasks[i].arg_values,      // NOLINT(*)
+        (void*) utvm_tasks[i].arg_type_codes,  // NOLINT(*)
+        utvm_tasks[i].num_args);
+    if (err < 0) {
+      UTVMDone();
+      return;
+    }
+    utvm_task_times[i] = UTVMTimerStop(&err);
+    if (err < 0) {
+      utvm_last_error = err;
+      UTVMDone();
+      return;
+    }
+  }
+  if (utvm_last_error == UTVM_ERR_NOT_FINISHED) {
+    utvm_last_error = UTVM_ERR_OK;
   }
-  utvm_return_code = utvm_task.func(
-          (void*) utvm_task.arg_values,      // NOLINT(*)
-          (void*) utvm_task.arg_type_codes,  // NOLINT(*)
-          utvm_task.num_args);
-  UTVMTimerStop();
-  utvm_task_time = UTVMTimerRead();
   UTVMDone();
 }
 
 // We use a dummy function to signal execution is finished for device
 // backends which require breakpoints.
-void UTVMDone() { }
+void __attribute__((noinline)) UTVMDone() {
+  utvm_done = 1;
+}
+
+#define ALIGNED_UP(x, word_size) \
+  ((((word_size) - (((uintptr_t) (x)) % (word_size))) % (word_size)) + (x))
 
 void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
                                int dtype_code_hint, int dtype_bits_hint) {
-  // Align up to 8 bytes.
-  utvm_workspace_curr +=
-    (utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size;  // NOLINT(*)
-  if (utvm_workspace_curr + size > utvm_workspace_end) {
+  if (size == 0) {
+    utvm_last_error = UTVM_ERR_WS_ZERO_SIZE_ALLOC;
+    return NULL;
+  }
+  size_t alloc_requested_bytes = size;
+  size_t alloc_size_words = (alloc_requested_bytes + utvm_word_size - 1) / utvm_word_size;
+  size_t alloc_size_bytes = alloc_size_words * utvm_word_size;
+
+  // Align up to the target word size.
+  if (utvm_workspace_curr + alloc_size_bytes > utvm_workspace_end) {
     // Out of space in workspace.
+    utvm_last_error = UTVM_ERR_WS_OUT_OF_SPACE;
+    return NULL;
+  }
+  if (utvm_alloc_idx == MAX_WS_ALLOCS - 1) {
+    // Exceeded number of allocs we can keep track of.
+    utvm_last_error = UTVM_ERR_WS_TOO_MANY_ALLOCS;
     return NULL;
   }
   void* ret_ptr = (void*) utvm_workspace_curr;  // NOLINT(*)
-  utvm_workspace_curr += size;
+  utvm_workspace_curr = utvm_workspace_curr + alloc_size_bytes;
+  // store the *end* of the alloc, so we can restore the WS pointer when freeing
+  utvm_alloc_ends[utvm_alloc_idx] = utvm_workspace_curr;
+  utvm_alloc_idx++;
   utvm_num_active_allocs++;
   return ret_ptr;
 }
 
 int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
-  utvm_num_active_allocs--;
-  if (utvm_num_active_allocs < 0) {
+  // TODO(areusch): add dev type check
+  if (utvm_num_active_allocs == 0) {
     TVMAPISetLastError("free called with no active workspace allocations");
     // Reset allocations and workspace (for future task executions).
     utvm_num_active_allocs = 0;
     utvm_workspace_curr = utvm_workspace_start;
+    utvm_last_error = UTVM_ERR_WS_DOUBLE_FREE;
     return -1;
-  } else if (utvm_num_active_allocs == 0) {
-    // No more allocations.  Reset workspace.
-    utvm_workspace_curr = utvm_workspace_start;
-    return 0;
   } else {
+    utvm_num_active_allocs--;
+    if (ptr == utvm_workspace_start) {
+      // it's the first allocation
+      utvm_alloc_ends[0] = NULL;
+    } else {
+      // TODO(areusch): reverse loop iteration since usually it's the last alloc being freed
+      for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) {
+        if (utvm_alloc_ends[i] == ptr) {
+          utvm_alloc_ends[i + 1] = NULL;
+          break;
+        }
+      }
+    }
+    while (utvm_alloc_idx > 0 && utvm_alloc_ends[utvm_alloc_idx - 1] == NULL) {
+      utvm_alloc_idx--;
+    }
+    if (utvm_alloc_idx == 0) {
+      utvm_workspace_curr = utvm_workspace_start;
+    } else {
+      // TODO(areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in this
+      // branch?
+      utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1];
+    }
     return 0;
   }
 }
 
-void TVMAPISetLastError(const char* msg) {
-  utvm_last_error = msg;
-}
+void TVMAPISetLastError(const char* msg) { }
 
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h
index b39309a784f5..42ca025fc2ab 100644
--- a/src/runtime/micro/host_driven/utvm_runtime.h
+++ b/src/runtime/micro/host_driven/utvm_runtime.h
@@ -18,7 +18,6 @@
  */
 
 /*!
- *  Copyright (c) 2019 by Contributors
  * \file utvm_runtime.h
  * \brief uTVM runtime headers
  */
diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc
index a24994a2a0e5..13f68f295fa6 100644
--- a/src/runtime/micro/host_low_level_device.cc
+++ b/src/runtime/micro/host_low_level_device.cc
@@ -60,16 +60,16 @@ class HostLowLevelDevice final : public LowLevelDevice {
     munmap(base_addr_, size_);
   }
 
-  void Read(DevPtr addr, void* buf, size_t num_bytes) {
+  void Read(TargetPtr addr, void* buf, size_t num_bytes) {
     std::memcpy(buf, addr.cast_to<void*>(), num_bytes);
   }
 
-  void Write(DevPtr addr, const void* buf, size_t num_bytes) {
+  void Write(TargetPtr addr, const void* buf, size_t num_bytes) {
     std::memcpy(addr.cast_to<void*>(), buf, num_bytes);
   }
 
-  void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
-    reinterpret_cast<void (*)(void)>(func_addr.value().val64)();
+  void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) {
+    reinterpret_cast<void (*)(void)>(func_addr.value().uint64())();
   }
 
   const char* device_type() const final {
diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h
index 3158e2fe20de..666b08199a6b 100644
--- a/src/runtime/micro/low_level_device.h
+++ b/src/runtime/micro/low_level_device.h
@@ -45,7 +45,7 @@ class LowLevelDevice {
    * \param buffer on-host buffer to be read into
    * \param num_bytes number of bytes to read
    */
-  virtual void Read(DevPtr addr,
+  virtual void Read(TargetPtr addr,
                     void* buffer,
                     size_t num_bytes) = 0;
 
@@ -55,7 +55,7 @@ class LowLevelDevice {
    * \param buffer host buffer to write from
    * \param num_bytes number of bytes to write
    */
-  virtual void Write(DevPtr addr,
+  virtual void Write(TargetPtr addr,
                      const void* buffer,
                      size_t num_bytes) = 0;
 
@@ -64,7 +64,7 @@ class LowLevelDevice {
    * \param func_addr offset of the init stub function
    * \param breakpoint_addr address at which to stop function execution
    */
-  virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0;
+  virtual void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) = 0;
 
   /*!
    * \brief getter function for low-level device type
diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc
index 632b6048b182..3e322c639edd 100644
--- a/src/runtime/micro/micro_common.cc
+++ b/src/runtime/micro/micro_common.cc
@@ -52,11 +52,11 @@ const char* SectionToString(SectionKind section) {
 std::string RelocateBinarySections(
     const std::string& binary_path,
     size_t word_size,
-    DevPtr text_start,
-    DevPtr rodata_start,
-    DevPtr data_start,
-    DevPtr bss_start,
-    DevPtr stack_end,
+    TargetPtr text_start,
+    TargetPtr rodata_start,
+    TargetPtr data_start,
+    TargetPtr bss_start,
+    TargetPtr stack_end,
     const std::string& toolchain_prefix) {
   const auto* f = Registry::Get("tvm_callback_relocate_binary");
   CHECK(f != nullptr)
diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h
index e6696f56f06c..7ba62d3f30df 100644
--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -52,26 +52,65 @@ enum class SectionKind : size_t {
   kNumKinds,
 };
 
-/*! \brief union for storing values on varying target word sizes */
-union TargetVal {
-  /*! \brief 32-bit pointer */
-  uint32_t val32;
-  /*! \brief 64-bit pointer */
-  uint64_t val64;
+/*! \brief class for storing values on varying target word sizes */
+class TargetVal {
+ private:
+  size_t width_bits_;
+  uint64_t value_;
+
+ public:
+  template<typename T, typename U = typename std::enable_if<std::is_integral<T>::value, T>::type>
+  explicit constexpr TargetVal(T value) :
+      width_bits_{sizeof(T) * 8}, value_{value} {}
+
+  TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} {
+    CHECK(width_bits != 0 && (width_bits & (width_bits - 1)) == 0)
+      << "width_bits must be a power of 2, got " << width_bits;
+    *this = value;
+  }
+
+  size_t width_bits() const { return width_bits_; }
+  uint64_t bitmask() const {
+    if (width_bits_ == 64) {
+      return 0xffffffff;
+    } else {
+      return (1 << width_bits_) - 1;
+    }
+  }
+
+  uint32_t uint32() const {
+    CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is "
+                             << width_bits_;
+    return uint32_t(value_ & bitmask());
+  }
+
+  uint64_t uint64() const {
+    return value_;
+  }
+
+  TargetVal& operator=(const uint64_t& value) {
+    if (width_bits_ == 64) {
+      value_ = value;
+    } else {
+      CHECK((value & ~bitmask()) == 0) << "bits above " << width_bits_ << " are non-zero";
+      value_ = value & bitmask();
+    }
+    return *this;
+  }
 };
 
-// TODO just get rid of `DevPtr`.
+// TODO(areusch): just get rid of `TargetPtr`.
 /*! \brief absolute device address */
 class TargetPtr {
  public:
-  /*! \brief construct a device address with value `value` */
-  explicit TargetPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {}
+  /*! \brief construct a device address with val64 `value` */
+  explicit TargetPtr(std::uint64_t value) : value_(TargetVal(64, value)) {}
 
-  /*! \brief default constructor */
-  TargetPtr() : value_(TargetVal { .val64 = 0 }) {}
+  /*! \brief default constructor (val64 0) */
+  TargetPtr() : value_(TargetVal(64, 0)) {}
 
-  /*! \brief construct a null address */
-  explicit TargetPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {}
+  /*! \brief construct a null address (stored in val64) */
+  explicit TargetPtr(std::nullptr_t value) : value_{TargetVal(64, 0)} {}
 
   /*! \brief destructor */
   ~TargetPtr() {}
@@ -87,33 +126,33 @@ class TargetPtr {
    * \return casted result
    */
   template <typename T>
-  T cast_to() const { return reinterpret_cast<T>(value_.val64); }
+  T cast_to() const { return reinterpret_cast<T>(value_.uint64()); }
 
   /*! \brief check if location is null */
-  bool operator==(std::nullptr_t) const { return value_.val64 == 0; }
+  bool operator==(std::nullptr_t) const { return value_.uint64() == 0; }
 
   /*! \brief check if location is not null */
-  bool operator!=(std::nullptr_t) const { return value_.val64 != 0; }
+  bool operator!=(std::nullptr_t) const { return value_.uint64() != 0; }
 
   /*! \brief add an integer to this absolute address to get a larger absolute address */
   TargetPtr operator+(size_t n) const {
-    return TargetPtr(value_.val64 + n);
+    return TargetPtr(value_.uint64() + n);
   }
 
   /*! \brief mutably add an integer to this absolute address */
   TargetPtr& operator+=(size_t n) {
-    value_.val64 += n;
+    value_ = value_.uint64() + n;
     return *this;
   }
 
   /*! \brief subtract an integer from this absolute address to get a smaller absolute address */
   TargetPtr operator-(size_t n) const {
-    return TargetPtr(value_.val64 - n);
+    return TargetPtr(value_.uint64() - n);
   }
 
   /*! \brief mutably subtract an integer from this absolute address */
   TargetPtr& operator-=(size_t n) {
-    value_.val64 -= n;
+    value_ = value_.uint64() - n;
     return *this;
   }
 
@@ -174,6 +213,12 @@ class SymbolMap {
     return map_.find(name) != map_.end();
   }
 
+  void Dump(std::ostream& stream) const {
+    for (auto e : map_) {
+      stream << "Entry:" << e.first << std::endl;
+    }
+  }
+
  private:
   /*! \brief backing map */
   std::unordered_map<std::string, TargetPtr> map_;
diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc
index fbaef4af5b07..619c31a8b1ba 100644
--- a/src/runtime/micro/micro_device_api.cc
+++ b/src/runtime/micro/micro_device_api.cc
@@ -61,7 +61,7 @@ class MicroDeviceAPI final : public DeviceAPI {
   void FreeDataSpace(TVMContext ctx, void* ptr) final {
     MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr);
     dev_space->session->FreeInSection(
-      SectionKind::kHeap, DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+      SectionKind::kHeap, TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
     delete dev_space;
   }
 
@@ -74,12 +74,9 @@ class MicroDeviceAPI final : public DeviceAPI {
                       TVMContext ctx_to,
                       DLDataType type_hint,
                       TVMStreamHandle stream) final {
-    std::cout << "[MicroDeviceAPI::CopyDataFromTo]" << std::endl;
     std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type);
     if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) {
-      std::cout << "  device to device" << std::endl;
       // Copying from the device to the device.
-
       MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
       MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
       CHECK(from_space->session == to_space->session)
@@ -93,30 +90,27 @@ class MicroDeviceAPI final : public DeviceAPI {
       session->FlushTaskQueue();
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
-      DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
-      DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);
+      TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
+      TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
 
       std::vector<uint8_t> buffer(size);
       lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size);
       lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size);
+
     } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) {
-      std::cout << "  reading from device" << std::endl;
-      std::cout << "    num_bytes: " << size << std::endl;
       // Reading from the device.
-
       MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
       ObjectPtr<MicroSession>& session = from_space->session;
       // flush all pending tasks to ensure data is consistent
       session->FlushTaskQueue();
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
-      DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
+      TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
+      void* to_host_ptr = GetHostLoc(to, to_offset);
       lld->Read(from_dev_addr, to_host_ptr, size);
+
     } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) {
-      std::cout << "  writing to device" << std::endl;
-      std::cout << "    num_bytes: " << size << std::endl;
       // Writing to the device.
-
       MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
       ObjectPtr<MicroSession>& session = to_space->session;
       // flush all pending tasks to ensure data is consistent
@@ -124,15 +118,15 @@ class MicroDeviceAPI final : public DeviceAPI {
       const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
 
       void* from_host_ptr = GetHostLoc(from, from_offset);
-      DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);
+      TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
       lld->Write(to_dev_addr, from_host_ptr, size);
+
     } else {
       LOG(FATAL) << "Expect copy from/to micro device or between micro device\n";
     }
   }
 
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
-    std::cout << "[MicroDeviceAPI::StreamSync]" << std::endl;
     MicroSession::Current()->FlushTaskQueue();
   }
 
@@ -153,7 +147,7 @@ class MicroDeviceAPI final : public DeviceAPI {
     MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
     ObjectPtr<MicroSession>& session = dev_space->session;
     session->FreeInSection(SectionKind::kWorkspace,
-                           DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+                           TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
     delete dev_space;
   }
 
@@ -167,8 +161,8 @@ class MicroDeviceAPI final : public DeviceAPI {
   }
 
  private:
-  DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
-    return DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
+  TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
+    return TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
   }
 
   void* GetHostLoc(const void* ptr, size_t offset) {
diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc
index b27bd923c638..01056deeeb18 100644
--- a/src/runtime/micro/micro_module.cc
+++ b/src/runtime/micro/micro_module.cc
@@ -54,8 +54,8 @@ class MicroModuleNode final : public ModuleNode {
    * \param binary_path path of the binary to be loaded
    */
   void InitMicroModule(const std::string& binary_path) {
-    std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl;
-    std::cout << "  start" << std::endl;
+    // std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl;
+    // std::cout << "  start" << std::endl;
     session_ = MicroSession::Current();
     symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map;
   }
@@ -69,26 +69,26 @@ class MicroModuleNode final : public ModuleNode {
 class MicroWrappedFunc {
  public:
   MicroWrappedFunc(ObjectPtr<MicroSession> session,
-                   DevPtr func_ptr) {
+                   TargetPtr func_ptr) {
     session_ = session;
     func_ptr_ = func_ptr;
   }
 
   void operator()(TVMArgs args, TVMRetValue* rv) const {
-    *rv = session_->PushToTaskQueue(func_ptr_, args);
+    session_->PushToTaskQueue(func_ptr_, args);
   }
 
  private:
   /*! \brief reference to the session for this function (to keep the session alive) */
   ObjectPtr<MicroSession> session_;
   /*! \brief offset of the function to be called */
-  DevPtr func_ptr_;
+  TargetPtr func_ptr_;
 };
 
 PackedFunc MicroModuleNode::GetFunction(
     const std::string& name,
     const ObjectPtr<Object>& sptr_to_self) {
-  DevPtr func_ptr;
+  TargetPtr func_ptr;
   if (name == tvm::runtime::symbol::tvm_module_main) {
     if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) {
       func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main];
diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h
index 4e8f7201ad7a..8fff39b61d19 100644
--- a/src/runtime/micro/micro_section_allocator.h
+++ b/src/runtime/micro/micro_section_allocator.h
@@ -23,6 +23,7 @@
 #ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
 #define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
 
+#include <string>
 #include <unordered_map>
 #include "micro_common.h"
 
@@ -44,7 +45,7 @@ class MicroSectionAllocator {
       size_(0),
       capacity_(region.size),
       word_size_(word_size) {
-      CHECK_EQ(start_addr_.value().val64 % word_size, 0)
+      CHECK_EQ(start_addr_.value().uint64() % word_size, 0)
         << "micro section start not aligned to " << word_size << " bytes";
       CHECK_EQ(capacity_ % word_size, 0)
         << "micro section end not aligned to " << word_size << " bytes";
@@ -60,14 +61,15 @@ class MicroSectionAllocator {
    * \param alloc_size size of allocated memory in bytes
    * \return pointer to allocated memory region in section, nullptr if out of space
    */
-  DevPtr Allocate(size_t size) {
+  TargetPtr Allocate(size_t size) {
     size_ = UpperAlignValue(size_, word_size_);
     CHECK(size_ + size < capacity_)
-        << "cannot alloc " << size << " bytes in section with start_addr " <<
-        start_addr_.cast_to<void*>();
-    DevPtr alloc_addr = start_addr_ + size_;
+      << "cannot alloc " << size << " bytes in section \""
+      << section_name_ << "\" (start_addr=" << start_addr_.cast_to<void*>()
+      << ", used=" << size_ << ", capacity=" << capacity_ << ")";
+    TargetPtr alloc_addr = start_addr_ + size_;
     size_ += size;
-    alloc_map_[alloc_addr.value().val64] = size;
+    alloc_map_[alloc_addr.value().uint64()] = size;
     return alloc_addr;
   }
 
@@ -76,10 +78,10 @@ class MicroSectionAllocator {
    * \param offs offset to allocated memory
    * \note simple allocator scheme, more complex versions will be implemented later
    */
-  void Free(DevPtr addr) {
-    CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end())
+  void Free(TargetPtr addr) {
+    CHECK(alloc_map_.find(addr.value().uint64()) != alloc_map_.end())
       << "freed pointer was never allocated";
-    alloc_map_.erase(addr.value().val64);
+    alloc_map_.erase(addr.value().uint64());
     if (alloc_map_.empty()) {
       size_ = 0;
     }
@@ -88,17 +90,17 @@ class MicroSectionAllocator {
   /*!
    * \brief start offset of the memory region managed by this allocator
    */
-  DevPtr start_addr() const { return start_addr_; }
+  TargetPtr start_addr() const { return start_addr_; }
 
   /*!
    * \brief current end addr of the space being used in this memory region
    */
-  DevPtr curr_end_addr() const { return start_addr_ + size_; }
+  TargetPtr curr_end_addr() const { return start_addr_ + size_; }
 
   /*!
    * \brief end addr of the memory region managed by this allocator
    */
-  DevPtr max_addr() const { return start_addr_ + capacity_; }
+  TargetPtr max_addr() const { return start_addr_ + capacity_; }
 
   /*!
    * \brief size of the section
@@ -114,7 +116,7 @@ class MicroSectionAllocator {
   /*! \brief name of the section (for debugging) */
   std::string section_name_;
   /*! \brief start address of the section */
-  DevPtr start_addr_;
+  TargetPtr start_addr_;
   /*! \brief current size of the section */
   size_t size_;
   /*! \brief total storage capacity of the section */
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index e6cdea5d1c88..309fea45cd31 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -25,6 +25,7 @@
 #include <tvm/runtime/registry.h>
 #include <chrono>
 #include <memory>
+#include <locale>
 #include <stack>
 #include <tuple>
 #include <vector>
@@ -107,7 +108,7 @@ MicroSession::MicroSession(
     low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr);
     CHECK_EQ(reinterpret_cast<std::uintptr_t>(base_addr) % word_size_, 0)
       << "base address not aligned to " << word_size_ << " bytes";
-    DevPtr curr_addr = DevPtr(reinterpret_cast<std::uintptr_t>(base_addr));
+    TargetPtr curr_addr = TargetPtr(reinterpret_cast<std::uintptr_t>(base_addr));
 
     section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
       "text",
@@ -170,66 +171,56 @@ MicroSession::MicroSession(
     section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
       "text",
       DevMemRegion {
-        .start = DevPtr(text_start),
+        .start = TargetPtr(text_start),
         .size = text_size,
       }, word_size_);
     section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
       "rodata",
       DevMemRegion {
-        .start = DevPtr(rodata_start),
+        .start = TargetPtr(rodata_start),
         .size = rodata_size,
       }, word_size_);
     section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
       "data",
       DevMemRegion {
-        .start = DevPtr(data_start),
+        .start = TargetPtr(data_start),
         .size = data_size,
       }, word_size_);
     section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
       "bss",
       DevMemRegion {
-        .start = DevPtr(bss_start),
+        .start = TargetPtr(bss_start),
         .size = bss_size,
       }, word_size_);
     section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
       "args",
       DevMemRegion {
-        .start = DevPtr(args_start),
+        .start = TargetPtr(args_start),
         .size = args_size,
       }, word_size_);
     section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
       "heap",
       DevMemRegion {
-        .start = DevPtr(heap_start),
+        .start = TargetPtr(heap_start),
         .size = heap_size,
       }, word_size_);
     section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
       "workspace",
       DevMemRegion {
-        .start = DevPtr(workspace_start),
+        .start = TargetPtr(workspace_start),
         .size = workspace_size,
       }, word_size_);
     section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
       "stack",
       DevMemRegion {
-        .start = DevPtr(stack_start),
+        .start = TargetPtr(stack_start),
         .size = stack_size,
       }, word_size_);
   } else {
     LOG(FATAL) << "unsupported micro low-level device";
   }
 
-  std::cout << "[Memory Layout]" << std::endl;
-  std::cout << "  text (size = " << (section_allocators_[0]->capacity() / 1000.0) << " KB): " << section_allocators_[0]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  rodata (size = " << (section_allocators_[1]->capacity() / 1000.0) << " KB): " << section_allocators_[1]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  data (size = " << (section_allocators_[2]->capacity() / 1000.0) << " KB): " << section_allocators_[2]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  bss (size = " << (section_allocators_[3]->capacity() / 1000.0) << " KB): " << section_allocators_[3]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  args (size = " << (section_allocators_[4]->capacity() / 1000.0) << " KB): " << section_allocators_[4]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  heap (size = " << (section_allocators_[5]->capacity() / 1000.0) << " KB): " << section_allocators_[5]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  workspace (size = " << (section_allocators_[6]->capacity() / 1000.0) << " KB): " << section_allocators_[6]->start_addr().cast_to<void*>() << std::endl;
-  std::cout << "  stack (size = " << (section_allocators_[7]->capacity() / 1000.0) << " KB): " << section_allocators_[7]->start_addr().cast_to<void*>() << std::endl;
-
-  DevPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr();
+  TargetPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr();
   batch_args_encoder_.set_start_addr(args_start_addr);
 
   runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map;
@@ -237,17 +228,17 @@ MicroSession::MicroSession(
   // Patch pointers to define the bounds of the workspace section and the word
   // size (for allocation alignment).
   std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace);
-  TargetVal ws_start = ws_allocator->start_addr().value();
-  TargetVal ws_end = ws_allocator->max_addr().value();
-  TargetVal target_word_size { .val64 = word_size_ };
+  TargetVal ws_start(word_size_ * 8, ws_allocator->start_addr().value().uint64());
+  TargetVal ws_end(word_size_ * 8, ws_allocator->max_addr().value().uint64());
+  TargetVal target_word_size(word_size_ * 8, word_size_);
   if (word_size_ == 4) {
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32);
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32);
-    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint32());
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint32());
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint32());
   } else if (word_size_ == 8) {
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64);
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64);
-    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint64());
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint64());
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint64());
   }
 }
 
@@ -258,17 +249,16 @@ MicroSession::~MicroSession() {
   low_level_device_ = nullptr;
 }
 
-void MicroSession::PushToTaskQueue(DevPtr func_ptr, const TVMArgs& args) {
-  std::cout << "[MicroSession::PushToTaskQueue]" << std::endl;
-  std::cout << "  pushed func ptr: " << func_ptr.cast_to<void*>() << std::endl;
+void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) {
   if (thumb_mode_) {
-    func_ptr |= 1;
+    // TODO(areusch): should be |=
+    func_ptr += 1;
   }
-  DevVal func_dev_addr = func_ptr.value();
+  TargetVal func_dev_addr = func_ptr.value();
 
-  std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
-  DevVal arg_values_dev_addr = { .val64 = std::get<0>(arg_field_addrs).value() };
-  DevVal arg_type_codes_dev_addr = { .val64 = std::get<1>(arg_field_addrs).value() };
+  std::tuple<TargetPtr, TargetPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
+  TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).cast_to<uint64_t>()};
+  TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).cast_to<uint64_t>()};
 
   task_queue_.push_back(
       DevTask {
@@ -289,15 +279,15 @@ void MicroSession::FlushTaskQueue() {
     return;
   }
   if (word_size_ == 4) {
-    FlushTaskQueuePriv<UTVMTask32>();
+    FlushTaskQueuePriv<StructUTVMTask32>();
   } else if (word_size_ == 8) {
-    FlushTaskQueuePriv<UTVMTask64>();
+    FlushTaskQueuePriv<StructUTVMTask64>();
   }
 }
 
 template <typename T>
 void MicroSession::FlushTaskQueuePriv() {
-  std::cout << "[MicroSession::FlushTaskQueue]" << std::endl;
+  // std::cout << "[MicroSession::FlushTaskQueue]" << std::endl;
   std::vector<T> prepped_tasks;
   for (const auto& task : task_queue_) {
     prepped_tasks.push_back(T(task));
@@ -310,102 +300,65 @@ void MicroSession::FlushTaskQueuePriv() {
       batch_args_encoder_.buf_size());
 
   // Flush `tasks` to device memory.
-  DevPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
+//  runtime_symbol_map_.Dump(std::cout);
+  TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
   low_level_device()->Write(
       dev_tasks_addr,
       reinterpret_cast<void*>(prepped_tasks.data()),
       prepped_tasks.size() * sizeof(T));
   DevSymbolWrite<uint32_t>(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size());
 
-  DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
-  DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
+  TargetPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
+  TargetPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
   if (thumb_mode_) {
-    utvm_init_addr |= 1;
+    // TODO(areusch): should be |=
+    utvm_init_addr += 1;
   }
 
   std::chrono::time_point<
     std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend;
   tbegin = std::chrono::high_resolution_clock::now();
-  // std::cout << "  do execution things: ";
-  // char tmp;
-  // std::cin >> tmp;
+  // std::string tmp;
+  // while (tmp[0] != 'd' && tmp[0] != 'e') {
+  //   std::cout << "How to proceed? [Debug / Execute] ";
+  //   getline(std::cin, tmp);
+  //   CHECK(std::cin.good()) << "Stdin closed";
+  //   tmp[0] = std::tolower(tmp[0]);
+  // }
+  // if (tmp[0] == 'd') {
+  //   std::cout << "Launch debugger; [Enter] to resume automated execution";
+  //   getline(std::cin, tmp);
+  // } else {
   low_level_device()->Execute(utvm_init_addr, utvm_done_addr);
+  // }
   tend = std::chrono::high_resolution_clock::now();
 
   // Check if there was an error during execution.  If so, log it.
   CheckDeviceError();
-  uint32_t task_time = DevSymbolRead<uint32_t>(runtime_symbol_map_, "utvm_task_time");
-  GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr);
-  return static_cast<double>(task_time);
-}
-
-BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
-  DevMemRegion text_section;
-  DevMemRegion rodata_section;
-  DevMemRegion data_section;
-  DevMemRegion bss_section;
-
-  text_section.size = GetSectionSize(
-      binary_path, SectionKind::kText, toolchain_prefix_, word_size_);
-  rodata_section.size = GetSectionSize(
-      binary_path, SectionKind::kRodata, toolchain_prefix_, word_size_);
-  data_section.size = GetSectionSize(
-      binary_path, SectionKind::kData, toolchain_prefix_, word_size_);
-  bss_section.size = GetSectionSize(
-      binary_path, SectionKind::kBss, toolchain_prefix_, word_size_);
-
-  text_section.start = AllocateInSection(SectionKind::kText, text_section.size);
-  rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
-  data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
-  bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
-  CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
-        data_section.start != nullptr && bss_section.start != nullptr)
-      << "not enough space to load module on device";
-
-  std::string relocated_bin = RelocateBinarySections(
-      binary_path,
-      word_size_,
-      text_section.start,
-      rodata_section.start,
-      data_section.start,
-      bss_section.start,
-      GetAllocator(SectionKind::kStack)->max_addr(),
-      toolchain_prefix_);
-  std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_);
-  std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_);
-  std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_);
-  std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_);
-
-  low_level_device_->Write(text_section.start, &text_contents[0], text_section.size);
-  low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
-  low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
-  low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
-  SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
-
-  if (patch_dylib_pointers) {
-    // Patch device lib pointers.
-    PatchImplHole(symbol_map, "TVMBackendAllocWorkspace");
-    PatchImplHole(symbol_map, "TVMBackendFreeWorkspace");
-    PatchImplHole(symbol_map, "TVMAPISetLastError");
-  }
 
   if (use_device_timer_) {
     uint64_t sum = 0;
     std::vector<uint32_t> times;
     times.resize(task_queue_.size());
-    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t));
+    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
+                             times.data(),
+                             task_queue_.size() * sizeof(uint32_t));
+    int i = 0;
     for (uint32_t time : times) {
+      LOG(INFO) << "Time " << i++ << ": " << time;
       sum += time;
     }
-    last_batch_time_ += static_cast<double>(sum);
+    last_batch_time_ += static_cast<double>(sum) / 1e3;
   } else {
     last_batch_time_ += std::chrono::duration_cast<std::chrono::duration<double> >
         (tend - tbegin).count() * 1000;
-    // TODO fukn hack
+    // TODO(weberlo): Reading internal data structure is hacky.
     uint64_t sum = 0;
     std::vector<uint32_t> times;
     times.resize(task_queue_.size());
-    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t));
+    low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
+                             times.data(),
+                             task_queue_.size() * sizeof(uint32_t));
     for (uint32_t time : times) {
       sum += time;
     }
@@ -417,7 +370,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
 }
 
 BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
-  std::cout << "[MicroSession::LoadBinary]" << std::endl;
   DevMemRegion text_section;
   DevMemRegion rodata_section;
   DevMemRegion data_section;
@@ -431,10 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
       binary_path, SectionKind::kData, toolchain_prefix_, word_size_);
   bss_section.size = GetSectionSize(
       binary_path, SectionKind::kBss, toolchain_prefix_, word_size_);
-  std::cout << "  text_section.size: " << text_section.size << std::endl;
-  std::cout << "  rodata_section.size: " << rodata_section.size << std::endl;
-  std::cout << "  data_section.size: " << data_section.size << std::endl;
-  std::cout << "  bss_section.size: " << bss_section.size << std::endl;
 
   text_section.start = AllocateInSection(SectionKind::kText, text_section.size);
   rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
@@ -480,9 +428,8 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
   };
 }
 
-std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
+std::tuple<TargetPtr, TargetPtr> MicroSession::EncoderAppend(
     TargetDataLayoutEncoder* encoder, const TVMArgs& args) {
-  std::cout << "[MicroSession::EncoderAppend(TVMArgs)]" << std::endl;
   const int* type_codes = args.type_codes;
   int num_args = args.num_args;
 
@@ -529,7 +476,7 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
 }
 
 template <typename T>
-DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) {
+TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) {
   auto tvm_arr_slot = encoder->Alloc<T>();
   auto shape_slot = encoder->Alloc<int64_t>(arr.ndim);
 
@@ -537,8 +484,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
   // the device first. The `data` field is already allocated on the device and
   // is a device pointer, so we don't need to write it.
   shape_slot.WriteArray(arr.shape, arr.ndim);
-  DevPtr shape_dev_addr = shape_slot.start_addr();
-  DevPtr strides_dev_addr = DevPtr(nullptr);
+  TargetPtr shape_dev_addr = shape_slot.start_addr();
+  TargetPtr strides_dev_addr = TargetPtr(nullptr);
   if (arr.strides != nullptr) {
     auto stride_slot = encoder->Alloc<int64_t>(arr.ndim);
     stride_slot.WriteArray(arr.strides, arr.ndim);
@@ -546,13 +493,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
   }
 
   T dev_arr(
-      TargetVal { .val64 = reinterpret_cast<uint64_t>(arr.data) },
+      TargetVal { word_size_ * 8, reinterpret_cast<uint64_t>(arr.data) },
       arr.ctx,
       arr.ndim,
       arr.dtype,
       shape_dev_addr.value(),
       strides_dev_addr.value(),
-      TargetVal { .val64 = arr.byte_offset });
+      TargetVal { word_size_ * 8, arr.byte_offset });
   CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev))
     << "attempt to write DLTensor with non-micro device type";
   // Update the device type to CPU, because from the microcontroller's
@@ -575,7 +522,7 @@ void MicroSession::CheckDeviceError() {
       return;
     }
     std::string err_msg;
-    switch(last_error) {
+    switch (last_error) {
       case UTVM_ERR_NOT_FINISHED:
         err_msg = "execution timed out";
         break;
@@ -583,7 +530,7 @@ void MicroSession::CheckDeviceError() {
         err_msg = "timer is not implemented for the target device";
         break;
       case UTVM_ERR_TIMER_OVERFLOW:
-        // TODO this should be remedied by using interrupts to accumulate the
+        // TODO(weberlo): this should be remedied by using interrupts to accumulate the
         // timer into a larger datatype (ARM timers are only 24 bits)
         err_msg = "timer overflowed during execution";
         break;
@@ -616,20 +563,20 @@ void MicroSession::CheckDeviceError() {
 }
 
 void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) {
-  DevPtr runtime_impl_addr = runtime_symbol_map_[func_name];
+  TargetPtr runtime_impl_addr = runtime_symbol_map_[func_name];
   if (thumb_mode_) {
     runtime_impl_addr += 1;
   }
   std::ostringstream func_name_underscore;
   func_name_underscore << func_name << "_";
   if (word_size_ == 4) {
-    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32);
+    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint32());
   } else if (word_size_ == 8) {
-    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64);
+    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint64());
   }
 }
 
-std::string MicroSession::ReadString(DevPtr str_addr) {
+std::string MicroSession::ReadString(TargetPtr str_addr) {
   std::ostringstream result;
   const size_t buf_size = 256;
   std::vector<char> buf(buf_size, 0);
@@ -647,29 +594,17 @@ std::string MicroSession::ReadString(DevPtr str_addr) {
   return result.str();
 }
 
-DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
-  if (type == SectionKind::kHeap) {
-    std::cout << "[MicroSession::AllocateInSection(Heap)]" << std::endl;
-    std::cout << "  allocating " << std::dec << size << " hex=" << (void*) size << " bytes" << std::endl;
-  }
-  DevPtr result = GetAllocator(type)->Allocate(size);
-  if (type == SectionKind::kHeap) {
-    std::cout << "  allocated at addr " << result.cast_to<void*>() << std::endl;
-  }
-  return result;
+TargetPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
+  return GetAllocator(type)->Allocate(size);
 }
 
-void MicroSession::FreeInSection(SectionKind type, DevPtr addr) {
-  if (type == SectionKind::kHeap) {
-    std::cout << "[MicroSession::FreeInSection]" << std::endl;
-    std::cout << "  freeing alloc at addr " << addr.cast_to<void*>() << std::endl;
-  }
+void MicroSession::FreeInSection(SectionKind type, TargetPtr addr) {
   return GetAllocator(type)->Free(addr);
 }
 
 template <typename T>
 T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) {
-  DevPtr sym_addr = symbol_map[symbol];
+  TargetPtr sym_addr = symbol_map[symbol];
   T result;
   low_level_device()->Read(sym_addr, &result, sizeof(T));
   return result;
@@ -679,7 +614,7 @@ template <typename T>
 void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
                                   const std::string& symbol,
                                   const T& value) {
-  DevPtr sym_addr = symbol_map[symbol];
+  TargetPtr sym_addr = symbol_map[symbol];
   low_level_device()->Write(sym_addr, &value, sizeof(T));
 }
 
@@ -694,12 +629,12 @@ PackedFunc MicroSession::GetFunction(
     return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) {
       MicroSession::ExitWithScope();
     });
-    // TODO add a `clear_batch_timer` func
+    // TODO(weberlo): add a `clear_batch_timer` func
   } else if (name == "get_last_batch_time") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       *rv = this->GetLastBatchTime();
     });
-    // TODO remove this func
+    // TODO(weberlo): remove this func
   } else if (name == "get_last_batch_cycles") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       *rv = this->GetLastBatchCycles();
@@ -716,22 +651,22 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
     const std::string& binary_path = args[1];
     const std::string& toolchain_prefix = args[2];
     uint64_t text_start = args[3];
-    size_t text_size = args[4];
+    size_t text_size = uint64_t(args[4]);
     uint64_t rodata_start = args[5];
-    size_t rodata_size = args[6];
+    size_t rodata_size = uint64_t(args[6]);
     uint64_t data_start = args[7];
-    size_t data_size = args[8];
+    size_t data_size = uint64_t(args[8]);
     uint64_t bss_start = args[9];
-    size_t bss_size = args[10];
+    size_t bss_size = uint64_t(args[10]);
     uint64_t args_start = args[11];
-    size_t args_size = args[12];
+    size_t args_size = uint64_t(args[12]);
     uint64_t heap_start = args[13];
-    size_t heap_size = args[14];
+    size_t heap_size = uint64_t(args[14]);
     uint64_t workspace_start = args[15];
-    size_t workspace_size = args[16];
+    size_t workspace_size = uint64_t(args[16]);
     uint64_t stack_start = args[17];
-    size_t stack_size = args[18];
-    size_t word_size = args[19];
+    size_t stack_size = uint64_t(args[18]);
+    size_t word_size = uint64_t(args[19]);
     bool thumb_mode = args[20];
     bool use_device_timer = args[21];
     const std::string& server_addr = args[22];
diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h
index 00e5bcdd7e01..f7b05ae211e9 100644
--- a/src/runtime/micro/micro_session.h
+++ b/src/runtime/micro/micro_session.h
@@ -143,7 +143,7 @@ class MicroSession : public ModuleNode {
    * \param args args to the packed function
    * \return elapsed time during function execution on the device
    */
-  double PushToTaskQueue(DevPtr func, const TVMArgs& args);
+  void PushToTaskQueue(TargetPtr func, const TVMArgs& args);
 
   /*!
    * \brief serialize runtime metadata to the device for enqueued tasks and execute
@@ -171,21 +171,21 @@ class MicroSession : public ModuleNode {
    * \param size size of allocated memory in bytes
    * \return pointer to allocated memory region in section, nullptr if out of space
    */
-  DevPtr AllocateInSection(SectionKind type, size_t size);
+  TargetPtr AllocateInSection(SectionKind type, size_t size);
 
   /*!
    * \brief free prior allocation from section
    * \param type type of section to allocate in
    * \param addr device address of allocated memory
    */
-  void FreeInSection(SectionKind type, DevPtr addr);
+  void FreeInSection(SectionKind type, TargetPtr addr);
 
   /*!
    * \brief read string from device to host
    * \param str_addr device address of first character of string
    * \return host copy of device string that was read
    */
-  std::string ReadString(DevPtr str_addr);
+  std::string ReadString(TargetPtr str_addr);
 
   /*!
   * \brief read value of symbol from device memory
@@ -234,8 +234,6 @@ class MicroSession : public ModuleNode {
   /*! \brief array of memory allocators for each on-device section */
   std::shared_ptr<MicroSectionAllocator>
       section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
-  /*! \brief total number of bytes of usable device memory for this session */
-  size_t memory_size_;
   /*! \brief number of bytes in a word on the target device */
   size_t word_size_;
   /*! \brief whether the target device requires a thumb-mode bit on function addresses
@@ -272,7 +270,8 @@ class MicroSession : public ModuleNode {
    * \param args args to be appended
    * \return device address of the allocated args
    */
-  std::tuple<DevPtr, DevPtr> EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args);
+  std::tuple<TargetPtr, TargetPtr> EncoderAppend(TargetDataLayoutEncoder* encoder,
+                                                 const TVMArgs& args);
 
   /*!
    * \brief appends a `DLTensor` to the host-side buffer of `encoder`
@@ -335,15 +334,15 @@ struct TVMArray32 {
       TargetVal shape,
       TargetVal strides,
       TargetVal byte_offset)
-    : data(data.val32),
+    : data(data.uint32()),
       ctx(ctx),
       ndim(ndim),
       pad0(0),
       dtype(dtype),
-      shape(shape.val32),
-      strides(strides.val32),
+      shape(shape.uint32()),
+      strides(strides.uint32()),
       pad1(0),
-      byte_offset(byte_offset.val32),
+      byte_offset(byte_offset.uint32()),
       pad2(0) { }
 
   /*!
@@ -385,14 +384,14 @@ struct TVMArray64 {
       TargetVal shape,
       TargetVal strides,
       TargetVal byte_offset)
-    : data(data.val64),
+    : data(data.uint64()),
       ctx(ctx),
       ndim(ndim),
       pad0(0),
       dtype(dtype),
-      shape(shape.val64),
-      strides(strides.val64),
-      byte_offset(byte_offset.val64) { }
+      shape(shape.uint64()),
+      strides(strides.uint64()),
+      byte_offset(byte_offset.uint64()) { }
   /*!
    * \brief The opaque data pointer points to the allocated data.
    *  This will be CUDA device pointer or cl_mem handle in OpenCL.
@@ -421,11 +420,11 @@ struct TVMArray64 {
 /*! \brief MicroTVM task to store in task queue before specializing to word size */
 struct DevTask {
   /*! \brief Pointer to function to call for this task */
-  DevVal func;
+  TargetVal func;
   /*! \brief Array of argument values */
-  DevVal arg_values;
+  TargetVal arg_values;
   /*! \brief Array of type codes for each argument value */
-  DevVal arg_type_codes;
+  TargetVal arg_type_codes;
   /*! \brief Number of arguments */
   int32_t num_args;
 };
@@ -433,9 +432,9 @@ struct DevTask {
 /*! \brief MicroTVM task for serialization to 32-bit devices */
 typedef struct StructUTVMTask32 {
   StructUTVMTask32(DevTask task)
-    : func(task.func.val32),
-      arg_values(task.arg_values.val32),
-      arg_type_codes(task.arg_type_codes.val32),
+    : func(task.func.uint32()),
+      arg_values(task.arg_values.uint32()),
+      arg_type_codes(task.arg_type_codes.uint32()),
       num_args(task.num_args) { }
 
   /*! \brief Pointer to function to call for this task */
@@ -451,9 +450,9 @@ typedef struct StructUTVMTask32 {
 /*! \brief MicroTVM task for serialization to 64-bit devices */
 typedef struct StructUTVMTask64 {
   StructUTVMTask64(DevTask task)
-    : func(task.func.val64),
-      arg_values(task.arg_values.val64),
-      arg_type_codes(task.arg_type_codes.val64),
+    : func(task.func.uint64()),
+      arg_values(task.arg_values.uint64()),
+      arg_type_codes(task.arg_type_codes.uint64()),
       num_args(task.num_args) { }
 
   /*! \brief Pointer to function to call for this task */
diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc
index e5c83e590c36..d1f279f6120f 100644
--- a/src/runtime/micro/openocd_low_level_device.cc
+++ b/src/runtime/micro/openocd_low_level_device.cc
@@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     socket_.SendCommand();
   }
 
-  void Read(DevPtr addr, void* buf, size_t num_bytes) {
+  void Read(TargetPtr addr, void* buf, size_t num_bytes) {
     if (num_bytes == 0) {
       return;
     }
@@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     }
 
     {
-      socket_.cmd_builder() << "ocd_echo $output";
+      socket_.cmd_builder() << "return $output";
       socket_.SendCommand();
       const std::string& reply = socket_.last_reply();
 
@@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     }
   }
 
-  void Write(DevPtr addr, const void* buf, size_t num_bytes) {
+  void Write(TargetPtr addr, const void* buf, size_t num_bytes) {
     if (num_bytes == 0) {
       return;
     }
@@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     }
   }
 
-  void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
+  void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) {
     socket_.cmd_builder() << "halt 0";
     socket_.SendCommand();
 
@@ -210,9 +210,9 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
   // NOTE: OpenOCD will call any request larger than this constant an "absurd
   // request".
   /*! \brief maximum number of bytes allowed in a single memory transfer */
-  static const constexpr ssize_t kMemTransferLimit = 64000;
+  static const constexpr ssize_t kMemTransferLimit = 8000;
   /*! \brief number of milliseconds to wait for function execution to halt */
-  static const constexpr int kWaitTime = 10000;
+  static const constexpr int kWaitTime = 30000;
 };
 
 const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& server_addr,
diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h
index 0abd269eedd4..ea682e261348 100644
--- a/src/runtime/micro/target_data_layout_encoder.h
+++ b/src/runtime/micro/target_data_layout_encoder.h
@@ -50,7 +50,7 @@ class TargetDataLayoutEncoder {
      * \param size size (in bytes) of the memory region allocated for this slot
      * \param start_addr start address of the slot in the device's memory
      */
-    Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr);
+    Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, TargetPtr start_addr);
 
     ~Slot();
 
@@ -71,7 +71,7 @@ class TargetDataLayoutEncoder {
      * \brief returns start address of the slot in device memory
      * \return device start address
      */
-    DevPtr start_addr();
+    TargetPtr start_addr();
 
     /*!
      * \brief returns number of bytes allocated for this slot
@@ -89,7 +89,7 @@ class TargetDataLayoutEncoder {
     /*! \brief size (in bytes) of the memory region allocated for this slot */
     size_t size_;
     /*! \brief start address of the slot in the device's memory */
-    DevPtr start_addr_;
+    TargetPtr start_addr_;
   };
 
   /*!
@@ -139,17 +139,14 @@ class TargetDataLayoutEncoder {
     return buf_.size();
   }
 
-  /*!
-   * \brief TODO
-   */
-  DevPtr start_addr() const {
-    CHECK_NE(start_addr_.value().val64, 0) << "start addr uninitialized";
+  TargetPtr start_addr() const {
+    CHECK_NE(start_addr_.value().uint64(), 0) << "start addr uninitialized";
     return start_addr_;
   }
 
-  void set_start_addr(DevPtr start_addr) {
+  void set_start_addr(TargetPtr start_addr) {
     CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty";
-    start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_));
+    start_addr_ = TargetPtr(UpperAlignValue(start_addr.value().uint64(), word_size_));
   }
 
  private:
@@ -158,7 +155,7 @@ class TargetDataLayoutEncoder {
   /*! \brief current offset */
   size_t curr_offset_;
   /*! \brief start address of the encoder in device memory */
-  DevPtr start_addr_;
+  TargetPtr start_addr_;
   /*! \brief TODO */
   size_t capacity_;
   /*! \brief number of bytes in a word on the target device */
@@ -169,7 +166,7 @@ template <typename T>
 TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
                                        size_t start_offset,
                                        size_t size,
-                                       DevPtr start_addr)
+                                       TargetPtr start_addr)
     : parent_(parent),
       start_offset_(start_offset),
       curr_offset_(0),
@@ -178,8 +175,10 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
 
 template <typename T>
 TargetDataLayoutEncoder::Slot<T>::~Slot() {
-  // TODO this can mask the exception thrown by slot allocation... even though that doesn't make sense.
-  CHECK(curr_offset_ == size_) << "unwritten space in slot";
+  // TODO(areusch): this can mask the exception thrown by slot allocation... even though that
+  // doesn't make sense.
+  CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset="
+                               << curr_offset_ << ", size=" << size_;
 }
 
 template <typename T>
@@ -198,7 +197,7 @@ void TargetDataLayoutEncoder::Slot<T>::WriteValue(const T& val) {
 }
 
 template <typename T>
-DevPtr TargetDataLayoutEncoder::Slot<T>::start_addr() {
+TargetPtr TargetDataLayoutEncoder::Slot<T>::start_addr() {
   return start_addr_;
 }
 
diff --git a/src/runtime/micro/tcl_socket.cc b/src/runtime/micro/tcl_socket.cc
index d0bb6007f815..24abe42f786f 100644
--- a/src/runtime/micro/tcl_socket.cc
+++ b/src/runtime/micro/tcl_socket.cc
@@ -42,9 +42,8 @@ void TclSocket::Connect(tvm::support::SockAddr addr) {
 }
 
 void TclSocket::SendCommand() {
-  //std::cout << "[TclSocket::SendCommand]" << std::endl;
-  //std::cout << "  cmd: " << cmd_builder_.str() << std::endl;
-  cmd_builder_ << kCommandTerminateToken;
+  const char terminate_token = kCommandTerminateToken;
+  cmd_builder_ << terminate_token;
   std::string full_cmd = cmd_builder_.str();
 
   CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1)
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 7556c4915b9c..9b5612efc690 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -36,43 +36,13 @@
 #include <algorithm>
 #include "rpc_session.h"
 #include "../object_internal.h"
-#include "../../common/ring_buffer.h"
-#include "../../common/socket.h"
+#include "../../support/ring_buffer.h"
+#include "../../support/socket.h"
 #include "../micro/micro_session.h"
 
 namespace tvm {
 namespace runtime {
 
-std::string RPCCodeToString(RPCCode code) {
-  switch (code) {
-    case RPCCode::kNone: return "None";
-    case RPCCode::kCallFunc: return "CallFunc";
-    case RPCCode::kReturn: return "Return";
-    case RPCCode::kException: return "Exception";
-    case RPCCode::kShutdown: return "Shutdown";
-    case RPCCode::kCopyFromRemote: return "CopyFromRemote";
-    case RPCCode::kCopyToRemote: return "CopyToRemote";
-    case RPCCode::kCopyAck: return "CopyAck";
-    case RPCCode::kSystemFuncStart: return "SystemFuncStart";
-    case RPCCode::kGetGlobalFunc: return "GetGlobalFunc";
-    case RPCCode::kGetTimeEvaluator: return "GetTimeEvaluator";
-    case RPCCode::kFreeFunc: return "FreeFunc";
-    case RPCCode::kDevSetDevice: return "DevSetDevice";
-    case RPCCode::kDevGetAttr: return "DevGetAttr";
-    case RPCCode::kDevAllocData: return "DevAllocData";
-    case RPCCode::kDevFreeData: return "DevFreeData";
-    case RPCCode::kDevStreamSync: return "DevStreamSync";
-    case RPCCode::kCopyAmongRemote: return "CopyAmongRemote";
-    case RPCCode::kModuleLoad: return "ModuleLoad";
-    case RPCCode::kModuleImport: return "ModuleImport";
-    case RPCCode::kModuleFree: return "ModuleFree";
-    case RPCCode::kModuleGetFunc: return "ModuleGetFunc";
-    case RPCCode::kModuleGetSource: return "ModuleGetSource";
-    case RPCCode::kNDArrayFree: return "NDArrayFree";
-    default: CHECK(false) << "invalid RPC code";
-  }
-}
-
 // Temp buffer for data array
 struct RPCByteArrayBuffer {
   TVMByteArray arr;
@@ -929,12 +899,6 @@ void RPCSession::Init() {
       &reader_, &writer_, table_index_, name_, &remote_key_);
   // Quick function to call remote.
   call_remote_ = PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
-      std::cout << "[RPCSession::call_remote_]" << std::endl;
-      if (args.type_codes[0] == kTVMContext) {
-        const TVMContext ctx = args[0];
-        std::cout << "  ctx.device_type: " << ctx.device_type << std::endl;
-        std::cout << "  ctx.device_id: " << ctx.device_id << std::endl;
-      }
       handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true);
       RPCCode code = HandleUntilReturnEvent(rv, true, nullptr);
       CHECK(code == RPCCode::kReturn) << "code=" << static_cast<int>(code);
@@ -1117,10 +1081,7 @@ void RPCDevSetDevice(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
-  std::cout << "[RPCDevGetAttr]" << std::endl;
   TVMContext ctx = args[0];
-  std::cout <<  "  ctx.device_type: " << ctx.device_type << std::endl;
-  std::cout <<  "  ctx.device_id: " << ctx.device_id << std::endl;
   DeviceAttrKind kind = static_cast<DeviceAttrKind>(args[1].operator int());
   if (kind == kExist) {
     DeviceAPI* api = DeviceAPI::Get(ctx, true);
@@ -1136,10 +1097,7 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) {
-  std::cout <<  "[RPCDevAllocData]" << std::endl;
   TVMContext ctx = args[0];
-  std::cout <<  "  ctx.device_type: " << ctx.device_type << std::endl;
-  std::cout <<  "  ctx.device_id: " << ctx.device_id << std::endl;
   uint64_t nbytes = args[1];
   uint64_t alignment = args[2];
   DLDataType type_hint = args[3];
@@ -1161,18 +1119,13 @@ void RPCDevStreamSync(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
-  std::cout << "[RPCCopyAmongRemote]" << std::endl;
   void* from = args[0];
   uint64_t from_offset = args[1];
   void* to = args[2];
   uint64_t to_offset = args[3];
   uint64_t size = args[4];
   TVMContext ctx_from = args[5];
-  std::cout << "  ctx_from.device_type: " << ctx_from.device_type << std::endl;
-  std::cout << "  ctx_from.device_id: " << ctx_from.device_type << std::endl;
   TVMContext ctx_to = args[6];
-  std::cout << "  ctx_to.device_type: " << ctx_to.device_type << std::endl;
-  std::cout << "  ctx_to.device_id: " << ctx_to.device_type << std::endl;
   DLDataType type_hint = args[7];
   TVMStreamHandle stream = args[8];
   TVMContext ctx = ctx_from;
@@ -1183,12 +1136,10 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
           ctx_to.device_type == ctx_from.device_type)
         << "Can not copy across different ctx types directly";
   }
-  std::cout << "  before CopyDataFromTo" << std::endl;
   DeviceAPI::Get(ctx)->CopyDataFromTo(
       from, from_offset,
       to, to_offset,
       size, ctx_from, ctx_to, type_hint, stream);
-  std::cout << "  after CopyDataFromTo" << std::endl;
 }
 
 void RPCModuleLoad(TVMArgs args, TVMRetValue *rv) {
@@ -1250,7 +1201,6 @@ void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
 }
 
 void RPCSession::EventHandler::HandlePackedCall() {
-  std::cout << "[RPCSession::EventHandler::HandlePackedCall]" << std::endl;
   CHECK_EQ(pending_request_bytes_, 0U);
   if (code_ == RPCCode::kReturn) {
     state_ = kReturnReceived; return;
@@ -1259,7 +1209,6 @@ void RPCSession::EventHandler::HandlePackedCall() {
   state_ = kRecvCode;
   this->RequestBytes(sizeof(RPCCode));
   // Event handler sit at clean state at this point.
-  std::cout << "  RPC code is " << static_cast<int>(code_) << "(" << RPCCodeToString(code_) << ")" << std::endl;
   switch (code_) {
     case RPCCode::kCallFunc: {
       PackedFunc* pf = reinterpret_cast<PackedFunc*>(call_handle_);
@@ -1274,7 +1223,6 @@ void RPCSession::EventHandler::HandlePackedCall() {
       std::ostringstream os;
       os << "Except caught from RPC call: " << arg_buf_->value[0].v_str;
       arg_buf_.reset();
-      std::cout << os.str() << std::endl;
       throw dmlc::Error(os.str());
       break;
     }
@@ -1302,10 +1250,10 @@ void RPCSession::EventHandler::HandlePackedCall() {
 PackedFunc MicroTimeEvaluator(
     PackedFunc pf,
     TVMContext ctx,
-    int number,
+    size_t number,
     int repeat,
     int min_repeat_ms) {
-  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
+  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
 
@@ -1313,13 +1261,12 @@ PackedFunc MicroTimeEvaluator(
       // start timing
       CHECK(number < MicroSession::kTaskQueueCapacity)
         << "`number` must be less than uTVM task queue capacity";
-      for (int i = 0; i < number; ++i) {
+      for (unsigned int j = 0; j < number; ++j) {
         pf.CallPacked(args, &temp);
       }
       ObjectPtr<MicroSession> session = MicroSession::Current();
       DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
       double time_per_batch = session->GetLastBatchTime() / number;
-      std::cout << "LAST AVERAGE BATCH TIME WAS " << time_per_batch << std::endl;
       os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
     }
     std::string blob = os.str();
@@ -1337,12 +1284,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf,
                              int number,
                              int repeat,
                              int min_repeat_ms) {
-  std::cout << "[WrapTimeEvaluator]" << std::endl;
   if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
-    std::cout << "  USING MICRO TIME EVAL" << std::endl;
     return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms);
   }
-  std::cout << "  USING NORMAL TIME EVAL" << std::endl;
 
   auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 1bb75357e030..db63be4be74d 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -63,31 +63,31 @@ struct RPCArgBuffer;
 
 /*! \brief The RPC code */
 enum class RPCCode : int {
-  kNone = 0,
-  kCallFunc = 1,
-  kReturn = 2,
-  kException = 3,
-  kShutdown = 4,
-  kCopyFromRemote = 5,
-  kCopyToRemote = 6,
-  kCopyAck = 7,
+  kNone,
+  kCallFunc,
+  kReturn,
+  kException,
+  kShutdown,
+  kCopyFromRemote,
+  kCopyToRemote,
+  kCopyAck,
   // The following are code that can send over CallRemote
-  kSystemFuncStart = 8,
-  kGetGlobalFunc = 9,
-  kGetTimeEvaluator = 10,
-  kFreeFunc = 11,
-  kDevSetDevice = 12,
-  kDevGetAttr = 13,
-  kDevAllocData = 14,
-  kDevFreeData = 15,
-  kDevStreamSync = 16,
-  kCopyAmongRemote = 17,
-  kModuleLoad = 18,
-  kModuleImport = 19,
-  kModuleFree = 20,
-  kModuleGetFunc = 21,
-  kModuleGetSource = 22,
-  kNDArrayFree = 23
+  kSystemFuncStart,
+  kGetGlobalFunc,
+  kGetTimeEvaluator,
+  kFreeFunc,
+  kDevSetDevice,
+  kDevGetAttr,
+  kDevAllocData,
+  kDevFreeData,
+  kDevStreamSync,
+  kCopyAmongRemote,
+  kModuleLoad,
+  kModuleImport,
+  kModuleFree,
+  kModuleGetFunc,
+  kModuleGetSource,
+  kNDArrayFree
 };
 
 /*!
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 0c4404f515c0..adb84e498e5d 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -840,12 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) {
     const VarNode* v = op->node.as<VarNode>();
     CHECK(v);
     volatile_buf_.insert(v);
-  } else if (op->attr_key == ir::attr::pragma_import_c) {
-    const StringImm* value = op->value.as<StringImm>();
+  } else if (op->attr_key == tir::attr::pragma_import_c) {
+    const StringImmNode* value = op->value.as<StringImmNode>();
     CHECK(value != nullptr);
     decl_stream << value->value;
-    //this->HandleImport(value->value);
-    //this->VisitStmt(op->body);
   }
   this->PrintStmt(op->body);
 }
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index c356ac816957..4cca1efea2ca 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -20,25 +20,25 @@
 /*!
  * \file codegen_c_host.cc
  */
-#include <tvm/target/codegen.h>
+#include "codegen_c_host.h"
 #include <vector>
 #include <string>
-#include "codegen_c_host.h"
+#include "tvm/target/codegen.h"
 #include "../build_common.h"
 
 namespace tvm {
 namespace codegen {
 
-// TODO rename to CodeGenCMicro?
 CodeGenCHost::CodeGenCHost() {
   module_name_ = GetUniqueName("__tvm_module_ctx");
 }
 
 void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) {
   emit_asserts_ = emit_asserts;
+  declared_globals_.clear();
   decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
   decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
-  decl_stream << "extern void* " << module_name_ << " = NULL;\n";
+  decl_stream << "void* " << module_name_ << " = NULL;\n";
   CodeGenC::Init(output_ssa);
 }
 
@@ -183,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT(
     int64_t num_args = end - begin;
     CHECK_GE(num_args, 0);
     std::string func_name = s->value;
-    std::string packed_func_name = GetUniqueName(func_name + "_packed");
-    decl_stream << "static void* " << packed_func_name << " = NULL;\n";
+    // NOTE: cannot rely on GetUnique for global decl_stream declarations
+    // because it is reset between AddFunction().
+    std::string packed_func_name = func_name + "_packed";
+    if (declared_globals_.insert(packed_func_name).second) {
+      // Still reserve the name among unique names.
+      CHECK(GetUniqueName(packed_func_name) == packed_func_name) <<
+        "Expected name " << packed_func_name << " to not be taken";
+      decl_stream << "static void* " << packed_func_name << " = NULL;\n";
+    }
     this->PrintGetFuncFromBackend(func_name, packed_func_name);
     this->PrintFuncCall(packed_func_name, num_args);
   } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
@@ -242,7 +249,7 @@ runtime::Module BuildCHost(IRModule mod) {
   CodeGenCHost cg;
   cg.Init(output_ssa, emit_asserts);
 
-  for (auto kv :  mod->functions) {
+  for (auto kv : mod->functions) {
     CHECK(kv.second->IsInstance<PrimFuncNode>())
         << "CodegenCHost: Can only take PrimFunc";
     auto f = Downcast<PrimFunc>(kv.second);
@@ -255,7 +262,7 @@ runtime::Module BuildCHost(IRModule mod) {
 
 TVM_REGISTER_GLOBAL("target.build.c")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-   *rv = BuildCHost(args[0]);
- });
+  *rv = BuildCHost(args[0]);
+});
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index f854b7abf68c..bec96860ad47 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -24,9 +24,10 @@
 #ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
 #define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
 
-#include <tvm/target/codegen.h>
-#include <tvm/tir/expr.h>
+#include <set>
 #include <string>
+#include "tvm/target/codegen.h"
+#include "tvm/tir/expr.h"
 #include "codegen_c.h"
 
 namespace tvm {
@@ -36,8 +37,6 @@ class CodeGenCHost final : public CodeGenC {
  public:
   CodeGenCHost();
   void Init(bool output_ssa, bool emit_asserts);
-  void AddFunction(LoweredFunc f);
-  std::string Finish();
 
   void PrintType(DataType t, std::ostream& os) final; // NOLINT(*)
   void PrintFuncPrefix() final; // NOLINT(*)
@@ -55,6 +54,8 @@ class CodeGenCHost final : public CodeGenC {
 
  private:
   std::string module_name_;
+  /* \brief tracks declared global variables which live despite GetUniqueName */
+  std::set<std::string> declared_globals_;
   /*! \brief whether to emit asserts in the resulting C code */
   bool emit_asserts_;
 
diff --git a/src/target/target.cc b/src/target/target.cc
index a72ce1c5b3e4..2cb72a285a41 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name,
     t->keys_array.push_back("hexagon");
     t->device_type = kDLHexagon;
   } else {
-    LOG(ERROR) << "Unknown target name " << target_name;
+    LOG(ERROR) << "Unknown target name " << target_name << "; falling back to stackvm";
     return target::stackvm();
   }
 
diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py
index 1ab324e50228..448f53c57f3c 100644
--- a/tests/python/unittest/test_runtime_micro.py
+++ b/tests/python/unittest/test_runtime_micro.py
@@ -26,9 +26,9 @@
 from tvm.relay.testing import resnet
 
 # # Use the host emulated micro device.
-# DEV_CONFIG_A = micro.device.host.generate_config()
-# DEV_CONFIG_B = micro.device.host.generate_config()
-# TARGET = 'c -device=micro_dev'
+DEV_CONFIG_A = micro.device.host.generate_config()
+DEV_CONFIG_B = micro.device.host.generate_config()
+TARGET = 'c -device=micro_dev'
 
 # # TODO why do spike examples have memory that starts at 0x10000000, but you
 # # should set the base addr as 0x10010000? should somehow help the user to be
@@ -41,9 +41,9 @@
 # DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667)
 # TARGET = 'c -device=micro_dev'
 
-DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666)
-DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667)
-TARGET = 'c -device=micro_dev'
+# DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666)
+# DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667)
+# TARGET = 'c -device=micro_dev'
 
 def relay_micro_build(func, dev_config, params=None):
     """Create a graph runtime module with a micro device context from a Relay function.
@@ -64,11 +64,10 @@ def relay_micro_build(func, dev_config, params=None):
     mod : tvm.runtime.Module
         graph runtime module for the target device
     """
-    disable_vectorize = tvm.build_config(disable_vectorize=True)
+    disable_vectorize = tvm.target.build_config(disable_vectorize=True)
     disable_fusion = relay.build_config(disabled_pass={'FuseOps'})
     with disable_vectorize, disable_fusion:
         graph, c_mod, params = relay.build(func, target=TARGET, params=params)
-    print(c_mod.get_source())
     micro_mod = micro.create_micro_mod(c_mod, dev_config)
     ctx = tvm.micro_dev(0)
     mod = graph_runtime.create(graph, micro_mod, ctx)
@@ -83,6 +82,7 @@ def relay_micro_build(func, dev_config, params=None):
 break UTVMDone
 """
 
+
 def reset_gdbinit():
     if 'server_port' not in DEV_CONFIG_A:
         return
@@ -210,7 +210,7 @@ def test_graph_runtime():
 
 
 def test_conv2d():
-    if not tvm.module.enabled("micro_dev"):
+    if not tvm.runtime.enabled("micro_dev"):
         return
 
     from tvm.relay import create_executor
@@ -230,19 +230,27 @@ def test_conv2d():
             padding=(1, 1),
             channels=4)
     func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr)
-    mod = relay.Module.from_expr(func)
+    mod = tvm.IRModule.from_expr(func)
     mod = transform.InferType()(mod)
 
     x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape))
     w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape))
     out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape))
 
-    with tvm.build_config(disable_vectorize=True):
+    with tvm.target.build_config(disable_vectorize=True):
         graph, c_mod, params = relay.build(mod, target="c")
 
     with micro.Session(DEV_CONFIG_A):
         micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
-        micro_func = micro_mod[func_name]
+        candidate_func_name = func_name
+        for i in range(100):
+            try:
+                micro_func = micro_mod[candidate_func_name]
+                break
+            except tvm.TVMError as e:
+                candidate_func_name = f'{func_name}_{i}'
+        else:
+            assert False
         ctx = tvm.micro_dev(0)
 
         x_data = tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx)
@@ -253,43 +261,11 @@ def test_conv2d():
         out_data = np.zeros(out_shape, dtype=dtype)
         params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() }
         intrp = create_executor('debug')
-        expected_result = intrp.evaluate(mod['main'])(x_data, w_data).data
+        expected_result = intrp.evaluate(mod['main'])(x_data, w_data)
 
         tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy())
 
 
-def test_multiple_modules():
-    """Test loading multiple modules on the device simultaneously."""
-    if not tvm.runtime.enabled("micro_dev"):
-        return
-    shape = (1024,)
-    dtype = "float32"
-
-    # Construct Relay add program.
-    x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
-    ret = relay.add(x, relay.const(1.0))
-    add_const_func = relay.Function([x], ret)
-    # Construct Relay subtract program.
-    x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
-    ret = relay.subtract(x, relay.const(1.0))
-    sub_const_func = relay.Function([x], ret)
-
-    with micro.Session(DEV_CONFIG_A):
-        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
-        sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG_A)
-
-        x_in = np.random.uniform(size=shape[0]).astype(dtype)
-        add_const_mod.run(x=x_in)
-        add_result = add_const_mod.get_output(0).asnumpy()
-        sub_const_mod.run(x=x_in)
-        sub_result = sub_const_mod.get_output(0).asnumpy()
-
-        tvm.testing.assert_allclose(
-                add_result, x_in + 1.0)
-        tvm.testing.assert_allclose(
-                sub_result, x_in - 1.0)
-
-
 def test_interleave_sessions():
     """Test closing and reopening sessions."""
     if not tvm.runtime.enabled("micro_dev"):
diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py
index eb05dd839e32..e121fbc7ec6d 100644
--- a/topi/python/topi/arm_cpu/__init__.py
+++ b/topi/python/topi/arm_cpu/__init__.py
@@ -25,3 +25,4 @@
 from .bitserial_conv2d import *
 from .bitserial_dense import *
 from .injective import *
+from . import cortex_m7
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 25b338e06b5f..df63ae3e9e59 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -31,6 +31,7 @@
     conv2d_spatial_pack_nhwc, \
     schedule_conv2d_spatial_pack_nchw, \
     schedule_conv2d_spatial_pack_nhwc
+from .cortex_m7.conv2d import direct_simd
 
 
 @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -425,3 +426,15 @@ def _callback(op):
 
     traverse_inline(s, outs[0].op, _callback)
     return s
+
+@autotvm.register_topi_compute("conv2d_direct_simd.arm_cpu")
+def conv2d_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute conv2d with SIMD (v7e-m)."""
+    return direct_simd.conv2d_direct_simd_compute(
+        cfg, data, kernel, strides, padding, dilation, out_dtype)
+
+
+@autotvm.register_topi_schedule("conv2d_direct_simd.arm_cpu")
+def schedule_conv2d_direct_simd(cfg, outs):
+    """Create schedule for conv2d_direct_simd"""
+    return direct_simd.conv2d_direct_simd_nhwc_schedule(cfg, outs)
diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
index 9f9785e834d7..b4f8e7cf836d 100644
--- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
@@ -74,8 +74,7 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation,
                            [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
-    #cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
-    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll')
+    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
 
     # fallback support
     if cfg.is_fallback:
@@ -153,7 +152,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
     cfg["ann_reduce"].apply(s, conv, [kh, kw],
                             axis_lens=[get_const_int(kh.dom.extent),
                                        get_const_int(kw.dom.extent)],
-                            max_unroll=16,
+                            max_unroll=None,
                             cfg=cfg)
     cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
                              axis_lens=[cfg['tile_oh'].size[-1],
@@ -178,14 +177,14 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
                                  cfg=cfg)
     s[conv].compute_at(s[last], ow)
 
-    ## mark parallel
-    #s[last].parallel(co)
+    # mark parallel
+    s[last].parallel(co)
 
     if data_vec.op.name == 'data_vec_undilated':
         _, h, _, _, _, _, _, _ = s[data_vec].op.axis
     else:
         _, h, _, _, _, _ = s[data_vec].op.axis
-    #s[data_vec].parallel(h)
+    s[data_vec].parallel(h)
 
     if kernel_vec.op.name == 'kernel_vec':
         co, _, _, _, _ = s[kernel_vec].op.axis
@@ -194,12 +193,10 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
             # this part to make tuning records correct
             s[kernel_vec].pragma(co, 'debug_skip_region')
         else:
-            #s[kernel_vec].parallel(co)
-            pass
+            s[kernel_vec].parallel(co)
     elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose':  # for conv2d transpose
-        #co, _, _, _, _ = s[kernel_vec].op.axis
-        #s[kernel_vec].parallel(co)
-        pass
+        co, _, _, _, _ = s[kernel_vec].op.axis
+        s[kernel_vec].parallel(co)
 
     return s
 
diff --git a/topi/python/topi/arm_cpu/cortex_m7/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/__init__.py
new file mode 100644
index 000000000000..631c5f7ff447
--- /dev/null
+++ b/topi/python/topi/arm_cpu/cortex_m7/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Schedules specialized for cortex-m7."""
+
+
+from . import conv2d
diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py
new file mode 100644
index 000000000000..cc4faf97b126
--- /dev/null
+++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Conv2d implementations for cortex-m7."""
+
+from . import direct_simd
diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
new file mode 100644
index 000000000000..1fdb596b7163
--- /dev/null
+++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
@@ -0,0 +1,177 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Direct implementation of conv2d."""
+
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task import deserialize_args
+from topi.nn.conv2d import conv2d_nchw, conv2d_nhwc
+from topi.util import get_const_tuple, get_const_int, traverse_inline
+
+def conv2d_direct(*args, **kwargs):
+    """Schedule function for directly-scheduled conv2d."""
+    assert not kwargs, "Do not support kwargs in template function call"
+    args = deserialize_args(args)
+    data, kernel = args[:2]
+    layout = args[-2]
+    cfg = autotvm.get_config()
+    args = [cfg] + args
+    conv = conv2d_direct_compute(*args)
+    if layout == 'NHWC':
+        sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv])
+    elif layout == 'NCHW':
+        sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv])
+    else:
+        raise RuntimeError(f'unsupported data layout "{layout}"')
+    return sched, [data, kernel, conv]
+
+
+conv2d_direct.template_key = 'direct'
+conv2d_direct.default_data_layout = 'NHWC'
+conv2d_direct.default_kernel_layout = 'HWIO'
+
+@autotvm.register_topi_compute('conv2d_direct.micro_dev')
+def conv2d_direct_compute(*args):
+    layout = args[-2]
+    if layout == 'NHWC':
+        return _conv2d_direct_nhwc_compute(*args)
+    if layout == 'NCHW':
+        return _conv2d_direct_nchw_compute(*args)
+
+    raise RuntimeError(f'unsupported data layout "{layout}"')
+
+
+def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    assert layout == 'NHWC'
+    conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
+
+    ###########################
+    # Config Space Definition #
+    ###########################
+    N, H, W, CI = get_const_tuple(data.shape)
+    KH, KW, _, CO = get_const_tuple(kernel.shape)
+    n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO)
+    kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI)
+
+    # TODO should we add a max_factor attr to these splits?
+    co, vc = cfg.define_split('tile_co', co, num_outputs=2)
+    oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
+    ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
+
+    cfg.define_reorder('reorder_0',
+                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
+                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw],
+                           [n, co, oh, ow, ci, vh, vw, vc, kh, kw],
+                           [n, co, oh, ow, ci, vc, vh, vw, kh, kw]])
+
+    cfg.define_annotate('ann_reduce', [kh, kw], policy='try_unroll')
+    cfg.define_annotate('ann_spatial', [vh, vw, vc], policy='try_unroll')
+
+    cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
+    cfg.define_knob('unroll_explicit', [0, 1])
+
+    return conv
+
+
+def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    assert layout == 'NCHW'
+    conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
+
+    ###########################
+    # Config Space Definition #
+    ###########################
+    cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
+    cfg.define_knob('unroll_explicit', [0, 1])
+
+    return conv
+
+
+@autotvm.register_topi_schedule('conv2d_direct_nhwc.micro_dev')
+def conv2d_direct_nhwc_schedule(cfg, outs):
+    """Schedule function for directly-scheduled conv2d on NHWC layout."""
+    sched = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc' not in op.tag:
+            return
+
+        ### extract tensors ###
+        output = op.output(0)
+        conv = op
+        data_vec = conv.input_tensors[0]
+        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
+        last = outs[0]  # pylint: disable=unused-variable
+
+        # tile reduction axes
+        n, oh, ow, co = sched[conv].op.axis
+        kh, kw, ci = sched[conv].op.reduce_axis
+        # NOTE we can't inline data padding in the SIMD path, because it
+        # introduces conditionals in the inner loop.
+        data_pad = data_vec.op
+        sched[data_pad].compute_inline()
+
+        co, vc = cfg['tile_co'].apply(sched, conv, co)
+        oh, vh = cfg['tile_oh'].apply(sched, conv, oh)
+        ow, vw = cfg['tile_ow'].apply(sched, conv, ow)
+        cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
+        cfg['ann_reduce'].apply(sched, conv, [kh, kw],
+                                axis_lens=[get_const_int(kh.dom.extent),
+                                           get_const_int(kw.dom.extent)],
+                                max_unroll=8,
+                                cfg=cfg)
+        cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc],
+                                 axis_lens=[cfg['tile_oh'].size[-1],
+                                            cfg['tile_ow'].size[-1],
+                                            cfg['tile_co'].size[-1]],
+                                 max_unroll=8,
+                                 cfg=cfg)
+
+        kernel_scope = n  # this is the scope to attach global config inside this kernel
+
+        # tune unroll
+        sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+        sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(sched, outs[-1].op, _callback)
+    return sched
+
+
+@autotvm.register_topi_schedule('conv2d_direct_nchw.micro_dev')
+def conv2d_direct_nchw_schedule(cfg, outs):
+    """Schedule function for Cortex-M7 direct implementation of conv2d."""
+    # use default schedule
+    sched = tvm.create_schedule([x.op for x in outs])
+
+    conv = outs[-1].op
+    output = conv.output(0)
+    data_vec = conv.input_tensors[0]
+    data_pad = data_vec.op
+    sched[data_pad].compute_inline()
+
+    # TODO add more schedule opts (similar to the NHWC template)
+
+    n, _, _, _ = sched[conv].op.axis
+    kernel_scope = n  # this is the scope to attach global config inside this kernel
+
+    # tune unroll
+    sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    return sched
diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
new file mode 100644
index 000000000000..fd411251272e
--- /dev/null
+++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Direct implementation of conv2d."""
+
+from tvm import autotvm
+from tvm.autotvm.task import deserialize_args
+from tvm import te
+from topi.util import simplify, traverse_inline
+from topi.nn.pad import pad
+from topi.nn.util import get_pad_tuple
+
+from ..micro_kernel.gemm import (
+        intrin_gemm_MxKxN, gemm_MxKxN_impl,
+)
+
+def conv2d_direct_simd(*args, **kwargs):
+    """Defines the Cortex-M7 SIMD implementation of conv2d."""
+    assert not kwargs, "Do not support kwargs in template function call"
+    args = deserialize_args(args)
+    data, kernel = args[:2]
+    layout = args[-2]
+    cfg = autotvm.get_config()
+    args = [cfg] + args
+    assert layout == 'NHWC'
+    conv = conv2d_direct_simd_compute(*args)
+    sched = conv2d_direct_simd_nhwc_schedule(cfg, [data, kernel, conv])
+    return sched, [data, kernel, conv]
+
+
+conv2d_direct_simd.template_key = 'direct_simd'
+conv2d_direct_simd.default_data_layout = 'NHWC'
+conv2d_direct_simd.default_kernel_layout = 'HWOI'
+
+def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
+    assert isinstance(strides, int) or len(strides) == 2
+    assert isinstance(dilation, int) or len(dilation) == 2
+
+    if isinstance(strides, int):
+        stride_h = stride_w = strides
+    else:
+        stride_h, stride_w = strides
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch_size, in_height, in_width, in_channels = data.shape
+    kernel_h, kernel_w, out_channels, _ = kernel.shape
+
+    # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
+
+    pad_before = [0, pad_top, pad_left, 0]
+    pad_after = [0, pad_down, pad_right, 0]
+    padded_data = pad(data, pad_before, pad_after, name='padded_data')
+
+    rc = te.reduce_axis((0, in_channels), name='rc')
+    ry = te.reduce_axis((0, kernel_h), name='ry')
+    rx = te.reduce_axis((0, kernel_w), name='rx')
+
+    conv = te.compute(
+        (batch_size, out_height, out_width, out_channels),
+        lambda nn, yy, xx, ff: te.sum(
+            padded_data[nn, yy * stride_h + ry * dilation_h,
+                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
+            kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]),
+        name='conv2d', tag='conv2d_nhwc')
+
+    ###########################
+    # Config Space Definition #
+    ###########################
+    n, oh, ow, co = (cfg.axis(batch_size.value),
+                     cfg.axis(out_height.value),
+                     cfg.axis(out_width.value),
+                     cfg.axis(out_channels.value))
+    kh, kw, ci = (cfg.reduce_axis(kernel_h.value),
+                  cfg.reduce_axis(kernel_w.value),
+                  cfg.reduce_axis(in_channels.value))
+
+    assert in_channels.value % 4 == 0
+    owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2)
+    cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2,
+                                filter=lambda x: x.size[-1] % 4 == 0)
+    coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2)
+
+    cfg.define_reorder('reorder_0_simd',
+                       [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
+                       policy='candidate', candidate=[
+                           [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
+                           [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
+                           [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
+                           [n, kh, kw, oh, coo, owo, cio, owi, coi, cii]])
+
+    cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
+    cfg.define_knob('unroll_explicit', [0, 1])
+
+    return conv
+
+
+def conv2d_direct_simd_nhwc_schedule(cfg, outs):
+    """Schedule function for Cortex-M7 SIMD implementation of conv2d."""
+    sched = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nhwc' not in op.tag:
+            return
+
+        # extract tensors
+        output = op.output(0)
+        conv = op
+        data_vec = conv.input_tensors[0]
+        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
+        last = outs[0]  # pylint: disable=unused-variable
+
+        # tile reduction axes
+        n, oh, ow, co = sched[conv].op.axis
+        kh, kw, ci = sched[conv].op.reduce_axis
+
+        M = cfg['tile_ow'].size[-1]
+        K = cfg['tile_ci'].size[-1]
+        N = cfg['tile_co'].size[-1]
+
+        owo, owi = cfg['tile_ow'].apply(sched, conv, ow)
+        cio, cii = cfg['tile_ci'].apply(sched, conv, ci)
+        coo, coi = cfg['tile_co'].apply(sched, conv, co)
+
+        cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])
+
+        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype)
+        sched[output].tensorize(owi, gemm)
+        sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id))
+
+        # this is the scope to attach global config inside this kernel
+        kernel_scope = n
+
+        # tune unroll
+        sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+        sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(sched, outs[-1].op, _callback)
+    return sched
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
new file mode 100644
index 000000000000..70cd46bb32c2
--- /dev/null
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Defines gemm intrinsics for SIMD matrix multiplication."""
+
+import random
+import string
+
+import tvm
+from tvm import te
+
+##########################
+# MxKxN MatMul Intrinsic #
+##########################
+
+# NOTE this is transposed matmul (A * B^T)
+def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
+    """Defines a SIMD-accelerated transposed matmul."""
+    # we generate a unique ID for every intrinsic definition, to prevent name
+    # collisions in the generated source (e.g., if there are multiple operators
+    # in the same module that use the same intrinsic)
+    #
+    # TODO to cut down on memory usage, we should cache each intrinsic
+    # instantiation and include it only once, eliminating the need for unique
+    # IDs
+    UNIQ_ID_LEN = 8
+    uniq_id = ''.join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
+
+    if isinstance(M, tvm.tir.IntImm):
+        M = M.value
+    if isinstance(K, tvm.tir.IntImm):
+        K = K.value
+    if isinstance(N, tvm.tir.IntImm):
+        N = N.value
+    assert K % 4 == 0
+    # TODO support more dtypes?
+    assert in_dtype == 'int8'
+    assert out_dtype == 'int32'
+    A = te.placeholder((M, K), name='a', dtype=in_dtype)
+    B = te.placeholder((N, K), name='b', dtype=in_dtype)
+    k = te.reduce_axis((0, K), name='k')
+    C = te.compute(
+        (M, N),
+        lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k),
+        name='c')
+    A_buf = tvm.tir.decl_buffer(
+        A.shape, A.dtype,
+        name="A",
+        offset_factor=1,
+        strides=[te.var("A_s"), 1])
+    B_buf = tvm.tir.decl_buffer(
+        B.shape, B.dtype,
+        name="B",
+        offset_factor=1,
+        strides=[te.var("B_s"), 1])
+    C_buf = tvm.tir.decl_buffer(
+        C.shape, C.dtype,
+        name="C",
+        offset_factor=1,
+        strides=[te.var("C_s"), 1])
+    def intrin_func(ins, outs):
+        aa, bb = ins
+        cc = outs[0]
+        def _reduce_update():
+            ib = tvm.tir.ir_builder.create()
+            ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update_{uniq_id}",
+                                        aa.access_ptr("r"),
+                                        bb.access_ptr("r"),
+                                        cc.access_ptr("w"),
+                                        aa.strides[0],
+                                        bb.strides[0],
+                                        cc.strides[0]))
+            return ib.get()
+        def _reduce_reset():
+            ib = tvm.tir.ir_builder.create()
+            ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset_{uniq_id}",
+                                        cc.access_ptr("w"),
+                                        cc.strides[0]))
+            return ib.get()
+        def _body():
+            ib = tvm.tir.ir_builder.create()
+            # # NOTE we need the reset in the body for cases where the buffer
+            # # we're accumulating into is uninitialized (e.g., if it's the
+            # # result of a workspace allocation, because there are no guarantees
+            # # on the contents).
+            # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset",
+            #                         cc.access_ptr("w"),
+            #                         cc.strides[0]))
+            # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update",
+            #                         aa.access_ptr("r"),
+            #                         bb.access_ptr("r"),
+            #                         cc.access_ptr("w"),
+            #                         aa.strides[0],
+            #                         bb.strides[0],
+            #                         cc.strides[0]))
+            ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}",
+                                        aa.access_ptr("r"),
+                                        bb.access_ptr("r"),
+                                        cc.access_ptr("w"),
+                                        aa.strides[0],
+                                        bb.strides[0],
+                                        cc.strides[0]))
+            return ib.get()
+        return _body(), _reduce_reset(), _reduce_update()
+    with tvm.target.build_config(offset_factor=1):
+        intrin_decl = te.decl_tensor_intrin(
+            C.op, intrin_func, binds={A: A_buf, B: B_buf, C: C_buf})
+        return intrin_decl, uniq_id
+
+
+def gemm_MxKxN_impl(M, K, N, uniq_id):
+    """Emit C code for gemm impl."""
+    # TODO are there any SIMD tricks to zero out arrays quickly?
+    aa_pad_size = M * K
+    bb_pad_size = N * K
+    # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
+    cc_code = f"""
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
+    int8_t *aa, int8_t *bb, int32_t *cc,
+    int A_stride, int B_stride, int C_stride) {{
+  int16_t aa_pad[{aa_pad_size}];
+  int16_t bb_pad[{bb_pad_size}];
+
+  for (int i = 0; i < {M}; i++) {{
+    for (int j = 0; j < {K} / 4; j++) {{
+      read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
+    }}
+  }}
+
+  for (int i = 0; i < {N}; i++) {{
+    for (int j = 0; j < {K} / 4; j++) {{
+      read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
+    }}
+  }}
+
+  for (int i = 0; i < {M}; i++) {{
+    for (int j = 0; j < {N}; j++) {{
+      int32_t sum = 0;
+      for (int l = 0; l < {K} / 2; l++) {{
+        sum = __SMLAD(
+          *((int32_t*) &aa_pad[i*{K} + l*2]),
+          *((int32_t*) &bb_pad[j*{K} + l*2]),
+          sum);
+      }}
+      // NOTE: this is the line where `*_body` differs from `*_update`. here
+      // we're *setting* the result, instead of accumulating, because we know
+      // the `i` and `j` itervars span their entire respective axes.
+      cc[i*C_stride + j] = sum;
+    }}
+  }}
+
+  return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
+    int8_t *aa, int8_t *bb, int32_t *cc,
+    int A_stride, int B_stride, int C_stride) {{
+  int16_t aa_pad[{aa_pad_size}];
+  int16_t bb_pad[{bb_pad_size}];
+
+  for (int i = 0; i < {M}; i++) {{
+    for (int j = 0; j < {K} / 4; j++) {{
+      read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
+    }}
+  }}
+
+  for (int i = 0; i < {N}; i++) {{
+    for (int j = 0; j < {K} / 4; j++) {{
+      read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
+    }}
+  }}
+
+  for (int i = 0; i < {M}; i++) {{
+    for (int j = 0; j < {N}; j++) {{
+      int32_t sum = 0;
+      for (int l = 0; l < {K} / 2; l++) {{
+        sum = __SMLAD(
+          *((int32_t*) &aa_pad[i*{K} + l*2]),
+          *((int32_t*) &bb_pad[j*{K} + l*2]),
+          sum);
+      }}
+      cc[i*C_stride + j] += sum;
+    }}
+  }}
+
+  return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{
+  for (int i = 0; i < {M}; i++) {{
+    for (int j = 0; j < {N}; j++) {{
+      cc[i*C_stride + j] = 0;
+    }}
+  }}
+  return 0;
+}}
+    """
+    return cc_code
diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py
index 8cf3fd5f11bf..966520088bc7 100644
--- a/topi/python/topi/arm_cpu/injective.py
+++ b/topi/python/topi/arm_cpu/injective.py
@@ -20,7 +20,6 @@
 from tvm import te
 from ..util import is_empty_shape
 
-@generic.schedule_injective_from_existing.register(["arm_cpu", "micro_dev"])
 def schedule_injective_from_existing(sch, out):
     """Schedule for injective op from existing schedule.
 
@@ -46,7 +45,6 @@ def schedule_injective_from_existing(sch, out):
         sch[out].parallel(sch[out].op.axis[0])
     return sch
 
-@generic.schedule_injective.register(["arm_cpu", "micro_dev"])
 def schedule_injective(outs):
     """ARM CPU schedule for injective op.
 
@@ -74,7 +72,6 @@ def schedule_injective(outs):
         schedule_injective_from_existing(s, x)
     return s
 
-@generic.schedule_concatenate.register(["arm_cpu", "micro_dev"])
 def schedule_concatenate(outs):
     """Schedule for concatenate op.
 
diff --git a/topi/python/topi/generic/default.py b/topi/python/topi/generic/default.py
index d4c642ab8814..59e5a255c6e1 100644
--- a/topi/python/topi/generic/default.py
+++ b/topi/python/topi/generic/default.py
@@ -24,7 +24,7 @@ def default_schedule(outs, auto_inline):
     """Default schedule for llvm."""
     target = tvm.target.Target.current(allow_none=False)
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    if target.target_name != "llvm":
+    if target.target_name not in ("llvm", "c"):
         raise RuntimeError("schedule not registered for '%s'" % target)
     s = te.create_schedule([x.op for x in outs])
     if auto_inline:

From 80c647039ce0ecfaf2a0f32817ac46511bdc1158 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 23 Apr 2020 10:51:04 -0700
Subject: [PATCH 03/11] Use /std:c++14 with MSVC.

 * Per tqchen: project has already moved to C++14
 * Presubmit failed for code that built locally on gcc.
---
 CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc7c67c83a48..a0ebdf037756 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake)
 include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
 include(cmake/modules/contrib/CoreML.cmake)
 
+include(CheckCXXCompilerFlag)
 if(NOT MSVC)
-  include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
-  message(STATUS "Build with c++14")
   set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
   set(CMAKE_CUDA_STANDARD 14)
+else()
+  check_cxx_compiler_flag("/std:c++14" SUPPORT_CXX14)
+  set(CMAKE_CXX_FLAGS "/std:c++14 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CUDA_STANDARD 14)
 endif()
 
 add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})

From ce06883a7894cf7f929f3b8e47902d30d5cd8f95 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 23 Apr 2020 23:21:05 +0000
Subject: [PATCH 04/11] fix ASF lint, and fix add_asf_header too

---
 tests/lint/add_asf_header.py                    |  4 +++-
 .../arm_cpu/cortex_m7/micro_kernel/__init__.py  | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/tests/lint/add_asf_header.py b/tests/lint/add_asf_header.py
index a44fbd3df1b5..21d25c25e573 100644
--- a/tests/lint/add_asf_header.py
+++ b/tests/lint/add_asf_header.py
@@ -181,7 +181,9 @@ def add_header(fname, header):
         skipline = False
         ext = os.path.splitext(fname)[1][1:]
 
-        if lines[0][:2] == "#!":
+        if not lines:
+            skipline = False  # File is enpty
+        elif lines[0][:2] == "#!":
             skipline = True
         elif lines[0][:2] == "<?":
             skipline = True
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
index e69de29bb2d1..245692337bc3 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+

From 9a8e2b34b0a475d424c236afa8c55f6fb37e4673 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 24 Apr 2020 10:28:33 -0700
Subject: [PATCH 05/11] Compiles with USE_MICRO=OFF.

---
 src/runtime/micro/micro_common.h   |  2 +-
 src/runtime/micro/micro_session.cc | 35 +++++++++++++++++++++++++++++
 src/runtime/rpc/rpc_session.cc     | 36 +++---------------------------
 3 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h
index 7ba62d3f30df..a8d16d2b9803 100644
--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -74,7 +74,7 @@ class TargetVal {
     if (width_bits_ == 64) {
       return 0xffffffff;
     } else {
-      return (1 << width_bits_) - 1;
+      return (1UL << width_bits_) - 1;
     }
   }
 
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 309fea45cd31..11ee035820a2 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -23,6 +23,7 @@
 
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/runtime/device_api.h>
 #include <chrono>
 #include <memory>
 #include <locale>
@@ -644,6 +645,40 @@ PackedFunc MicroSession::GetFunction(
   }
 }
 
+TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  PackedFunc pf = args[0];
+  TVMContext ctx = args[1];
+  int number = args[2];
+  int repeat = args[3];
+
+  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
+    TVMRetValue temp;
+    std::ostringstream os;
+
+    for (int i = 0; i < repeat; ++i) {
+      // start timing
+      CHECK(number < MicroSession::kTaskQueueCapacity)
+        << "`number` must be less than uTVM task queue capacity";
+      for (unsigned int j = 0; j < number; ++j) {
+        pf.CallPacked(args, &temp);
+      }
+      ObjectPtr<MicroSession> session = MicroSession::Current();
+      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+      double time_per_batch = session->GetLastBatchTime() / number;
+      os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
+    }
+    std::string blob = os.str();
+    TVMByteArray arr;
+    arr.size = blob.length();
+    arr.data = blob.data();
+    // return the time.
+    *rv = arr;
+  };
+  *rv = PackedFunc(ftimer);
+});
+
+
 // create micro session and low-level device from Python frontend
 TVM_REGISTER_GLOBAL("micro._CreateSession")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 9b5612efc690..ae293abfacdd 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -1247,45 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() {
   CHECK_EQ(state_, kRecvCode);
 }
 
-PackedFunc MicroTimeEvaluator(
-    PackedFunc pf,
-    TVMContext ctx,
-    size_t number,
-    int repeat,
-    int min_repeat_ms) {
-  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
-    TVMRetValue temp;
-    std::ostringstream os;
-
-    for (int i = 0; i < repeat; ++i) {
-      // start timing
-      CHECK(number < MicroSession::kTaskQueueCapacity)
-        << "`number` must be less than uTVM task queue capacity";
-      for (unsigned int j = 0; j < number; ++j) {
-        pf.CallPacked(args, &temp);
-      }
-      ObjectPtr<MicroSession> session = MicroSession::Current();
-      DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-      double time_per_batch = session->GetLastBatchTime() / number;
-      os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
-    }
-    std::string blob = os.str();
-    TVMByteArray arr;
-    arr.size = blob.length();
-    arr.data = blob.data();
-    // return the time.
-    *rv = arr;
-  };
-  return PackedFunc(ftimer);
-}
-
 PackedFunc WrapTimeEvaluator(PackedFunc pf,
                              TVMContext ctx,
                              int number,
                              int repeat,
                              int min_repeat_ms) {
   if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
-    return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms);
+    auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator");
+    CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled";
+    return (*get_micro_time_evaluator)(pf, ctx, number, repeat);
   }
 
   auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {

From 25961d6f581f1c1a43a60b9290e44776fd2f09ce Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 24 Apr 2020 13:26:01 -0700
Subject: [PATCH 06/11] Cleanup TargetPtr and word size representations.

---
 python/tvm/micro/base.py                      |   4 +-
 python/tvm/micro/device/arm/stm32f746xx.py    |   6 +-
 python/tvm/micro/device/base.py               |  12 +-
 python/tvm/micro/device/host.py               |   6 +-
 python/tvm/micro/device/riscv_spike.py        |   6 +-
 src/runtime/micro/host_low_level_device.cc    |  11 +-
 src/runtime/micro/low_level_device.h          |   3 +-
 src/runtime/micro/micro_common.cc             |   8 +-
 src/runtime/micro/micro_common.h              | 105 +++++++++++++-----
 src/runtime/micro/micro_device_api.cc         |  25 ++---
 src/runtime/micro/micro_section_allocator.h   |  16 +--
 src/runtime/micro/micro_session.cc            |  92 +++++++--------
 src/runtime/micro/micro_session.h             |  24 ++--
 src/runtime/micro/openocd_low_level_device.cc |   6 +-
 .../micro/target_data_layout_encoder.h        |  13 ++-
 .../cortex_m7/micro_kernel/__init__.py        |   1 -
 16 files changed, 200 insertions(+), 138 deletions(-)

diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py
index d2ec7d398c46..c34dd728841a 100644
--- a/python/tvm/micro/base.py
+++ b/python/tvm/micro/base.py
@@ -77,7 +77,7 @@ def __init__(self, config):
         dev_funcs = tvm.micro.device.get_device_funcs(config['device_id'])
         self.toolchain_prefix = config['toolchain_prefix']
         self.mem_layout = config['mem_layout']
-        self.word_size = config['word_size']
+        self.word_size_bits = config['word_size_bits']
         self.thumb_mode = config['thumb_mode']
         self.use_device_timer = config['use_device_timer']
         self.comms_method = config['comms_method']
@@ -122,7 +122,7 @@ def __init__(self, config):
             self.mem_layout['workspace']['size'],
             self.mem_layout['stack'].get('start', 0),
             self.mem_layout['stack']['size'],
-            self.word_size,
+            self.word_size_bits,
             self.thumb_mode,
             self.use_device_timer,
             server_addr,
diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py
index f85a34e4e0a2..9a2b46ac0cd7 100644
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
@@ -20,7 +20,7 @@
 
 DEVICE_ID = 'arm.stm32f746xx'
 TOOLCHAIN_PREFIX = 'arm-none-eabi-'
-WORD_SIZE = 4
+WORD_SIZE_BITS = 32
 #
 # [Device Memory Layout]
 #   RAM   (rwx) : START = 0x20000000, LENGTH = 320K
@@ -112,8 +112,8 @@ def generate_config(server_addr, server_port, section_constraints=None):
     return {
         'device_id': DEVICE_ID,
         'toolchain_prefix': TOOLCHAIN_PREFIX,
-        'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints),
-        'word_size': WORD_SIZE,
+        'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints),
+        'word_size_bits': WORD_SIZE_BITS,
         'thumb_mode': True,
         'use_device_timer': True,
         'comms_method': 'openocd',
diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py
index 1621c69d1a77..4d42bff8ebbc 100644
--- a/python/tvm/micro/device/base.py
+++ b/python/tvm/micro/device/base.py
@@ -178,7 +178,7 @@ class MemConstraint(enum.Enum):
     WEIGHT = 1
 
 
-def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
+def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints):
     """Template function to generate memory layout for devices.
 
     Parameters
@@ -189,12 +189,14 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
     available_mem: Number
         Available memory at base_addr, given in bytes.
 
-    word_size: Number
-        Number of bytes in one word on this device.
+    word_size_bits: Number
+        Number of bits in one word on this device.
 
     section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
         maps section name to the quantity of available memory
     """
+    assert word_size_bits in (32, 64), 'only 32- or 64-bit devices are supported now'
+    word_size_bytes = word_size_bits // 8
     byte_sum = sum(x[0]
                    for x in section_constraints.values()
                    if x[1] == MemConstraint.ABSOLUTE_BYTES)
@@ -209,7 +211,7 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
     for section in DEVICE_SECTIONS:
         (val, cons_type) = section_constraints[section]
         if cons_type == MemConstraint.ABSOLUTE_BYTES:
-            assert val % word_size == 0, \
+            assert val % word_size_bytes == 0, \
                 f'constraint {val} for {section} section is not word-aligned'
             size = val
             res[section] = {
@@ -218,7 +220,7 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints):
             }
         else:
             size = int((val / weight_sum) * available_weight_mem)
-            size = (size // word_size) * word_size
+            size = (size // word_size_bytes) * word_size_bytes
             res[section] = {
                 'start': curr_addr,
                 'size': size,
diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py
index f6d0dadc5f43..55e0ac326f2d 100644
--- a/python/tvm/micro/device/host.py
+++ b/python/tvm/micro/device/host.py
@@ -21,7 +21,7 @@
 
 DEVICE_ID = 'host'
 TOOLCHAIN_PREFIX = ''
-WORD_SIZE = 8 if sys.maxsize > 2**32 else 4
+WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32
 
 # we pretend we only have 320kb in the default case, so we can use `gen_mem_layout`
 DEFAULT_AVAILABLE_MEM = 3200000
@@ -89,7 +89,7 @@ def generate_config(available_mem=None, section_constraints=None):
         available_mem = DEFAULT_AVAILABLE_MEM
     if section_constraints is None:
         section_constraints = DEFAULT_SECTION_CONSTRAINTS
-    mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE, section_constraints)
+    mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE_BITS, section_constraints)
     # TODO the host emulated device is an outlier, since we don't know how what
     # its base address will be until we've created it in the C++. is there any
     # way to change the infrastructure around this so it's not so much of an
@@ -103,7 +103,7 @@ def generate_config(available_mem=None, section_constraints=None):
         'device_id': DEVICE_ID,
         'toolchain_prefix': TOOLCHAIN_PREFIX,
         'mem_layout': mem_layout,
-        'word_size': WORD_SIZE,
+        'word_size_bits': WORD_SIZE_BITS,
         'thumb_mode': False,
         'use_device_timer': False,
         'comms_method': 'host',
diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py
index f26f04604cac..b9f55dff0ce7 100644
--- a/python/tvm/micro/device/riscv_spike.py
+++ b/python/tvm/micro/device/riscv_spike.py
@@ -20,7 +20,7 @@
 
 DEVICE_ID = 'riscv_spike'
 TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-'
-WORD_SIZE = 8
+WORD_SIZE_BITS = 64
 
 DEFAULT_SECTION_CONSTRAINTS = {
     'text': (18000, MemConstraint.ABSOLUTE_BYTES),
@@ -92,8 +92,8 @@ def generate_config(base_addr, available_mem, server_addr, server_port, section_
     return {
         'device_id': DEVICE_ID,
         'toolchain_prefix': TOOLCHAIN_PREFIX,
-        'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE, section_constraints),
-        'word_size': WORD_SIZE,
+        'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints),
+        'word_size_bits': WORD_SIZE_BITS,
         'thumb_mode': False,
         'use_device_timer': False,
         'comms_method': 'openocd',
diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc
index 13f68f295fa6..da4ade41c008 100644
--- a/src/runtime/micro/host_low_level_device.cc
+++ b/src/runtime/micro/host_low_level_device.cc
@@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice {
    * \brief constructor to initialize on-host memory region to act as device
    * \param num_bytes size of the emulated on-device memory region
    */
-  explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) {
+  explicit HostLowLevelDevice(size_t num_bytes, TargetPtr* base_addr) : size_(num_bytes) {
     size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize;
     // TODO(weberlo): Set permissions per section (e.g., read-write perms for
     // the heap, execute perms for text, etc.).
     int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
     int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE;
     base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0);
-    *base_addr = base_addr_;
+    *base_addr = TargetPtr(TargetWordSize(sizeof(size_t) * 8),
+                           reinterpret_cast<uint64_t>(base_addr_));
   }
 
   /*!
@@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice {
   size_t size_;
 };
 
-const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) {
-  std::shared_ptr<LowLevelDevice> lld =
-      std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
+const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
+                                                               TargetPtr* base_addr) {
+  std::shared_ptr<LowLevelDevice> lld = std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
   return lld;
 }
 
diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h
index 666b08199a6b..c5b5f3df634b 100644
--- a/src/runtime/micro/low_level_device.h
+++ b/src/runtime/micro/low_level_device.h
@@ -78,7 +78,8 @@ class LowLevelDevice {
  * \param num_bytes size of the memory region
  * \param base_addr pointer to write the host device's resulting base address into
  */
-const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr);
+const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
+                                                               TargetPtr* base_addr);
 
 /*!
  * \brief connect to OpenOCD and create an OpenOCD low-level device
diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc
index 3e322c639edd..c544fcd32801 100644
--- a/src/runtime/micro/micro_common.cc
+++ b/src/runtime/micro/micro_common.cc
@@ -51,7 +51,7 @@ const char* SectionToString(SectionKind section) {
 
 std::string RelocateBinarySections(
     const std::string& binary_path,
-    size_t word_size,
+    TargetWordSize word_size,
     TargetPtr text_start,
     TargetPtr rodata_start,
     TargetPtr data_start,
@@ -62,7 +62,7 @@ std::string RelocateBinarySections(
   CHECK(f != nullptr)
     << "Require tvm_callback_relocate_binary to exist in registry";
   std::string relocated_bin = (*f)(binary_path,
-                                   word_size,
+                                   word_size.bytes(),
                                    text_start.cast_to<uint64_t>(),
                                    rodata_start.cast_to<uint64_t>(),
                                    data_start.cast_to<uint64_t>(),
@@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary,
 size_t GetSectionSize(const std::string& binary_path,
                       SectionKind section,
                       const std::string& toolchain_prefix,
-                      size_t align) {
+                      TargetWordSize word_size) {
   CHECK(section == SectionKind::kText || section == SectionKind::kRodata ||
         section == SectionKind::kData || section == SectionKind::kBss)
       << "GetSectionSize requires section to be one of text, rodata, data, or bss.";
@@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path,
   CHECK(f != nullptr)
     << "Require tvm_callback_get_section_size to exist in registry";
   int size = (*f)(binary_path, SectionToString(section), toolchain_prefix);
-  return UpperAlignValue(size, align);
+  return UpperAlignValue(size, word_size.bytes());
 }
 
 }  // namespace runtime
diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h
index a8d16d2b9803..2093f385fca0 100644
--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -30,6 +30,7 @@
 #include <sstream>
 #include <string>
 #include <unordered_map>
+#include <utility>
 
 namespace tvm {
 namespace runtime {
@@ -52,6 +53,27 @@ enum class SectionKind : size_t {
   kNumKinds,
 };
 
+/*! \brief data type for word sizes */
+class TargetWordSize {
+ private:
+  size_t word_size_bits_;
+
+ public:
+  explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} {
+    CHECK(word_size_bits == 32 || word_size_bits == 64)
+      << "only 32-bit and 64-bit are supported now";
+  }
+
+  size_t bytes() const {
+    return word_size_bits_ / 8;
+  }
+
+  size_t bits() const {
+    return word_size_bits_;
+  }
+};
+
+
 /*! \brief class for storing values on varying target word sizes */
 class TargetVal {
  private:
@@ -59,42 +81,63 @@ class TargetVal {
   uint64_t value_;
 
  public:
+  /*! \brief construct a TargetVal matching the size of the given integral argument */
   template<typename T, typename U = typename std::enable_if<std::is_integral<T>::value, T>::type>
-  explicit constexpr TargetVal(T value) :
-      width_bits_{sizeof(T) * 8}, value_{value} {}
+  explicit constexpr TargetVal(T value) : TargetVal(sizeof(T) * 8, value) {}
 
+  /*! \brief construct an uninitialized value */
+  TargetVal() : width_bits_{0}, value_{0} {}
+
+  /*! \brief construct a TargetVal with explicit size and value */
   TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} {
-    CHECK(width_bits != 0 && (width_bits & (width_bits - 1)) == 0)
-      << "width_bits must be a power of 2, got " << width_bits;
-    *this = value;
+    CHECK(width_bits >= 8 &&
+          width_bits <= 64 &&
+          (width_bits & (width_bits - 1)) == 0)
+      << "width_bits must be a power of 2 in [8, 64], got " << width_bits;
+    value_ = value & bitmask();
+  }
+
+  bool is_initialized() const { return width_bits_ != 0; }
+
+  size_t width_bits() const {
+    CHECK(is_initialized()) << "TargetVal is not initialized";
+    return width_bits_;
   }
 
-  size_t width_bits() const { return width_bits_; }
   uint64_t bitmask() const {
+    CHECK(is_initialized()) << "TargetVal is not initialized";
+
     if (width_bits_ == 64) {
-      return 0xffffffff;
+      return ~0UL;
     } else {
       return (1UL << width_bits_) - 1;
     }
   }
 
   uint32_t uint32() const {
+    CHECK(is_initialized()) << "TargetVal is not initialized";
     CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is "
                              << width_bits_;
     return uint32_t(value_ & bitmask());
   }
 
   uint64_t uint64() const {
+    CHECK(is_initialized()) << "TargetVal is not initialized";
     return value_;
   }
 
-  TargetVal& operator=(const uint64_t& value) {
-    if (width_bits_ == 64) {
-      value_ = value;
-    } else {
-      CHECK((value & ~bitmask()) == 0) << "bits above " << width_bits_ << " are non-zero";
-      value_ = value & bitmask();
+  TargetVal& operator=(const TargetVal& other) {
+    CHECK(other.is_initialized()) << "Cannot assign an uninitialized TargetVal";
+
+    if (!is_initialized()) {
+      width_bits_ = other.width_bits_;
     }
+
+    CHECK(width_bits_ >= other.width_bits_)
+      << "Cannot assign TargetVal with width " << other.width_bits_
+      << "bits to TargetVal with width " << width_bits_ << "bits";
+
+    value_ = other.value_ & bitmask();
     return *this;
   }
 };
@@ -103,14 +146,19 @@ class TargetVal {
 /*! \brief absolute device address */
 class TargetPtr {
  public:
-  /*! \brief construct a device address with val64 `value` */
-  explicit TargetPtr(std::uint64_t value) : value_(TargetVal(64, value)) {}
+  /*! \brief construct a device address with variable-length value `value` */
+  TargetPtr(TargetWordSize word_size, std::uint64_t value) :
+      value_(TargetVal(word_size.bits(), value)) {}
+
+  /*! \brief construct a null address */
+  TargetPtr(TargetWordSize word_size, std::nullptr_t value) :
+      value_{TargetVal(word_size.bits(), 0)} {}
 
-  /*! \brief default constructor (val64 0) */
-  TargetPtr() : value_(TargetVal(64, 0)) {}
+  /*! \brief construct an uninitialized pointer whose word_size can be changed once */
+  TargetPtr() = default;
 
-  /*! \brief construct a null address (stored in val64) */
-  explicit TargetPtr(std::nullptr_t value) : value_{TargetVal(64, 0)} {}
+  /*! \brief construct a device address using the given TargetVal */
+  explicit TargetPtr(const TargetVal& value) : value_{value} {}
 
   /*! \brief destructor */
   ~TargetPtr() {}
@@ -136,23 +184,23 @@ class TargetPtr {
 
   /*! \brief add an integer to this absolute address to get a larger absolute address */
   TargetPtr operator+(size_t n) const {
-    return TargetPtr(value_.uint64() + n);
+    return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() + n);
   }
 
   /*! \brief mutably add an integer to this absolute address */
   TargetPtr& operator+=(size_t n) {
-    value_ = value_.uint64() + n;
+    value_ = TargetVal(value_.width_bits(), value_.uint64() + n);
     return *this;
   }
 
   /*! \brief subtract an integer from this absolute address to get a smaller absolute address */
   TargetPtr operator-(size_t n) const {
-    return TargetPtr(value_.uint64() - n);
+    return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() - n);
   }
 
   /*! \brief mutably subtract an integer from this absolute address */
   TargetPtr& operator-=(size_t n) {
-    value_ = value_.uint64() - n;
+    value_ = TargetVal(value_.width_bits(), value_.uint64() - n);
     return *this;
   }
 
@@ -177,7 +225,8 @@ class SymbolMap {
    * \param toolchain_prefix prefix of compiler toolchain to use
    */
   SymbolMap(const std::string& binary,
-            const std::string& toolchain_prefix) {
+            const std::string& toolchain_prefix,
+            TargetWordSize word_size) {
     const auto* f = Registry::Get("tvm_callback_get_symbol_map");
     CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry";
     TVMByteArray arr;
@@ -192,7 +241,7 @@ class SymbolMap {
     stream >> name;
     stream >> std::hex >> addr;
     while (stream) {
-      map_[name] = TargetPtr(addr);
+      map_.emplace(std::make_pair(name, TargetPtr(word_size, addr)));
       stream >> name;
       stream >> std::hex >> addr;
     }
@@ -285,7 +334,7 @@ const char* SectionToString(SectionKind section);
  */
 std::string RelocateBinarySections(
     const std::string& binary_path,
-    size_t word_size,
+    TargetWordSize word_size,
     TargetPtr text_start,
     TargetPtr rodata_start,
     TargetPtr data_start,
@@ -309,13 +358,13 @@ std::string ReadSection(const std::string& binary,
  * \param binary input binary contents
  * \param section section type
  * \param toolchain_prefix prefix of compiler toolchain to use
- * \param align alignment of the returned size (default: 8)
+ * \param word_size word size of the target, for alignment
  * \return size of the section if it exists, 0 otherwise
  */
 size_t GetSectionSize(const std::string& binary_name,
                       SectionKind section,
                       const std::string& toolchain_prefix,
-                      size_t align);
+                      TargetWordSize word_size);
 
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc
index 619c31a8b1ba..77ad86577f7e 100644
--- a/src/runtime/micro/micro_device_api.cc
+++ b/src/runtime/micro/micro_device_api.cc
@@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI {
                        size_t alignment,
                        DLDataType type_hint) final {
     ObjectPtr<MicroSession>& session = MicroSession::Current();
-    void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to<void*>();
+    TargetPtr data = session->AllocateInSection(SectionKind::kHeap, nbytes);
     CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap";
-    MicroDevSpace* dev_space = new MicroDevSpace();
-    dev_space->data = data;
-    dev_space->session = session;
-    return static_cast<void*>(dev_space);
+    return reinterpret_cast<void*>(new MicroDevSpace{data, session});
   }
 
   void FreeDataSpace(TVMContext ctx, void* ptr) final {
     MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr);
-    dev_space->session->FreeInSection(
-      SectionKind::kHeap, TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+    dev_space->session->FreeInSection(SectionKind::kHeap, dev_space->data);
     delete dev_space;
   }
 
@@ -134,20 +130,17 @@ class MicroDeviceAPI final : public DeviceAPI {
     CHECK(false) << "the on-device workspace allocator isn't aware of this function";
     ObjectPtr<MicroSession>& session = MicroSession::Current();
 
-    void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to<void*>();
-    CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace";
-    MicroDevSpace* dev_space = new MicroDevSpace();
-    dev_space->data = data;
-    dev_space->session = session;
-    return static_cast<void*>(dev_space);
+    TargetPtr data = session->AllocateInSection(SectionKind::kWorkspace, size);
+    CHECK(data.value().uint64() != 0)
+      << "unable to allocate " << size << " bytes on device workspace";
+    return static_cast<void*>(new MicroDevSpace{data, session});
   }
 
   void FreeWorkspace(TVMContext ctx, void* data) final {
     CHECK(false) << "the on-device workspace allocator isn't aware of this function";
     MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
     ObjectPtr<MicroSession>& session = dev_space->session;
-    session->FreeInSection(SectionKind::kWorkspace,
-                           TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+    session->FreeInSection(SectionKind::kWorkspace, dev_space->data);
     delete dev_space;
   }
 
@@ -162,7 +155,7 @@ class MicroDeviceAPI final : public DeviceAPI {
 
  private:
   TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
-    return TargetPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
+    return dev_space->data + offset;
   }
 
   void* GetHostLoc(const void* ptr, size_t offset) {
diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h
index 8fff39b61d19..2067794f393f 100644
--- a/src/runtime/micro/micro_section_allocator.h
+++ b/src/runtime/micro/micro_section_allocator.h
@@ -39,16 +39,18 @@ class MicroSectionAllocator {
    * \brief constructor that specifies section boundaries
    * \param region location and size of the section on the device
    */
-  explicit MicroSectionAllocator(std::string section_name, DevMemRegion region, size_t word_size)
+  explicit MicroSectionAllocator(std::string section_name,
+                                 DevMemRegion region,
+                                 TargetWordSize word_size)
     : section_name_(section_name),
       start_addr_(region.start),
       size_(0),
       capacity_(region.size),
       word_size_(word_size) {
-      CHECK_EQ(start_addr_.value().uint64() % word_size, 0)
-        << "micro section start not aligned to " << word_size << " bytes";
-      CHECK_EQ(capacity_ % word_size, 0)
-        << "micro section end not aligned to " << word_size << " bytes";
+      CHECK_EQ(start_addr_.value().uint64() % word_size.bytes(), 0)
+        << "micro section start not aligned to " << word_size.bytes() << " bytes";
+      CHECK_EQ(capacity_ % word_size.bytes(), 0)
+        << "micro section end not aligned to " << word_size.bytes() << " bytes";
     }
 
   /*!
@@ -62,7 +64,7 @@ class MicroSectionAllocator {
    * \return pointer to allocated memory region in section, nullptr if out of space
    */
   TargetPtr Allocate(size_t size) {
-    size_ = UpperAlignValue(size_, word_size_);
+    size_ = UpperAlignValue(size_, word_size_.bytes());
     CHECK(size_ + size < capacity_)
       << "cannot alloc " << size << " bytes in section \""
       << section_name_ << "\" (start_addr=" << start_addr_.cast_to<void*>()
@@ -122,7 +124,7 @@ class MicroSectionAllocator {
   /*! \brief total storage capacity of the section */
   size_t capacity_;
   /*! \brief number of bytes in a word on the target device */
-  size_t word_size_;
+  TargetWordSize word_size_;
   /*! \brief allocation map for allocation sizes */
   std::unordered_map<uint64_t, size_t> alloc_map_;
 };
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 11ee035820a2..2cfde7e990a1 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -80,7 +80,7 @@ MicroSession::MicroSession(
     size_t workspace_size,
     uint64_t stack_start,
     size_t stack_size,
-    size_t word_size,
+    TargetWordSize word_size,
     bool thumb_mode,
     bool use_device_timer,
     const std::string& server_addr,
@@ -90,7 +90,6 @@ MicroSession::MicroSession(
       thumb_mode_(thumb_mode),
       use_device_timer_(use_device_timer),
       batch_args_encoder_(args_size, word_size) {
-  CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_;
   if (comms_method == "host") {
     // TODO(weberlo): move checks to python
     CHECK(
@@ -105,11 +104,11 @@ MicroSession::MicroSession(
     size_t memory_size =
       text_size + rodata_size + data_size + bss_size +
       args_size + heap_size + workspace_size + stack_size;
-    void* base_addr;
+    TargetPtr base_addr;
     low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr);
-    CHECK_EQ(reinterpret_cast<std::uintptr_t>(base_addr) % word_size_, 0)
-      << "base address not aligned to " << word_size_ << " bytes";
-    TargetPtr curr_addr = TargetPtr(reinterpret_cast<std::uintptr_t>(base_addr));
+    CHECK_EQ(base_addr.value().uint64() % word_size.bytes(), 0)
+      << "base address not aligned to " << word_size.bytes() << " bytes";
+    TargetPtr curr_addr = base_addr;
 
     section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
       "text",
@@ -172,49 +171,49 @@ MicroSession::MicroSession(
     section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
       "text",
       DevMemRegion {
-        .start = TargetPtr(text_start),
+        .start = TargetPtr(word_size_, text_start),
         .size = text_size,
       }, word_size_);
     section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
       "rodata",
       DevMemRegion {
-        .start = TargetPtr(rodata_start),
+        .start = TargetPtr(word_size_, rodata_start),
         .size = rodata_size,
       }, word_size_);
     section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
       "data",
       DevMemRegion {
-        .start = TargetPtr(data_start),
+        .start = TargetPtr(word_size_, data_start),
         .size = data_size,
       }, word_size_);
     section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
       "bss",
       DevMemRegion {
-        .start = TargetPtr(bss_start),
+        .start = TargetPtr(word_size_, bss_start),
         .size = bss_size,
       }, word_size_);
     section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
       "args",
       DevMemRegion {
-        .start = TargetPtr(args_start),
+        .start = TargetPtr(word_size_, args_start),
         .size = args_size,
       }, word_size_);
     section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
       "heap",
       DevMemRegion {
-        .start = TargetPtr(heap_start),
+        .start = TargetPtr(word_size_, heap_start),
         .size = heap_size,
       }, word_size_);
     section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
       "workspace",
       DevMemRegion {
-        .start = TargetPtr(workspace_start),
+        .start = TargetPtr(word_size_, workspace_start),
         .size = workspace_size,
       }, word_size_);
     section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
       "stack",
       DevMemRegion {
-        .start = TargetPtr(stack_start),
+        .start = TargetPtr(word_size_, stack_start),
         .size = stack_size,
       }, word_size_);
   } else {
@@ -229,17 +228,14 @@ MicroSession::MicroSession(
   // Patch pointers to define the bounds of the workspace section and the word
   // size (for allocation alignment).
   std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace);
-  TargetVal ws_start(word_size_ * 8, ws_allocator->start_addr().value().uint64());
-  TargetVal ws_end(word_size_ * 8, ws_allocator->max_addr().value().uint64());
-  TargetVal target_word_size(word_size_ * 8, word_size_);
-  if (word_size_ == 4) {
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint32());
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint32());
-    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint32());
-  } else if (word_size_ == 8) {
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint64());
-    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint64());
-    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint64());
+  DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_allocator->start_addr());
+  DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_allocator->max_addr());
+  if (word_size.bytes() == 4) {
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint32_t(word_size.bytes()));
+  } else if (word_size.bytes() == 8) {
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint64_t(word_size.bytes()));
+  } else {
+    CHECK(false) << "Unsupported word size unexpectedly here";
   }
 }
 
@@ -258,8 +254,8 @@ void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) {
   TargetVal func_dev_addr = func_ptr.value();
 
   std::tuple<TargetPtr, TargetPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
-  TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).cast_to<uint64_t>()};
-  TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).cast_to<uint64_t>()};
+  TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).value()};
+  TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).value()};
 
   task_queue_.push_back(
       DevTask {
@@ -279,9 +275,9 @@ void MicroSession::FlushTaskQueue() {
     // nothing to run
     return;
   }
-  if (word_size_ == 4) {
+  if (word_size_.bytes() == 4) {
     FlushTaskQueuePriv<StructUTVMTask32>();
-  } else if (word_size_ == 8) {
+  } else if (word_size_.bytes() == 8) {
     FlushTaskQueuePriv<StructUTVMTask64>();
   }
 }
@@ -389,9 +385,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
   rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
   data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
   bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
-  CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
-        data_section.start != nullptr && bss_section.start != nullptr)
-      << "not enough space to load module on device";
 
   std::string relocated_bin = RelocateBinarySections(
       binary_path,
@@ -411,7 +404,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
   low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
   low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
   low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
-  SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
+  SymbolMap symbol_map {relocated_bin, toolchain_prefix_, word_size_};
 
   if (patch_dylib_pointers) {
     // Patch device lib pointers.
@@ -447,12 +440,13 @@ std::tuple<TargetPtr, TargetPtr> MicroSession::EncoderAppend(
         // order to prevent premature session destruction.
         void* old_data = base_arr_handle->data;
         // Mutate the array to unwrap the `data` field.
-        base_arr_handle->data = reinterpret_cast<MicroDevSpace*>(old_data)->data;
+        MicroDevSpace* dev_arr_ptr = reinterpret_cast<MicroDevSpace*>(old_data);
+        base_arr_handle->data = reinterpret_cast<void*>(dev_arr_ptr->data.value().uint64());
         // Now, encode the unwrapped version.
         void* arr_ptr = nullptr;
-        if (word_size_ == 4) {
+        if (word_size_.bytes() == 4) {
           arr_ptr = EncoderAppend<TVMArray32>(encoder, *base_arr_handle).cast_to<void*>();
-        } else if (word_size_ == 8) {
+        } else if (word_size_.bytes() == 8) {
           arr_ptr = EncoderAppend<TVMArray64>(encoder, *base_arr_handle).cast_to<void*>();
         }
         // And restore the original wrapped version.
@@ -486,7 +480,7 @@ TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DL
   // is a device pointer, so we don't need to write it.
   shape_slot.WriteArray(arr.shape, arr.ndim);
   TargetPtr shape_dev_addr = shape_slot.start_addr();
-  TargetPtr strides_dev_addr = TargetPtr(nullptr);
+  TargetPtr strides_dev_addr = TargetPtr(word_size_, nullptr);
   if (arr.strides != nullptr) {
     auto stride_slot = encoder->Alloc<int64_t>(arr.ndim);
     stride_slot.WriteArray(arr.strides, arr.ndim);
@@ -494,13 +488,13 @@ TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DL
   }
 
   T dev_arr(
-      TargetVal { word_size_ * 8, reinterpret_cast<uint64_t>(arr.data) },
+      TargetVal { word_size_.bits(), reinterpret_cast<uint64_t>(arr.data) },
       arr.ctx,
       arr.ndim,
       arr.dtype,
       shape_dev_addr.value(),
       strides_dev_addr.value(),
-      TargetVal { word_size_ * 8, arr.byte_offset });
+      TargetVal { word_size_.bits(), arr.byte_offset });
   CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev))
     << "attempt to write DLTensor with non-micro device type";
   // Update the device type to CPU, because from the microcontroller's
@@ -570,11 +564,7 @@ void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string&
   }
   std::ostringstream func_name_underscore;
   func_name_underscore << func_name << "_";
-  if (word_size_ == 4) {
-    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint32());
-  } else if (word_size_ == 8) {
-    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint64());
-  }
+  DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr);
 }
 
 std::string MicroSession::ReadString(TargetPtr str_addr) {
@@ -611,6 +601,18 @@ T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& sy
   return result;
 }
 
+void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
+                                  const std::string& symbol,
+                                  const TargetPtr& ptr) {
+  if (word_size_.bytes() == 4) {
+    DevSymbolWrite(symbol_map, symbol, ptr.value().uint32());
+  } else if (word_size_.bytes() == 8) {
+    DevSymbolWrite(symbol_map, symbol, ptr.value().uint64());
+  } else {
+    CHECK(false) << "Unsupported word size unexpectedly here";
+  }
+}
+
 template <typename T>
 void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
                                   const std::string& symbol,
@@ -701,7 +703,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
     size_t workspace_size = uint64_t(args[16]);
     uint64_t stack_start = args[17];
     size_t stack_size = uint64_t(args[18]);
-    size_t word_size = uint64_t(args[19]);
+    TargetWordSize word_size{uint64_t(args[19])};
     bool thumb_mode = args[20];
     bool use_device_timer = args[21];
     const std::string& server_addr = args[22];
diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h
index f7b05ae211e9..bf0996c3e8c4 100644
--- a/src/runtime/micro/micro_session.h
+++ b/src/runtime/micro/micro_session.h
@@ -99,7 +99,7 @@ class MicroSession : public ModuleNode {
    * \param workspace_size workspace section size
    * \param stack_start stack section start address
    * \param stack_size stack section size
-   * \param word_size number of bytes in a word on the target device
+   * \param word_size_bytes number of bytes in a word on the target device
    * \param thumb_mode whether the target device requires a thumb-mode bit on function addresses
    * \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`)
    * \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`)
@@ -124,7 +124,7 @@ class MicroSession : public ModuleNode {
       size_t workspace_size,
       uint64_t stack_start,
       size_t stack_size,
-      size_t word_size,
+      TargetWordSize word_size,
       bool thumb_mode,
       bool use_device_timer,
       const std::string& server_addr,
@@ -196,6 +196,16 @@ class MicroSession : public ModuleNode {
   template <typename T>
   T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol);
 
+  /*!
+   * \brief write pointer value into device memory corresponding to symbol
+  * \param symbol_map symbol map to read location of symbol from
+  * \param symbol name of symbol being written to
+  * \param ptr pointer value to write into symbol
+   */
+  void DevSymbolWrite(const SymbolMap& symbol_map,
+                      const std::string& symbol,
+                      const TargetPtr& ptr);
+
   /*!
   * \brief write value into device memory corresponding to symbol
   * \param symbol_map symbol map to read location of symbol from
@@ -235,7 +245,7 @@ class MicroSession : public ModuleNode {
   std::shared_ptr<MicroSectionAllocator>
       section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
   /*! \brief number of bytes in a word on the target device */
-  size_t word_size_;
+  TargetWordSize word_size_;
   /*! \brief whether the target device requires a thumb-mode bit on function addresses
    *
    * ARM and other manufacturers use the lowest bit of a function address to determine
@@ -251,11 +261,11 @@ class MicroSession : public ModuleNode {
   std::vector<DevTask> task_queue_;
   // TODO(weberlo): we don't even need an allocator mechanism for the args
   // section. there's only ever one allocation.
-  /*! \brief TODO fukn hack */
+  /*! \brief TODO hack */
   TargetDataLayoutEncoder batch_args_encoder_;
-  /*! \brief TODO fukn hack */
+  /*! \brief TODO hack */
   double last_batch_time_;
-  /*! \brief TODO fukn hack */
+  /*! \brief TODO hack */
   double last_batch_cycles_;
 
   /*!
@@ -317,7 +327,7 @@ class MicroSession : public ModuleNode {
  */
 struct MicroDevSpace {
   /*! \brief data being wrapped */
-  void* data;
+  TargetPtr data;
   /*! \brief shared ptr to session where this data is valid */
   ObjectPtr<MicroSession> session;
 };
diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc
index d1f279f6120f..68c036a89301 100644
--- a/src/runtime/micro/openocd_low_level_device.cc
+++ b/src/runtime/micro/openocd_low_level_device.cc
@@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     socket_.SendCommand();
   }
 
-  void Read(TargetPtr addr, void* buf, size_t num_bytes) {
+  void Read(TargetPtr addr, void* buf, size_t num_bytes) override {
     if (num_bytes == 0) {
       return;
     }
@@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     }
   }
 
-  void Write(TargetPtr addr, const void* buf, size_t num_bytes) {
+  void Write(TargetPtr addr, const void* buf, size_t num_bytes) override {
     if (num_bytes == 0) {
       return;
     }
@@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
     }
   }
 
-  void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) {
+  void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) override {
     socket_.cmd_builder() << "halt 0";
     socket_.SendCommand();
 
diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h
index ea682e261348..e30c72b79fb5 100644
--- a/src/runtime/micro/target_data_layout_encoder.h
+++ b/src/runtime/micro/target_data_layout_encoder.h
@@ -96,8 +96,10 @@ class TargetDataLayoutEncoder {
    * \brief constructor
    * \param start_addr start address of the encoder in device memory
    */
-  explicit TargetDataLayoutEncoder(size_t capacity, size_t word_size)
-      : buf_(std::vector<uint8_t>()), curr_offset_(0), capacity_(capacity), word_size_(word_size) {
+  explicit TargetDataLayoutEncoder(size_t capacity, TargetWordSize word_size)
+      : buf_(std::vector<uint8_t>()), curr_offset_(0),
+        start_addr_(word_size, nullptr),
+        capacity_(capacity), word_size_(word_size) {
   }
 
   /*!
@@ -107,7 +109,7 @@ class TargetDataLayoutEncoder {
    */
   template <typename T>
   Slot<T> Alloc(size_t num_elems = 1) {
-    curr_offset_ = UpperAlignValue(curr_offset_, word_size_);
+    curr_offset_ = UpperAlignValue(curr_offset_, word_size_.bytes());
     size_t size = sizeof(T) * num_elems;
     if (curr_offset_ + size > buf_.size()) {
       buf_.resize(curr_offset_ + size);
@@ -146,7 +148,8 @@ class TargetDataLayoutEncoder {
 
   void set_start_addr(TargetPtr start_addr) {
     CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty";
-    start_addr_ = TargetPtr(UpperAlignValue(start_addr.value().uint64(), word_size_));
+    start_addr_ = TargetPtr(word_size_,
+                            UpperAlignValue(start_addr.value().uint64(), word_size_.bytes()));
   }
 
  private:
@@ -159,7 +162,7 @@ class TargetDataLayoutEncoder {
   /*! \brief TODO */
   size_t capacity_;
   /*! \brief number of bytes in a word on the target device */
-  size_t word_size_;
+  TargetWordSize word_size_;
 };
 
 template <typename T>
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
index 245692337bc3..13a83393a912 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
@@ -14,4 +14,3 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-

From 03782a81c146c9f2044557869587adfbeac4c904 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 24 Apr 2020 23:29:48 +0000
Subject: [PATCH 07/11] fix compile warning

---
 src/runtime/micro/micro_session.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 2cfde7e990a1..672f16dc06f9 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -651,14 +651,14 @@ TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
   PackedFunc pf = args[0];
   TVMContext ctx = args[1];
-  int number = args[2];
-  int repeat = args[3];
+  uint64_t number = args[2];
+  uint64_t repeat = args[3];
 
   auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
 
-    for (int i = 0; i < repeat; ++i) {
+    for (unsigned int i = 0; i < repeat; ++i) {
       // start timing
       CHECK(number < MicroSession::kTaskQueueCapacity)
         << "`number` must be less than uTVM task queue capacity";

From d351b9c0097cb8a6d8461adae41bebdb09bab773 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 24 Apr 2020 23:41:22 +0000
Subject: [PATCH 08/11] address logan's comments

---
 src/runtime/micro/openocd_low_level_device.cc           | 4 ++--
 topi/python/topi/arm_cpu/conv2d_spatial_pack.py         | 2 +-
 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc
index 68c036a89301..0f21d6674740 100644
--- a/src/runtime/micro/openocd_low_level_device.cc
+++ b/src/runtime/micro/openocd_low_level_device.cc
@@ -207,8 +207,8 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
 
   /*! \brief number of bytes in a word on the target device (64-bit) */
   static const constexpr ssize_t kWordSize = 8;
-  // NOTE: OpenOCD will call any request larger than this constant an "absurd
-  // request".
+  // NOTE: The OS pipe buffer must be able to handle a line long enough to
+  // print this transfer request.
   /*! \brief maximum number of bytes allowed in a single memory transfer */
   static const constexpr ssize_t kMemTransferLimit = 8000;
   /*! \brief number of milliseconds to wait for function execution to halt */
diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
index b4f8e7cf836d..a4d7ad83b1c8 100644
--- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
@@ -173,7 +173,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
                                  axis_lens=[cfg['tile_oh'].size[-1],
                                             cfg['tile_ow'].size[-1],
                                             cfg['tile_co'].size[-1]],
-                                 max_unroll=None,
+                                 max_unroll=16,
                                  cfg=cfg)
     s[conv].compute_at(s[last], ow)
 
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
index 70cd46bb32c2..2169176c0711 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -34,7 +34,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
     # collisions in the generated source (e.g., if there are multiple operators
     # in the same module that use the same intrinsic)
     #
-    # TODO to cut down on memory usage, we should cache each intrinsic
+    # TODO(areusch): to cut down on memory usage, we should cache each intrinsic
     # instantiation and include it only once, eliminating the need for unique
     # IDs
     UNIQ_ID_LEN = 8
@@ -47,7 +47,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
     if isinstance(N, tvm.tir.IntImm):
         N = N.value
     assert K % 4 == 0
-    # TODO support more dtypes?
+    # TODO(areusch): support more dtypes?
     assert in_dtype == 'int8'
     assert out_dtype == 'int32'
     A = te.placeholder((M, K), name='a', dtype=in_dtype)
@@ -124,7 +124,7 @@ def _body():
 
 def gemm_MxKxN_impl(M, K, N, uniq_id):
     """Emit C code for gemm impl."""
-    # TODO are there any SIMD tricks to zero out arrays quickly?
+    # TODO(areusch): are there any SIMD tricks to zero out arrays quickly?
     aa_pad_size = M * K
     bb_pad_size = N * K
     # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)

From 4d426ab3a33b22d155579b49377675afac94a51c Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 27 Apr 2020 10:26:20 -0700
Subject: [PATCH 09/11] address logan and liangfu comments

---
 Makefile                                      |   2 -
 python/tvm/autotvm/tuner/callback.py          |   4 +-
 python/tvm/contrib/binutil.py                 |  80 +++++-----
 python/tvm/contrib/debugger/debug_runtime.py  |   1 -
 python/tvm/exec/rpc_server.py                 |   2 +-
 python/tvm/micro/base.py                      | 138 +++++++++---------
 python/tvm/micro/device/arm/stm32f746xx.py    |  74 +++++-----
 python/tvm/micro/device/base.py               |  66 ++++-----
 python/tvm/micro/device/host.py               |  46 +++---
 python/tvm/micro/device/riscv_spike.py        |  42 +++---
 python/tvm/relay/_parser.py                   |   5 +-
 python/tvm/relay/op/strategy/arm_cpu.py       |   4 +-
 python/tvm/target/arm_isa.py                  |   2 +-
 src/ir/error.cc                               |  17 +--
 .../host_driven/utvm_device_dylib_redirect.c  |  15 +-
 src/runtime/micro/host_driven/utvm_runtime.c  |  10 +-
 src/runtime/micro/host_driven/utvm_runtime.h  |  18 ++-
 src/runtime/micro/micro_common.h              |  30 ++--
 .../micro/target_data_layout_encoder.h        |   8 +-
 src/target/source/codegen_c_host.cc           |   4 +-
 .../topi/arm_cpu/cortex_m7/conv2d/direct.py   |   4 +-
 .../arm_cpu/cortex_m7/micro_kernel/gemm.py    |   6 +-
 22 files changed, 288 insertions(+), 290 deletions(-)

diff --git a/Makefile b/Makefile
index 64a127346e53..e54b9a93b230 100644
--- a/Makefile
+++ b/Makefile
@@ -73,9 +73,7 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
 cpplint:
 	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
 	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
-	# Note: exclude src/runtime/micro/host_driven becuase it contains C99 files.
 	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \
-	 --exclude_path=src/runtime/micro/host_driven \
 	 include src \
 	 examples/extension/src examples/graph_executor/src
 
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index f2e608088510..1ae64e2ddfef 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -144,10 +144,10 @@ def __del__(self):
     def _callback(tuner, inputs, results):
         ctx.ct += len(inputs)
 
-        flops = 0
+        flops = float("inf")
         for inp, res in zip(inputs, results):
             if res.error_no == 0:
-                flops = inp.task.flop / np.mean(res.costs)
+                flops = min(inp.task.flop / np.mean(res.costs), flops)
 
         if logger.level > logging.DEBUG:  # only print progress bar in non-debug mode
             ctx.cur_flops = flops
diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py
index 2b18d645dbc0..21e06df9f7f0 100644
--- a/python/tvm/contrib/binutil.py
+++ b/python/tvm/contrib/binutil.py
@@ -120,18 +120,18 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
         size of the section in bytes
     """
     if not os.path.isfile(binary_path):
-        raise RuntimeError('no such file \"{}\"'.format(binary_path))
+        raise RuntimeError('no such file "{}"'.format(binary_path))
     # We use the "-A" flag here to get the ".rodata" section's size, which is
     # not included by default.
-    size_output = run_cmd(['{}size'.format(toolchain_prefix), '-A', binary_path])
+    size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path])
 
     # TODO(weberlo): Refactor this method and `*relocate_binary` so they are
     # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss".
     section_mapping = {
-        '.text': ['.text'],
-        '.rodata': ['.rodata'],
-        '.data': ['.data', '.sdata'],
-        '.bss': ['.bss', '.sbss'],
+        ".text": [".text"],
+        ".rodata": [".rodata"],
+        ".data": [".data", ".sdata"],
+        ".bss": [".bss", ".sbss"],
     }
     sections_to_sum = section_mapping["." + section_name]
     section_size = 0
@@ -150,7 +150,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
     # NOTE: For some reason, the size of the BSS section on the RISC-V
     # GCC is sometimes reported to be smaller than it is, so we need to adjust
     # for this.
-    if 'riscv' in toolchain_prefix and section_name == 'bss':
+    if "riscv" in toolchain_prefix and section_name == "bss":
         # TODO(weberlo): Figure out why 32 is the minimum constant that works.
         #
         # The current hypothesis is that the last symbols in the ".bss" and
@@ -214,10 +214,10 @@ def tvm_callback_relocate_binary(
     """
     assert text_start < rodata_start < data_start < bss_start < stack_end
     stack_pointer_init = stack_end - word_size
-    ld_script_contents = ''
+    ld_script_contents = ""
     # TODO(weberlo): There should be a better way to configure this for different archs.
     # TODO is this line even necessary?
-    if 'riscv' in toolchain_prefix:
+    if "riscv" in toolchain_prefix:
         ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n'
     ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format(
         word_size=word_size,
@@ -228,31 +228,31 @@ def tvm_callback_relocate_binary(
         stack_pointer_init=stack_pointer_init)
 
     tmp_dir = util.tempdir()
-    rel_obj_path = tmp_dir.relpath('relocated.obj')
-    rel_ld_script_path = tmp_dir.relpath('relocate.lds')
-    with open(rel_ld_script_path, 'w') as f:
+    rel_obj_path = tmp_dir.relpath("relocated.obj")
+    rel_ld_script_path = tmp_dir.relpath("relocate.lds")
+    with open(rel_ld_script_path, "w") as f:
         f.write(ld_script_contents)
     run_cmd([
-        '{}ld'.format(toolchain_prefix),
+        "{}ld".format(toolchain_prefix),
         binary_path,
-        '-T', rel_ld_script_path,
-        '-o', rel_obj_path])
+        "-T", rel_ld_script_path,
+        "-o", rel_obj_path])
 
-    with open(rel_obj_path, 'rb') as f:
+    with open(rel_obj_path, "rb") as f:
         rel_bin = bytearray(f.read())
 
-    gdb_init_dir = os.environ.get('MICRO_GDB_INIT_DIR')
+    gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR")
     if gdb_init_dir is not None:
-        gdb_init_path = f'{gdb_init_dir}/.gdbinit'
-        with open(gdb_init_path, 'r') as f:
-            gdbinit_contents = f.read().split('\n')
+        gdb_init_path = f"{gdb_init_dir}/.gdbinit"
+        with open(gdb_init_path, "r") as f:
+            gdbinit_contents = f.read().split("\n")
         new_contents = []
         for line in gdbinit_contents:
             new_contents.append(line)
-            if line.startswith('target'):
-                new_contents.append(f'add-symbol-file {rel_obj_path}')
-        with open(gdb_init_path, 'w') as f:
-            f.write('\n'.join(new_contents))
+            if line.startswith("target"):
+                new_contents.append(f"add-symbol-file {rel_obj_path}")
+        with open(gdb_init_path, "w") as f:
+            f.write("\n".join(new_contents))
 
     return rel_bin
 
@@ -278,22 +278,22 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix):
         contents of the read section
     """
     tmp_dir = util.tempdir()
-    tmp_bin = tmp_dir.relpath('temp.bin')
-    tmp_section = tmp_dir.relpath('tmp_section.bin')
-    with open(tmp_bin, 'wb') as out_file:
+    tmp_bin = tmp_dir.relpath("temp.bin")
+    tmp_section = tmp_dir.relpath("tmp_section.bin")
+    with open(tmp_bin, "wb") as out_file:
         out_file.write(bytes(binary))
     run_cmd([
-        '{}objcopy'.format(toolchain_prefix),
-        '--dump-section',
-        '.{}={}'.format(section, tmp_section),
+        "{}objcopy".format(toolchain_prefix),
+        "--dump-section",
+        ".{}={}".format(section, tmp_section),
         tmp_bin])
     if os.path.isfile(tmp_section):
         # Get section content if it exists.
-        with open(tmp_section, 'rb') as f:
+        with open(tmp_section, "rb") as f:
             section_bin = bytearray(f.read())
     else:
         # Return empty bytearray if the section does not exist.
-        section_bin = bytearray('', 'utf-8')
+        section_bin = bytearray("", "utf-8")
     return section_bin
 
 
@@ -316,18 +316,18 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix):
         alternating newline-separated keys and values
     """
     tmp_dir = util.tempdir()
-    tmp_obj = tmp_dir.relpath('tmp_obj.bin')
-    with open(tmp_obj, 'wb') as out_file:
+    tmp_obj = tmp_dir.relpath("tmp_obj.bin")
+    with open(tmp_obj, "wb") as out_file:
         out_file.write(bytes(binary))
     nm_output = run_cmd([
-        '{}nm'.format(toolchain_prefix),
-        '-C',
-        '--defined-only',
+        "{}nm".format(toolchain_prefix),
+        "-C",
+        "--defined-only",
         tmp_obj])
     nm_output = nm_output.splitlines()
-    map_str = ''
+    map_str = ""
     for line in nm_output:
         line = line.split()
-        map_str += line[2] + '\n'
-        map_str += line[0] + '\n'
+        map_str += line[2] + "\n"
+        map_str += line[0] + "\n"
     return map_str
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index fd838c407617..848d7f57d1de 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -181,7 +181,6 @@ def _run_debug(self):
         """
         self.debug_datum._time_list = [
             [float(t) * 1e-6] for t in self.run_individual(10, 1, 1)
-            #[float(t) * 1e-6] for t in self.run_individual(1, 1, 1)
         ]
         for i, node in enumerate(self.debug_datum.get_graph_nodes()):
             num_outputs = self.debug_datum.get_graph_node_output_num(node)
diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py
index dd275a3bad4d..805c3a8afb1a 100644
--- a/python/tvm/exec/rpc_server.py
+++ b/python/tvm/exec/rpc_server.py
@@ -42,7 +42,7 @@ def main(args):
         tracker_addr = (url, port)
         if not args.key:
             raise RuntimeError(
-                "Need key to present type of resource when tracker is available")
+                'Need key to present type of resource when tracker is available')
     else:
         tracker_addr = None
 
diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py
index c34dd728841a..bf4fd0ac9b76 100644
--- a/python/tvm/micro/base.py
+++ b/python/tvm/micro/base.py
@@ -32,14 +32,14 @@
 # all sections that comprise a device's memory layout, in order from lowest
 # starting address to highest
 DEVICE_SECTIONS = [
-    'text',
-    'rodata',
-    'data',
-    'bss',
-    'args',
-    'heap',
-    'workspace',
-    'stack',
+    "text",
+    "rodata",
+    "data",
+    "bss",
+    "args",
+    "heap",
+    "workspace",
+    "stack",
 ]
 
 class LibType(Enum):
@@ -74,75 +74,75 @@ def __init__(self, config):
         # TODO(weberlo): add config validation
 
         # grab a binutil instance from the ID in the config
-        dev_funcs = tvm.micro.device.get_device_funcs(config['device_id'])
-        self.toolchain_prefix = config['toolchain_prefix']
-        self.mem_layout = config['mem_layout']
-        self.word_size_bits = config['word_size_bits']
-        self.thumb_mode = config['thumb_mode']
-        self.use_device_timer = config['use_device_timer']
-        self.comms_method = config['comms_method']
+        dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"])
+        self.toolchain_prefix = config["toolchain_prefix"]
+        self.mem_layout = config["mem_layout"]
+        self.word_size_bits = config["word_size_bits"]
+        self.thumb_mode = config["thumb_mode"]
+        self.use_device_timer = config["use_device_timer"]
+        self.comms_method = config["comms_method"]
 
         # First, find and compile runtime library.
-        runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c')
+        runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c")
         tmp_dir = _util.tempdir()
-        runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj')
-        options = ['-I{}'.format(get_micro_host_driven_dir())]
-        dev_funcs['create_micro_lib'](
+        runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj")
+        options = ["-I{}".format(get_micro_host_driven_dir())]
+        dev_funcs["create_micro_lib"](
             runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options)
 
-        comms_method = config['comms_method']
-        if comms_method == 'openocd':
-            server_addr = config['server_addr']
-            server_port = config['server_port']
-        elif comms_method == 'host':
-            server_addr = ''
+        comms_method = config["comms_method"]
+        if comms_method == "openocd":
+            server_addr = config["server_addr"]
+            server_port = config["server_port"]
+        elif comms_method == "host":
+            server_addr = ""
             server_port = 0
         else:
-            raise RuntimeError(f'unknown communication method: f{self.comms_method}')
+            raise RuntimeError(f"unknown communication method: f{self.comms_method}")
 
         assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \
-            'not all sections have an assigned memory layout'
+            "not all sections have an assigned memory layout"
         self.module = _CreateSession(
             comms_method,
             runtime_obj_path,
             self.toolchain_prefix,
-            self.mem_layout['text'].get('start', 0),
-            self.mem_layout['text']['size'],
-            self.mem_layout['rodata'].get('start', 0),
-            self.mem_layout['rodata']['size'],
-            self.mem_layout['data'].get('start', 0),
-            self.mem_layout['data']['size'],
-            self.mem_layout['bss'].get('start', 0),
-            self.mem_layout['bss']['size'],
-            self.mem_layout['args'].get('start', 0),
-            self.mem_layout['args']['size'],
-            self.mem_layout['heap'].get('start', 0),
-            self.mem_layout['heap']['size'],
-            self.mem_layout['workspace'].get('start', 0),
-            self.mem_layout['workspace']['size'],
-            self.mem_layout['stack'].get('start', 0),
-            self.mem_layout['stack']['size'],
+            self.mem_layout["text"].get("start", 0),
+            self.mem_layout["text"]["size"],
+            self.mem_layout["rodata"].get("start", 0),
+            self.mem_layout["rodata"]["size"],
+            self.mem_layout["data"].get("start", 0),
+            self.mem_layout["data"]["size"],
+            self.mem_layout["bss"].get("start", 0),
+            self.mem_layout["bss"]["size"],
+            self.mem_layout["args"].get("start", 0),
+            self.mem_layout["args"]["size"],
+            self.mem_layout["heap"].get("start", 0),
+            self.mem_layout["heap"]["size"],
+            self.mem_layout["workspace"].get("start", 0),
+            self.mem_layout["workspace"]["size"],
+            self.mem_layout["stack"].get("start", 0),
+            self.mem_layout["stack"]["size"],
             self.word_size_bits,
             self.thumb_mode,
             self.use_device_timer,
             server_addr,
             server_port)
-        self._enter = self.module['enter']
-        self._exit = self.module['exit']
-        self.get_last_batch_time = self.module['get_last_batch_time']
-        self.get_last_batch_cycles = self.module['get_last_batch_cycles']
+        self._enter = self.module["enter"]
+        self._exit = self.module["exit"]
+        self.get_last_batch_time = self.module["get_last_batch_time"]
+        self.get_last_batch_cycles = self.module["get_last_batch_cycles"]
 
     def _check_system(self):
         """Check if the user's system is supported by MicroTVM.
 
         Raises error if not supported.
         """
-        if not sys.platform.startswith('linux'):
-            raise RuntimeError('MicroTVM is currently only supported on Linux')
+        if not sys.platform.startswith("linux"):
+            raise RuntimeError("MicroTVM is currently only supported on Linux")
         # TODO(weberlo): Add 32-bit support.
         # It's primarily the compilation pipeline that isn't compatible.
         if sys.maxsize <= 2**32:
-            raise RuntimeError('MicroTVM is currently only supported on 64-bit host platforms')
+            raise RuntimeError("MicroTVM is currently only supported on 64-bit host platforms")
 
     def __enter__(self):
         self._enter()
@@ -159,8 +159,8 @@ def _calc_max_workspace_usage(src):
     free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*')
     max_usage = 0
     alloc_map = {}
-    for line in src.split('\n'):
-        if line.strip().startswith('//'):
+    for line in src.split("\n"):
+        if line.strip().startswith("//"):
             continue
         match = alloc_re.match(line)
         if match is not None:
@@ -198,7 +198,7 @@ def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None,
         micro module for the target device
     """
     temp_dir = _util.tempdir()
-    lib_obj_path = temp_dir.relpath('dev_lib.obj')
+    lib_obj_path = temp_dir.relpath("dev_lib.obj")
     # TODO use dev config to dispatch on the type of C codegen to run through
     # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV)
     c_mod.export_library(
@@ -252,7 +252,7 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None,
       c_mod.export_library('dev_lib.obj', fcompile=fcompile)
     """
     assert (lib_headers is None) == (lib_include_paths is None), \
-        'must specify both `lib_headers` and `lib_include_paths` or neither'
+        "must specify both `lib_headers` and `lib_include_paths` or neither"
 
     if lib_src_paths is None:
         lib_src_paths = []
@@ -260,39 +260,39 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None,
         lib_include_paths = []
     include_options = []
     for include_path in lib_include_paths:
-        include_options.append('-I')
+        include_options.append("-I")
         include_options.append(include_path)
     create_micro_lib = tvm.micro.device.get_device_funcs(
-        dev_config['device_id'])['create_micro_lib']
-    mem_layout = dev_config['mem_layout']
+        dev_config["device_id"])["create_micro_lib"]
+    mem_layout = dev_config["mem_layout"]
 
     def compile_func(obj_path, src_path, **kwargs):
         if isinstance(obj_path, list):
             obj_path = obj_path[0]
         if isinstance(src_path, list):
             src_path = src_path[0]
-        options = kwargs.get('options', [])
+        options = kwargs.get("options", [])
         options += include_options
 
         # check that workspace allocations don't exceed available workspace memory
         with open(src_path) as f:
             src_contents = f.read()
             max_ws_usage = _calc_max_workspace_usage(src_contents)
-            available_mem = mem_layout['workspace']['size']
+            available_mem = mem_layout["workspace"]["size"]
             if max_ws_usage > available_mem:
-                raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) '
-                                   f'exceed available memory ({available_mem})')
+                raise RuntimeError(f"workspace allocations in library ({max_ws_usage}) "
+                                   f"exceed available memory ({available_mem})")
         # inject headers into new source path, if requested
         if lib_headers:
-            headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n'
+            headers_to_inject = "\n".join(map(lambda s: f"#include <{s}>", lib_headers)) + "\n"
             new_src_contents = headers_to_inject + src_contents
             tmp_dir = _util.tempdir()
             src_path = tmp_dir.relpath(os.path.basename(src_path))
-            with open(src_path, 'w') as f:
+            with open(src_path, "w") as f:
                 f.write(new_src_contents)
 
         create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths)
-    return _cc.cross_compiler(compile_func, output_format='obj')
+    return _cc.cross_compiler(compile_func, output_format="obj")
 
 
 def get_micro_host_driven_dir():
@@ -304,8 +304,8 @@ def get_micro_host_driven_dir():
         directory path
     """
     micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    micro_host_driven_dir = os.path.join(micro_dir, '..', '..', '..',
-                                         'src', 'runtime', 'micro', 'host_driven')
+    micro_host_driven_dir = os.path.join(micro_dir, "..", "..", "..",
+                                         "src", "runtime", "micro", "host_driven")
     return micro_host_driven_dir
 
 
@@ -318,9 +318,9 @@ def get_micro_device_dir():
         directory path
     """
     micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    micro_device_dir = os.path.join(micro_dir, '..', '..', '..',
-                                    'src', 'runtime', 'micro', 'device')
+    micro_device_dir = os.path.join(micro_dir, "..", "..", "..",
+                                    "src", "runtime", "micro", "device")
     return micro_device_dir
 
 
-tvm._ffi._init_api('tvm.micro', 'tvm.micro.base')
+tvm._ffi._init_api("tvm.micro", "tvm.micro.base")
diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py
index 9a2b46ac0cd7..6d4b8f74aacb 100644
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
@@ -18,8 +18,8 @@
 import os
 from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = 'arm.stm32f746xx'
-TOOLCHAIN_PREFIX = 'arm-none-eabi-'
+DEVICE_ID = "arm.stm32f746xx"
+TOOLCHAIN_PREFIX = "arm-none-eabi-"
 WORD_SIZE_BITS = 32
 #
 # [Device Memory Layout]
@@ -29,14 +29,14 @@
 BASE_ADDR = 0x20000000
 AVAILABLE_MEM = 320000
 DEFAULT_SECTION_CONSTRAINTS = {
-    'text': (18000, MemConstraint.ABSOLUTE_BYTES),
-    'rodata': (100, MemConstraint.ABSOLUTE_BYTES),
-    'data': (100, MemConstraint.ABSOLUTE_BYTES),
-    'bss': (600, MemConstraint.ABSOLUTE_BYTES),
-    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
-    'heap': (100.0, MemConstraint.WEIGHT),
-    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
-    'stack': (32, MemConstraint.ABSOLUTE_BYTES),
+    "text": (18000, MemConstraint.ABSOLUTE_BYTES),
+    "rodata": (100, MemConstraint.ABSOLUTE_BYTES),
+    "data": (100, MemConstraint.ABSOLUTE_BYTES),
+    "bss": (600, MemConstraint.ABSOLUTE_BYTES),
+    "args": (4096, MemConstraint.ABSOLUTE_BYTES),
+    "heap": (100.0, MemConstraint.WEIGHT),
+    "workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
+    "stack": (32, MemConstraint.ABSOLUTE_BYTES),
 }
 
 def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
@@ -66,22 +66,22 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
 
     options += [
         # TODO(weberlo): make a debug flag
-        '-O2',
-        '-march=armv7e-m',
-        '-mcpu=cortex-m7',
-        '-mlittle-endian',
-        '-mfloat-abi=hard',
-        '-mfpu=fpv5-sp-d16',
-        '-mthumb',
-        '-ffast-math',
-        '-gdwarf-5',
-        '-DARM_MATH_CM7',
-        '-D__FPU_PRESENT=1U',
-        '-DARM_MATH_DSP',
-        '-Wno-unused-variable',
-        '-Wno-unused-parameter',
-        '-I{}'.format(os.environ['CMSIS_ST_PATH']),
-        '-I{}/Core/Include'.format(os.environ['CMSIS_ST_PATH'])
+        "-O2",
+        "-march=armv7e-m",
+        "-mcpu=cortex-m7",
+        "-mlittle-endian",
+        "-mfloat-abi=hard",
+        "-mfpu=fpv5-sp-d16",
+        "-mthumb",
+        "-ffast-math",
+        "-gdwarf-5",
+        "-DARM_MATH_CM7",
+        "-D__FPU_PRESENT=1U",
+        "-DARM_MATH_DSP",
+        "-Wno-unused-variable",
+        "-Wno-unused-parameter",
+        "-I{}".format(os.environ["CMSIS_ST_PATH"]),
+        "-I{}/Core/Include".format(os.environ["CMSIS_ST_PATH"])
         ]
     create_micro_lib_base(
         obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
@@ -110,19 +110,19 @@ def generate_config(server_addr, server_port, section_constraints=None):
     if section_constraints is None:
         section_constraints = DEFAULT_SECTION_CONSTRAINTS
     return {
-        'device_id': DEVICE_ID,
-        'toolchain_prefix': TOOLCHAIN_PREFIX,
-        'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints),
-        'word_size_bits': WORD_SIZE_BITS,
-        'thumb_mode': True,
-        'use_device_timer': True,
-        'comms_method': 'openocd',
-        'server_addr': server_addr,
-        'server_port': server_port,
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints),
+        "word_size_bits": WORD_SIZE_BITS,
+        "thumb_mode": True,
+        "use_device_timer": True,
+        "comms_method": "openocd",
+        "server_addr": server_addr,
+        "server_port": server_port,
     }
 
 
 register_device(DEVICE_ID, {
-    'create_micro_lib': create_micro_lib,
-    'generate_config': generate_config,
+    "create_micro_lib": create_micro_lib,
+    "generate_config": generate_config,
 })
diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py
index 4d42bff8ebbc..767284c9c254 100644
--- a/python/tvm/micro/device/base.py
+++ b/python/tvm/micro/device/base.py
@@ -102,18 +102,18 @@ def create_micro_lib_base(
     # look at these (specifically `strip`):
     #   https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size
     base_compile_cmd = [
-        f'{toolchain_prefix}gcc',
-        '-std=c11',
-        '-Wall',
-        '-Wextra',
-        '--pedantic',
-        '-c',
-        '-g',
-        '-nostartfiles',
-        '-nodefaultlibs',
-        '-nostdlib',
-        '-fdata-sections',
-        '-ffunction-sections',
+        f"{toolchain_prefix}gcc",
+        "-std=c11",
+        "-Wall",
+        "-Wextra",
+        "--pedantic",
+        "-c",
+        "-g",
+        "-nostartfiles",
+        "-nodefaultlibs",
+        "-nostdlib",
+        "-fdata-sections",
+        "-ffunction-sections",
         ]
     if options is not None:
         base_compile_cmd += options
@@ -126,24 +126,24 @@ def create_micro_lib_base(
     if lib_type == LibType.RUNTIME:
         dev_dir = _get_device_source_dir(device_id)
 
-        dev_src_paths = glob.glob(f'{dev_dir}/*.[csS]')
+        dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
         # there needs to at least be a utvm_timer.c file
         assert dev_src_paths
-        assert 'utvm_timer.c' in map(os.path.basename, dev_src_paths)
+        assert "utvm_timer.c" in map(os.path.basename, dev_src_paths)
 
         src_paths += dev_src_paths
     elif lib_type == LibType.OPERATOR:
         # create a temporary copy of the operator source, so we can inject the dev lib
         # header without modifying the original.
-        temp_src_path = tmp_dir.relpath('temp.c')
-        with open(in_src_path, 'r') as f:
+        temp_src_path = tmp_dir.relpath("temp.c")
+        with open(in_src_path, "r") as f:
             src_lines = f.read().splitlines()
         src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"')
-        with open(temp_src_path, 'w') as f:
-            f.write('\n'.join(src_lines))
+        with open(temp_src_path, "w") as f:
+            f.write("\n".join(src_lines))
         new_in_src_path = temp_src_path
     else:
-        raise RuntimeError('unknown lib type')
+        raise RuntimeError("unknown lib type")
 
     src_paths += [new_in_src_path]
 
@@ -151,23 +151,23 @@ def create_micro_lib_base(
     if lib_src_paths is not None:
         src_paths += lib_src_paths
 
-    # print(f'include paths: {include_paths}')
+    # print(f"include paths: {include_paths}")
     for path in include_paths:
-        base_compile_cmd += ['-I', path]
+        base_compile_cmd += ["-I", path]
 
     prereq_obj_paths = []
     # print(src_paths)
     for src_path in src_paths:
-        curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name)
+        curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix(".o").name)
         assert curr_obj_path not in prereq_obj_paths
         prereq_obj_paths.append(curr_obj_path)
-        curr_compile_cmd = base_compile_cmd + [src_path, '-o', curr_obj_path]
+        curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
         # TODO(weberlo): make compilation fail if there are any warnings
         run_cmd(curr_compile_cmd)
 
-    ld_cmd = [f'{toolchain_prefix}ld', '-relocatable']
+    ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
     ld_cmd += prereq_obj_paths
-    ld_cmd += ['-o', out_obj_path]
+    ld_cmd += ["-o", out_obj_path]
     run_cmd(ld_cmd)
 
 
@@ -195,7 +195,7 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints
     section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
         maps section name to the quantity of available memory
     """
-    assert word_size_bits in (32, 64), 'only 32- or 64-bit devices are supported now'
+    assert word_size_bits in (32, 64), "only 32- or 64-bit devices are supported now"
     word_size_bytes = word_size_bits // 8
     byte_sum = sum(x[0]
                    for x in section_constraints.values()
@@ -212,18 +212,18 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints
         (val, cons_type) = section_constraints[section]
         if cons_type == MemConstraint.ABSOLUTE_BYTES:
             assert val % word_size_bytes == 0, \
-                f'constraint {val} for {section} section is not word-aligned'
+                f"constraint {val} for {section} section is not word-aligned"
             size = val
             res[section] = {
-                'start': curr_addr,
-                'size': size,
+                "start": curr_addr,
+                "size": size,
             }
         else:
             size = int((val / weight_sum) * available_weight_mem)
             size = (size // word_size_bytes) * word_size_bytes
             res[section] = {
-                'start': curr_addr,
-                'size': size,
+                "start": curr_addr,
+                "size": size,
             }
         curr_addr += size
 
@@ -232,5 +232,5 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints
 
 def _get_device_source_dir(device_id):
     """Grabs the source directory for device-specific uTVM files"""
-    dev_subdir = '/'.join(device_id.split('.'))
-    return get_micro_device_dir() + '/' + dev_subdir
+    dev_subdir = "/".join(device_id.split("."))
+    return get_micro_device_dir() + "/" + dev_subdir
diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py
index 55e0ac326f2d..0cf29874ab57 100644
--- a/python/tvm/micro/device/host.py
+++ b/python/tvm/micro/device/host.py
@@ -19,21 +19,21 @@
 
 from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = 'host'
-TOOLCHAIN_PREFIX = ''
+DEVICE_ID = "host"
+TOOLCHAIN_PREFIX = ""
 WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32
 
 # we pretend we only have 320kb in the default case, so we can use `gen_mem_layout`
 DEFAULT_AVAILABLE_MEM = 3200000
 DEFAULT_SECTION_CONSTRAINTS = {
-    'text': (20480, MemConstraint.ABSOLUTE_BYTES),
-    'rodata': (20480, MemConstraint.ABSOLUTE_BYTES),
-    'data': (768, MemConstraint.ABSOLUTE_BYTES),
-    'bss': (4096, MemConstraint.ABSOLUTE_BYTES),
-    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
-    'heap': (262144, MemConstraint.ABSOLUTE_BYTES),
-    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
-    'stack': (80, MemConstraint.ABSOLUTE_BYTES),
+    "text": (20480, MemConstraint.ABSOLUTE_BYTES),
+    "rodata": (20480, MemConstraint.ABSOLUTE_BYTES),
+    "data": (768, MemConstraint.ABSOLUTE_BYTES),
+    "bss": (4096, MemConstraint.ABSOLUTE_BYTES),
+    "args": (4096, MemConstraint.ABSOLUTE_BYTES),
+    "heap": (262144, MemConstraint.ABSOLUTE_BYTES),
+    "workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
+    "stack": (80, MemConstraint.ABSOLUTE_BYTES),
 }
 
 def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
@@ -61,9 +61,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
     else:
         options = list(options)
     # Cannot increase optimization level on host due to code loading method.
-    options.append('-O0')
-    if sys.maxsize > 2**32 and sys.platform.startswith('linux'):
-        options += ['-mcmodel=large']
+    options.append("-O0")
+    if sys.maxsize > 2**32 and sys.platform.startswith("linux"):
+        options += ["-mcmodel=large"]
     create_micro_lib_base(
         obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
         lib_src_paths=lib_src_paths)
@@ -98,19 +98,19 @@ def generate_config(available_mem=None, section_constraints=None):
     # need to zero out all start addresses, because they don't make sense for a
     # host device (the memory region is allocated in the backend)
     for section in mem_layout:
-        mem_layout[section]['start'] = 0
+        mem_layout[section]["start"] = 0
     return {
-        'device_id': DEVICE_ID,
-        'toolchain_prefix': TOOLCHAIN_PREFIX,
-        'mem_layout': mem_layout,
-        'word_size_bits': WORD_SIZE_BITS,
-        'thumb_mode': False,
-        'use_device_timer': False,
-        'comms_method': 'host',
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": mem_layout,
+        "word_size_bits": WORD_SIZE_BITS,
+        "thumb_mode": False,
+        "use_device_timer": False,
+        "comms_method": "host",
     }
 
 
 register_device(DEVICE_ID, {
-    'create_micro_lib': create_micro_lib,
-    'generate_config': generate_config,
+    "create_micro_lib": create_micro_lib,
+    "generate_config": generate_config,
 })
diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py
index b9f55dff0ce7..32881cab6ba9 100644
--- a/python/tvm/micro/device/riscv_spike.py
+++ b/python/tvm/micro/device/riscv_spike.py
@@ -18,19 +18,19 @@
 
 from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
 
-DEVICE_ID = 'riscv_spike'
-TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-'
+DEVICE_ID = "riscv_spike"
+TOOLCHAIN_PREFIX = "riscv64-unknown-elf-"
 WORD_SIZE_BITS = 64
 
 DEFAULT_SECTION_CONSTRAINTS = {
-    'text': (18000, MemConstraint.ABSOLUTE_BYTES),
-    'rodata': (128, MemConstraint.ABSOLUTE_BYTES),
-    'data': (128, MemConstraint.ABSOLUTE_BYTES),
-    'bss': (2048, MemConstraint.ABSOLUTE_BYTES),
-    'args': (4096, MemConstraint.ABSOLUTE_BYTES),
-    'heap': (100.0, MemConstraint.WEIGHT),
-    'workspace': (64000, MemConstraint.ABSOLUTE_BYTES),
-    'stack': (32, MemConstraint.ABSOLUTE_BYTES),
+    "text": (18000, MemConstraint.ABSOLUTE_BYTES),
+    "rodata": (128, MemConstraint.ABSOLUTE_BYTES),
+    "data": (128, MemConstraint.ABSOLUTE_BYTES),
+    "bss": (2048, MemConstraint.ABSOLUTE_BYTES),
+    "args": (4096, MemConstraint.ABSOLUTE_BYTES),
+    "heap": (100.0, MemConstraint.WEIGHT),
+    "workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
+    "stack": (32, MemConstraint.ABSOLUTE_BYTES),
 }
 
 def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
@@ -90,19 +90,19 @@ def generate_config(base_addr, available_mem, server_addr, server_port, section_
     if section_constraints is None:
         section_constraints = DEFAULT_SECTION_CONSTRAINTS
     return {
-        'device_id': DEVICE_ID,
-        'toolchain_prefix': TOOLCHAIN_PREFIX,
-        'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints),
-        'word_size_bits': WORD_SIZE_BITS,
-        'thumb_mode': False,
-        'use_device_timer': False,
-        'comms_method': 'openocd',
-        'server_addr': server_addr,
-        'server_port': server_port,
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints),
+        "word_size_bits": WORD_SIZE_BITS,
+        "thumb_mode": False,
+        "use_device_timer": False,
+        "comms_method": "openocd",
+        "server_addr": server_addr,
+        "server_port": server_port,
     }
 
 
 register_device(DEVICE_ID, {
-    'create_micro_lib': create_micro_lib,
-    'generate_config': generate_config,
+    "create_micro_lib": create_micro_lib,
+    "generate_config": generate_config,
 })
diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py
index 6a1d928352f4..7731efe04dfd 100644
--- a/python/tvm/relay/_parser.py
+++ b/python/tvm/relay/_parser.py
@@ -342,10 +342,7 @@ def visitLocalVar(self, ctx):
         return local_var
 
     def visitGraphVar(self, ctx):
-        graph_var_idx = int(ctx.NAT().getText())
-        if graph_var_idx >= len(self.graph_expr):
-            raise ParseError(f"graph var `%{graph_var_idx}` is unbound")
-        return self.graph_expr[graph_var_idx]
+        return self.graph_expr[int(ctx.NAT().getText())]
 
     def visit_list(self, ctx_list) -> List[Any]:
         """"Visit a list of contexts."""
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 2ad75533e807..6bdec67617e1 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -262,7 +262,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
         name="conv2d_tranpose_nchw.arm_cpu")
     return strategy
 
-@bitserial_conv2d_strategy.register(["arm_cpu", "micro_dev"])
+@bitserial_conv2d_strategy.register("arm_cpu")
 def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_conv2d x86 strategy"""
     strategy = _op.OpStrategy()
@@ -281,7 +281,7 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
         raise ValueError("Data layout {} not supported.".format(layout))
     return strategy
 
-@bitserial_dense_strategy.register(["arm_cpu", "micro_dev"])
+@bitserial_dense_strategy.register("arm_cpu")
 def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_dense arm cpu strategy"""
     strategy = _op.OpStrategy()
diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py
index 6b2eda8f050e..c40296e50713 100644
--- a/python/tvm/target/arm_isa.py
+++ b/python/tvm/target/arm_isa.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Defines functions to analyze availble opcodes in the ARM ISA."""
+"""Defines functions to analyze available opcodes in the ARM ISA."""
 
 
 ARM_ISA_MAP = {
diff --git a/src/ir/error.cc b/src/ir/error.cc
index 67694342db45..9d498288d2ba 100644
--- a/src/ir/error.cc
+++ b/src/ir/error.cc
@@ -62,21 +62,16 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) {
 
     CHECK(has_errs != this->node_to_error_.end());
 
-    const auto& error_indices = has_errs->second;
+    const auto& error_indicies = has_errs->second;
 
     std::stringstream err_msg;
 
-    if (error_indices.size() != 0) {
-      err_msg << rang::fg::red;
-      err_msg << " ";
-      // the errors are in reverse order, so print them with a reversed iteration
-      err_msg << this->errors_[error_indices[error_indices.size()-1]].what();
-      for (int i = error_indices.size() - 2; i >= 0; i--) {
-        size_t err_idx = error_indices[i];
-        err_msg << "; " << this->errors_[err_idx].what();
-      }
-      err_msg << rang::fg::reset;
+    err_msg << rang::fg::red;
+    err_msg << " ";
+    for (auto index : error_indicies) {
+      err_msg << this->errors_[index].what() << "; ";
     }
+    err_msg << rang::fg::reset;
 
     // Setup error map.
     auto it = error_maps.find(global);
diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
index 3a840e7a7861..9fabce6bdc1e 100644
--- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
+++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c
@@ -32,7 +32,8 @@ extern "C" {
 #include <stdint.h>
 #include <stddef.h>
 
-// TODO(areusch): compiler errors say volatile qualifier is discarded. should we just get rid of em?
+// TODO(weberlo, areusch): compiler errors say volatile qualifier is discarded.
+// should we just get rid of em?
 void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL;
 int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL;
 void (* volatile TVMAPISetLastError_)(const char*) = NULL;
@@ -52,9 +53,9 @@ void TVMAPISetLastError(const char* msg) {
 }
 
 void *memset(void *s, int c, size_t n) {
-  char *p = (char*) s;
+  char *p = (char*) s;  // NOLINT(readability/casting): linter is configured for c++
   while (n > 0) {
-    *p = (char) c;
+    *p = (char) c;  // NOLINT(readability/casting): linter is configured for c++
     p++;
     n--;
   }
@@ -62,21 +63,23 @@ void *memset(void *s, int c, size_t n) {
 }
 
 void *memmove(void *to, const void *from, size_t n) {
-  // TODO will need to factor memmove calls into workspace size calculation
+  // TODO(weberlo, areusch): will need to factor memmove calls into workspace size calculation
+  // NOLINTNEXTLINE(readability/casting): linter is configured for c++
   char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8);
   if (temp == NULL) {
     return NULL;
   }
 
-  const char *from_pp = (char*) from;
+  const char *from_pp = (char*) from;  // NOLINT(readability/casting): linter is configured for c++
   for (size_t i = 0; i < n; i++) {
     temp[i] = from_pp[i];
   }
-  char *to_pp = (char*) to;
+  char *to_pp = (char*) to;  // NOLINT(readability/casting): linter is configured for c++
   for (size_t i = 0; i < n; i++) {
     to_pp[i] = temp[i];
   }
 
+  // NOLINTNEXTLINE(readability/casting): linter is configured for c++
   if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) {
     return NULL;
   }
diff --git a/src/runtime/micro/host_driven/utvm_runtime.c b/src/runtime/micro/host_driven/utvm_runtime.c
index f05fdb5fe2c1..2f2f0c1e0dea 100644
--- a/src/runtime/micro/host_driven/utvm_runtime.c
+++ b/src/runtime/micro/host_driven/utvm_runtime.c
@@ -34,7 +34,8 @@ extern "C" {
 
 #include "utvm_runtime.h"
 
-// TODO(areusch): move defines into header
+// TODO(weberlo, areusch): move defines into header
+// TODO(weberlo, areusch): unify TASK_QUEUE_SIZE and MicroSession::kTaskQueueCapacity.
 #define TASK_QUEUE_SIZE 20
 volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { };
 volatile uint32_t utvm_num_tasks = 0;
@@ -139,7 +140,7 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
 }
 
 int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
-  // TODO(areusch): add dev type check
+  // TODO(weberlo, areusch): add dev type check
   if (utvm_num_active_allocs == 0) {
     TVMAPISetLastError("free called with no active workspace allocations");
     // Reset allocations and workspace (for future task executions).
@@ -153,7 +154,6 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
       // it's the first allocation
       utvm_alloc_ends[0] = NULL;
     } else {
-      // TODO(areusch): reverse loop iteration since usually it's the last alloc being freed
       for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) {
         if (utvm_alloc_ends[i] == ptr) {
           utvm_alloc_ends[i + 1] = NULL;
@@ -167,8 +167,8 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
     if (utvm_alloc_idx == 0) {
       utvm_workspace_curr = utvm_workspace_start;
     } else {
-      // TODO(areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in this
-      // branch?
+      // TODO(weberlo, areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in
+      // this branch?
       utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1];
     }
     return 0;
diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h
index 42ca025fc2ab..fc11b70ed8e2 100644
--- a/src/runtime/micro/host_driven/utvm_runtime.h
+++ b/src/runtime/micro/host_driven/utvm_runtime.h
@@ -63,27 +63,35 @@ typedef struct {
 } UTVMTask;
 
 /*!
- * \brief TODO
+ * \brief microTVM processor startup.
+ * Expected to reset the stack pointer, configure any hardware required to support the CRT
+ * (i.e. FPU), and then jump to UTVMMain.
  */
 extern void UTVMInit();
 
 /*!
- * \brief TODO
+ * \brief Start the on-device timer.
+ * \return UTVMReturnCode indicating the outcome of the operation.
  */
 extern int32_t UTVMTimerStart();
 
 /*!
- * \brief TODO
+ * \brief Stop the on-device timer.
+ * TODO(areusch): Use an SI specification of timer units here.
+ * \param err Receives a UTVMReturnCode indicating the outcome of the operation.
+ * \return elapsed time since UTVMTimerStart returned, in device timer ticks.
  */
 extern uint32_t UTVMTimerStop(int32_t* err);
 
 /*!
- * \brief TODO
+ * \brief Main entry point for UTVM runtime.
+ * Waits for "go" signal, then executes tasks and reports result. Should never return.
  */
 void UTVMMain();
 
 /*!
- * \brief TODO
+ * \brief Function entered when UTVMMain is complete.
+ * Should never return. The host sets a breakpoint here to detect end of computation.
  */
 void UTVMDone();
 
diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h
index 2093f385fca0..2d74bc349f42 100644
--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -55,9 +55,6 @@ enum class SectionKind : size_t {
 
 /*! \brief data type for word sizes */
 class TargetWordSize {
- private:
-  size_t word_size_bits_;
-
  public:
   explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} {
     CHECK(word_size_bits == 32 || word_size_bits == 64)
@@ -71,6 +68,9 @@ class TargetWordSize {
   size_t bits() const {
     return word_size_bits_;
   }
+
+ private:
+  size_t word_size_bits_;
 };
 
 
@@ -94,18 +94,18 @@ class TargetVal {
           width_bits <= 64 &&
           (width_bits & (width_bits - 1)) == 0)
       << "width_bits must be a power of 2 in [8, 64], got " << width_bits;
-    value_ = value & bitmask();
+    value_ = value & Bitmask();
   }
 
-  bool is_initialized() const { return width_bits_ != 0; }
+  bool IsInitialized() const { return width_bits_ != 0; }
 
   size_t width_bits() const {
-    CHECK(is_initialized()) << "TargetVal is not initialized";
+    CHECK(IsInitialized()) << "TargetVal is not initialized";
     return width_bits_;
   }
 
-  uint64_t bitmask() const {
-    CHECK(is_initialized()) << "TargetVal is not initialized";
+  uint64_t Bitmask() const {
+    CHECK(IsInitialized()) << "TargetVal is not initialized";
 
     if (width_bits_ == 64) {
       return ~0UL;
@@ -115,21 +115,21 @@ class TargetVal {
   }
 
   uint32_t uint32() const {
-    CHECK(is_initialized()) << "TargetVal is not initialized";
+    CHECK(IsInitialized()) << "TargetVal is not initialized";
     CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is "
                              << width_bits_;
-    return uint32_t(value_ & bitmask());
+    return uint32_t(value_ & Bitmask());
   }
 
   uint64_t uint64() const {
-    CHECK(is_initialized()) << "TargetVal is not initialized";
+    CHECK(IsInitialized()) << "TargetVal is not initialized";
     return value_;
   }
 
   TargetVal& operator=(const TargetVal& other) {
-    CHECK(other.is_initialized()) << "Cannot assign an uninitialized TargetVal";
+    CHECK(other.IsInitialized()) << "Cannot assign an uninitialized TargetVal";
 
-    if (!is_initialized()) {
+    if (!IsInitialized()) {
       width_bits_ = other.width_bits_;
     }
 
@@ -137,12 +137,12 @@ class TargetVal {
       << "Cannot assign TargetVal with width " << other.width_bits_
       << "bits to TargetVal with width " << width_bits_ << "bits";
 
-    value_ = other.value_ & bitmask();
+    value_ = other.value_ & Bitmask();
     return *this;
   }
 };
 
-// TODO(areusch): just get rid of `TargetPtr`.
+// TODO(weberlo, areusch): just get rid of `TargetPtr`.
 /*! \brief absolute device address */
 class TargetPtr {
  public:
diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h
index e30c72b79fb5..c99d79652edb 100644
--- a/src/runtime/micro/target_data_layout_encoder.h
+++ b/src/runtime/micro/target_data_layout_encoder.h
@@ -30,7 +30,7 @@
 namespace tvm {
 namespace runtime {
 
-// TODO(weberlo): Handle endianness.
+// TODO(weberlo, areusch): Handle endianness.
 
 /*!
  * \brief data encoder for uTVM that builds a host-side buffer
@@ -159,7 +159,7 @@ class TargetDataLayoutEncoder {
   size_t curr_offset_;
   /*! \brief start address of the encoder in device memory */
   TargetPtr start_addr_;
-  /*! \brief TODO */
+  /*! \brief number of bytes available in device memory */
   size_t capacity_;
   /*! \brief number of bytes in a word on the target device */
   TargetWordSize word_size_;
@@ -178,8 +178,8 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
 
 template <typename T>
 TargetDataLayoutEncoder::Slot<T>::~Slot() {
-  // TODO(areusch): this can mask the exception thrown by slot allocation... even though that
-  // doesn't make sense.
+  // TODO(weberlo, areusch): this can mask the exception thrown by slot allocation... even though
+  // that doesn't make sense.
   CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset="
                                << curr_offset_ << ", size=" << size_;
 }
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index 4cca1efea2ca..5e5db82eb931 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -20,11 +20,11 @@
 /*!
  * \file codegen_c_host.cc
  */
-#include "codegen_c_host.h"
+#include <tvm/target/codegen.h>
 #include <vector>
 #include <string>
-#include "tvm/target/codegen.h"
 #include "../build_common.h"
+#include "codegen_c_host.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
index 1fdb596b7163..7d3e945fef14 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py
@@ -60,9 +60,7 @@ def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, l
     assert layout == 'NHWC'
     conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
 
-    ###########################
-    # Config Space Definition #
-    ###########################
+    # Config Space Definition
     N, H, W, CI = get_const_tuple(data.shape)
     KH, KW, _, CO = get_const_tuple(kernel.shape)
     n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO)
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
index 2169176c0711..01acbb72caca 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -34,7 +34,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
     # collisions in the generated source (e.g., if there are multiple operators
     # in the same module that use the same intrinsic)
     #
-    # TODO(areusch): to cut down on memory usage, we should cache each intrinsic
+    # TODO(weberlo, areusch): to cut down on memory usage, we should cache each intrinsic
     # instantiation and include it only once, eliminating the need for unique
     # IDs
     UNIQ_ID_LEN = 8
@@ -47,7 +47,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
     if isinstance(N, tvm.tir.IntImm):
         N = N.value
     assert K % 4 == 0
-    # TODO(areusch): support more dtypes?
+    # TODO(weberlo, areusch): support more dtypes?
     assert in_dtype == 'int8'
     assert out_dtype == 'int32'
     A = te.placeholder((M, K), name='a', dtype=in_dtype)
@@ -124,7 +124,7 @@ def _body():
 
 def gemm_MxKxN_impl(M, K, N, uniq_id):
     """Emit C code for gemm impl."""
-    # TODO(areusch): are there any SIMD tricks to zero out arrays quickly?
+    # TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly?
     aa_pad_size = M * K
     bb_pad_size = N * K
     # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)

From 99e4718fc6818dbc5746a6319af693591a4be847 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 27 Apr 2020 13:59:23 -0700
Subject: [PATCH 10/11] address thierry's comments

---
 python/tvm/autotvm/tuner/callback.py              |  6 +++---
 python/tvm/autotvm/tuner/tuner.py                 | 10 +---------
 src/runtime/micro/micro_session.cc                |  2 --
 tests/python/unittest/test_runtime_micro.py       | 15 ---------------
 .../topi/arm_cpu/cortex_m7/micro_kernel/gemm.py   | 14 --------------
 5 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index 1ae64e2ddfef..cfc1b2c38f85 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -144,12 +144,12 @@ def __del__(self):
     def _callback(tuner, inputs, results):
         ctx.ct += len(inputs)
 
-        flops = float("inf")
+        flops = 0
         for inp, res in zip(inputs, results):
             if res.error_no == 0:
-                flops = min(inp.task.flop / np.mean(res.costs), flops)
+                flops = inp.task.flop / np.mean(res.costs)
 
-        if logger.level > logging.DEBUG:  # only print progress bar in non-debug mode
+        if not logger.isEnabledFor(logging.DEBUG):  # only print progress bar in non-debug mode
             ctx.cur_flops = flops
             ctx.best_flops = tuner.best_flops
 
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 0d66d34ac316..2441a4ae642f 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -150,15 +150,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr
                              i + k + 1, si_prefix, format_si_prefix(flops, si_prefix),
                              format_si_prefix(self.best_flops, si_prefix), res, config)
 
-            num_successes = 0
-            for result in results:
-                if isinstance(result.costs[0], float):
-                    num_successes += 1
-            if num_successes != len(results):
-                logger.debug('not counting %d failures towards trial count',
-                             len(results) - num_successes)
-            i += num_successes
-
+            i += len(results)
             self.ttl = min(early_stopping + self.best_iter, n_trial) - i
 
             self.update(inputs, results)
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 672f16dc06f9..0e8e1693448e 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -284,7 +284,6 @@ void MicroSession::FlushTaskQueue() {
 
 template <typename T>
 void MicroSession::FlushTaskQueuePriv() {
-  // std::cout << "[MicroSession::FlushTaskQueue]" << std::endl;
   std::vector<T> prepped_tasks;
   for (const auto& task : task_queue_) {
     prepped_tasks.push_back(T(task));
@@ -297,7 +296,6 @@ void MicroSession::FlushTaskQueuePriv() {
       batch_args_encoder_.buf_size());
 
   // Flush `tasks` to device memory.
-//  runtime_symbol_map_.Dump(std::cout);
   TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
   low_level_device()->Write(
       dev_tasks_addr,
diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py
index 448f53c57f3c..bec74fb6644c 100644
--- a/tests/python/unittest/test_runtime_micro.py
+++ b/tests/python/unittest/test_runtime_micro.py
@@ -30,21 +30,6 @@
 DEV_CONFIG_B = micro.device.host.generate_config()
 TARGET = 'c -device=micro_dev'
 
-# # TODO why do spike examples have memory that starts at 0x10000000, but you
-# # should set the base addr as 0x10010000? should somehow help the user to be
-# # aware of that.
-# # are there always 0x10000 bytes reserved at the beginning of the address space?
-# BASE_ADDR = 0x10010000
-
-# AVAILABLE_MEM = 0x200000
-# DEV_CONFIG_A = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6666)
-# DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667)
-# TARGET = 'c -device=micro_dev'
-
-# DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666)
-# DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667)
-# TARGET = 'c -device=micro_dev'
-
 def relay_micro_build(func, dev_config, params=None):
     """Create a graph runtime module with a micro device context from a Relay function.
 
diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
index 01acbb72caca..9af7bef95b7c 100644
--- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -93,20 +93,6 @@ def _reduce_reset():
             return ib.get()
         def _body():
             ib = tvm.tir.ir_builder.create()
-            # # NOTE we need the reset in the body for cases where the buffer
-            # # we're accumulating into is uninitialized (e.g., if it's the
-            # # result of a workspace allocation, because there are no guarantees
-            # # on the contents).
-            # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset",
-            #                         cc.access_ptr("w"),
-            #                         cc.strides[0]))
-            # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update",
-            #                         aa.access_ptr("r"),
-            #                         bb.access_ptr("r"),
-            #                         cc.access_ptr("w"),
-            #                         aa.strides[0],
-            #                         bb.strides[0],
-            #                         cc.strides[0]))
             ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}",
                                         aa.access_ptr("r"),
                                         bb.access_ptr("r"),

From bef97f1324a514f2c914d50ec874b0c2c9855b7d Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 28 Apr 2020 17:09:52 -0700
Subject: [PATCH 11/11] address u99127, liangfu, tmoreau89 comments

---
 python/tvm/autotvm/measure/local_executor.py |  3 +--
 python/tvm/exec/rpc_server.py                | 18 +++++++++---------
 python/tvm/micro/device/arm/stm32f746xx.py   |  1 -
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index d838a92daa01..a0a826abccf6 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -145,8 +145,7 @@ def submit(self, func, *args, **kwargs):
         if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))
 
-        # TODO why they choose a queue size of 2? add a comment
-        queue = Queue(2)
+        queue = Queue(2)  # Size of 2 to avoid a race condition with size 1.
         process = Process(target=call_with_timeout,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()
diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py
index 805c3a8afb1a..e281e58e3879 100644
--- a/python/tvm/exec/rpc_server.py
+++ b/python/tvm/exec/rpc_server.py
@@ -101,20 +101,20 @@ def server_shutdown():
     parser.add_argument('--port-end', type=int, default=9199,
                         help='The end search port of the RPC')
     parser.add_argument('--tracker', type=str,
-                        help=('The address of RPC tracker in host:port format. '
-                              'e.g. (10.77.1.234:9190)'))
+                        help=("The address of RPC tracker in host:port format. "
+                              "e.g. (10.77.1.234:9190)"))
     parser.add_argument('--key', type=str, default="",
-                        help='The key used to identify the device type in tracker.')
+                        help="The key used to identify the device type in tracker.")
     parser.add_argument('--silent', action='store_true',
-                        help='Whether run in silent mode.')
+                        help="Whether run in silent mode.")
     parser.add_argument('--load-library', type=str,
-                        help='Additional library to load')
+                        help="Additional library to load")
     parser.add_argument('--no-fork', dest='fork', action='store_false',
-                        help=('Use spawn mode to avoid fork. This option '
-                              'is able to avoid potential fork problems with Metal, OpenCL '
-                              'and ROCM compilers.'))
+                        help="Use spawn mode to avoid fork. This option \
+                        is able to avoid potential fork problems with Metal, OpenCL \
+                        and ROCM compilers.")
     parser.add_argument('--custom-addr', type=str,
-                        help='Custom IP Address to Report to RPC Tracker')
+                        help="Custom IP Address to Report to RPC Tracker")
     parser.add_argument('--utvm-dev-config', type=str,
                         help=('JSON config file for the target device (if using MicroTVM). '
                               'This file should contain serialized output similar to that returned '
diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py
index 6d4b8f74aacb..746958504eda 100644
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
@@ -67,7 +67,6 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N
     options += [
         # TODO(weberlo): make a debug flag
         "-O2",
-        "-march=armv7e-m",
         "-mcpu=cortex-m7",
         "-mlittle-endian",
         "-mfloat-abi=hard",