From ab7f5cdc6b882a9da9597385a05d7273d23573db Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 1 Oct 2019 16:25:10 -0700 Subject: [PATCH 01/11] Prototype for micro TVM. --- python/tvm/autotvm/measure/local_executor.py | 1 + python/tvm/autotvm/measure/measure_methods.py | 6 +- python/tvm/autotvm/task/space.py | 6 +- python/tvm/autotvm/tuner/model_based_tuner.py | 1 + python/tvm/autotvm/tuner/tuner.py | 8 +- python/tvm/contrib/binutil.py | 164 +++++-- python/tvm/contrib/debugger/debug_runtime.py | 1 + python/tvm/exec/rpc_server.py | 39 +- python/tvm/micro/__init__.py | 5 +- python/tvm/micro/base.py | 222 ++++++--- python/tvm/micro/device/__init__.py | 2 +- python/tvm/micro/device/arm/stm32f746xx.py | 120 +++-- python/tvm/micro/device/base.py | 142 ++++-- python/tvm/micro/device/host.py | 101 ++-- python/tvm/micro/device/riscv_spike.py | 92 ++-- python/tvm/relay/_parser.py | 15 +- python/tvm/relay/build_module.py | 22 +- python/tvm/relay/frontend/keras.py | 2 +- python/tvm/rpc/client.py | 10 + python/tvm/rpc/server.py | 11 +- python/tvm/rpc/tracker.py | 3 + python/tvm/runtime/module.py | 2 +- src/ir/error.cc | 19 +- .../micro/device/arm/stm32f746xx/utvm_init.s | 1 + .../micro/device/arm/stm32f746xx/utvm_timer.c | 60 ++- src/runtime/micro/device/host/utvm_init.c | 1 + src/runtime/micro/device/host/utvm_timer.c | 32 +- .../micro/device/riscv_spike/utvm_init.s | 29 ++ .../micro/device/riscv_spike/utvm_timer.c | 43 ++ .../host_driven/utvm_device_dylib_redirect.c | 40 +- src/runtime/micro/host_driven/utvm_runtime.h | 45 +- src/runtime/micro/micro_common.h | 41 +- src/runtime/micro/micro_device_api.cc | 17 +- src/runtime/micro/micro_module.cc | 4 +- src/runtime/micro/micro_section_allocator.h | 9 +- src/runtime/micro/micro_session.cc | 447 +++++++++++++----- src/runtime/micro/micro_session.h | 89 +++- .../micro/target_data_layout_encoder.h | 31 +- src/runtime/micro/tcl_socket.cc | 6 +- src/runtime/rpc/rpc_session.cc | 85 +++- src/runtime/rpc/rpc_session.h | 48 +- 
src/target/source/codegen_c.cc | 6 + src/target/source/codegen_c_host.cc | 5 +- src/target/source/codegen_c_host.h | 2 + tests/python/unittest/test_runtime_micro.py | 192 ++++++-- .../topi/arm_cpu/conv2d_spatial_pack.py | 21 +- topi/python/topi/arm_cpu/injective.py | 3 + .../python/topi/testing/conv2d_nhwc_python.py | 6 +- 48 files changed, 1642 insertions(+), 615 deletions(-) create mode 100644 src/runtime/micro/device/riscv_spike/utvm_init.s create mode 100644 src/runtime/micro/device/riscv_spike/utvm_timer.c diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py index cf81e2b50e50..d838a92daa01 100644 --- a/python/tvm/autotvm/measure/local_executor.py +++ b/python/tvm/autotvm/measure/local_executor.py @@ -145,6 +145,7 @@ def submit(self, func, *args, **kwargs): if not self.do_fork: return LocalFutureNoFork(func(*args, **kwargs)) + # TODO why they choose a queue size of 2? add a comment queue = Queue(2) process = Process(target=call_with_timeout, args=(queue, self.timeout, func, args, kwargs)) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 6533e75eef93..00ecd2e98a06 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -249,6 +249,7 @@ def get_build_kwargs(self): return kwargs def run(self, measure_inputs, build_results): + print('[RPCRunner.run]') results = [] remote_args = (self.key, self.host, self.port, self.priority, self.timeout) @@ -273,7 +274,9 @@ def run(self, measure_inputs, build_results): if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time())) + #raise Exception(f'encountered exception during measurement: {results}') else: + print(f' got a result: {res}') results.append(res) return results @@ -508,7 +511,8 @@ def run_through_rpc(measure_input, build_result, msg = 
msg[:msg.index("Stack trace returned")] if "CUDA Source" in msg: msg = msg[:msg.index("CUDA Source")] - costs = (RuntimeError(msg[:1024]),) + #costs = (RuntimeError(msg[:1024]),) + costs = (RuntimeError(msg),) errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbf474fc4df7..f8de1fbf32d5 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -544,8 +544,10 @@ def apply(self, sch, op, axes, axis_lens=None, if ann == 'none': pass elif ann == 'unroll': - if max_unroll and axis_lens[i] > max_unroll: - cfg.raise_error("Too large factor for unrolling") + #if max_unroll and axis_lens[i] > max_unroll: + # cfg.raise_error("Too large factor for unrolling") + #if max_unroll and axis_lens[i] < max_unroll: + # cfg.raise_error("Too large factor for unrolling") sch[op].unroll(axes[i]) elif ann == 'vec': if vec_size and axis_lens[i] not in vec_size: diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py index 432f7070c349..56fe5b4f3f72 100644 --- a/python/tvm/autotvm/tuner/model_based_tuner.py +++ b/python/tvm/autotvm/tuner/model_based_tuner.py @@ -263,6 +263,7 @@ def update(self, inputs, results): # if we have enough new training samples if len(self.xs) >= self.plan_size * (self.train_ct + 1) \ and self.flops_max > 1e-6: + import pdb; pdb.set_trace() self.cost_model.fit(self.xs, self.ys, self.plan_size) if self.diversity_filter_ratio: candidate = self.model_optimizer.find_maximums( diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 2441a4ae642f..52d745104e63 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -150,7 +150,13 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr i + k + 1, si_prefix, format_si_prefix(flops, si_prefix), format_si_prefix(self.best_flops, 
si_prefix), res, config) - i += len(results) + for result in results: + if isinstance(result.costs[0], float): + i += 1 + else: + print('[Tuner.tune]') + print(' not counting failure towards trial count') + #i += len(results) self.ttl = min(early_stopping + self.best_iter, n_trial) - i self.update(inputs, results) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 521e0885548c..9d6d469ba7cd 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -21,6 +21,82 @@ import tvm._ffi from . import util +# TODO does this file still belong in `contrib`. is it too µTVM-specific? + +# TODO shouldn't need so many `ALIGN` directives +RELOCATION_LD_SCRIPT_TEMPLATE = """ +/* linker symbol for use in UTVMInit */ +_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; + +SECTIONS +{{ + . = 0x{text_start:x}; + . = ALIGN({word_size}); + .text : + {{ + . = ALIGN({word_size}); + KEEP(*(.text)) + KEEP(*(.text*)) + . = ALIGN({word_size}); + }} + + . = 0x{rodata_start:x}; + . = ALIGN({word_size}); + .rodata : + {{ + . = ALIGN({word_size}); + KEEP(*(.rodata)) + KEEP(*(.rodata*)) + . = ALIGN({word_size}); + }} + + . = 0x{data_start:x}; + . = ALIGN({word_size}); + .data : + {{ + . = ALIGN({word_size}); + KEEP(*(.data)) + KEEP(*(.data*)) + . = ALIGN({word_size}); + }} + + . = 0x{bss_start:x}; + . = ALIGN({word_size}); + .bss : + {{ + . = ALIGN({word_size}); + KEEP(*(.bss)) + KEEP(*(.bss*)) + . = ALIGN({word_size}); + }} +}} +""" + +def run_cmd(cmd): + """Runs `cmd` in a subprocess and awaits its completion. 
+ + Parameters + ---------- + cmd : List[str] + list of command-line arguments + + Returns + ------- + output : str + resulting stdout capture from the subprocess + """ + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (output, _) = proc.communicate() + output = output.decode('utf-8') + if proc.returncode != 0: + cmd_str = ' '.join(cmd) + msg = f'error while running command \"{cmd_str}\":\n{output}' + raise RuntimeError(msg) + return output + RELOCATION_LD_SCRIPT_TEMPLATE = """ /* linker symbol for use in UTVMInit */ @@ -118,18 +194,18 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): size of the section in bytes """ if not os.path.isfile(binary_path): - raise RuntimeError("no such file \"{}\"".format(binary_path)) + raise RuntimeError('no such file \"{}\"'.format(binary_path)) # We use the "-A" flag here to get the ".rodata" section's size, which is # not included by default. - size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) + size_output = run_cmd(['{}size'.format(toolchain_prefix), '-A', binary_path]) # TODO(weberlo): Refactor this method and `*relocate_binary` so they are # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". section_mapping = { - ".text": [".text"], - ".rodata": [".rodata"], - ".data": [".data", ".sdata"], - ".bss": [".bss", ".sbss"], + '.text': ['.text'], + '.rodata': ['.rodata'], + '.data': ['.data', '.sdata'], + '.bss': ['.bss', '.sbss'], } sections_to_sum = section_mapping["." + section_name] section_size = 0 @@ -148,7 +224,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # NOTE: For some reason, the size of the BSS section on the RISC-V # GCC is sometimes reported to be smaller than it is, so we need to adjust # for this. 
- if "riscv" in toolchain_prefix and section_name == "bss": + if 'riscv' in toolchain_prefix and section_name == 'bss': # TODO(weberlo): Figure out why 32 is the minimum constant that works. # # The current hypothesis is that the last symbols in the ".bss" and @@ -160,7 +236,11 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # padding for most cases, but symbols can be arbitrarily large, so this # isn't bulletproof. return section_size + 32 - return section_size + # TODO remove this arbitrary addition once we figure out why section sizes + # are being undercalculated. + # maybe stop relying on `*size` to give us the size and instead read the + # section with `*objcopy` and count the bytes. + return section_size + 8 @tvm._ffi.register_func("tvm_callback_relocate_binary") @@ -206,11 +286,13 @@ def tvm_callback_relocate_binary( rel_bin : bytearray the relocated binary """ + assert text_start < rodata_start < data_start < bss_start < stack_end stack_pointer_init = stack_end - word_size - ld_script_contents = "" + ld_script_contents = '' # TODO(weberlo): There should be a better way to configure this for different archs. - if "riscv" in toolchain_prefix: - ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n" + # TODO is this line even necessary? 
+ if 'riscv' in toolchain_prefix: + ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( word_size=word_size, text_start=text_start, @@ -220,17 +302,31 @@ def tvm_callback_relocate_binary( stack_pointer_init=stack_pointer_init) tmp_dir = util.tempdir() - rel_obj_path = tmp_dir.relpath("relocated.obj") - rel_ld_script_path = tmp_dir.relpath("relocated.lds") - with open(rel_ld_script_path, "w") as f: + rel_obj_path = tmp_dir.relpath('relocated.obj') + rel_ld_script_path = tmp_dir.relpath('relocate.lds') + with open(rel_ld_script_path, 'w') as f: f.write(ld_script_contents) run_cmd([ - "{}ld".format(toolchain_prefix), + '{}ld'.format(toolchain_prefix), binary_path, - "-T", rel_ld_script_path, - "-o", rel_obj_path]) - with open(rel_obj_path, "rb") as f: + '-T', rel_ld_script_path, + '-o', rel_obj_path]) + + with open(rel_obj_path, 'rb') as f: rel_bin = bytearray(f.read()) + + gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR'] + gdb_init_path = f'{gdb_init_dir}/.gdbinit' + with open(gdb_init_path, 'r') as f: + gdbinit_contents = f.read().split('\n') + new_contents = [] + for line in gdbinit_contents: + new_contents.append(line) + if line.startswith('target'): + new_contents.append(f'add-symbol-file {rel_obj_path}') + with open(gdb_init_path, 'w') as f: + f.write('\n'.join(new_contents)) + return rel_bin @@ -255,22 +351,22 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix): contents of the read section """ tmp_dir = util.tempdir() - tmp_bin = tmp_dir.relpath("temp.bin") - tmp_section = tmp_dir.relpath("tmp_section.bin") - with open(tmp_bin, "wb") as out_file: + tmp_bin = tmp_dir.relpath('temp.bin') + tmp_section = tmp_dir.relpath('tmp_section.bin') + with open(tmp_bin, 'wb') as out_file: out_file.write(bytes(binary)) run_cmd([ - "{}objcopy".format(toolchain_prefix), - "--dump-section", - ".{}={}".format(section, tmp_section), + '{}objcopy'.format(toolchain_prefix), + '--dump-section', + 
'.{}={}'.format(section, tmp_section), tmp_bin]) if os.path.isfile(tmp_section): # Get section content if it exists. - with open(tmp_section, "rb") as f: + with open(tmp_section, 'rb') as f: section_bin = bytearray(f.read()) else: # Return empty bytearray if the section does not exist. - section_bin = bytearray("", "utf-8") + section_bin = bytearray('', 'utf-8') return section_bin @@ -293,18 +389,18 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix): alternating newline-separated keys and values """ tmp_dir = util.tempdir() - tmp_obj = tmp_dir.relpath("tmp_obj.bin") - with open(tmp_obj, "wb") as out_file: + tmp_obj = tmp_dir.relpath('tmp_obj.bin') + with open(tmp_obj, 'wb') as out_file: out_file.write(bytes(binary)) nm_output = run_cmd([ - "{}nm".format(toolchain_prefix), - "-C", - "--defined-only", + '{}nm'.format(toolchain_prefix), + '-C', + '--defined-only', tmp_obj]) nm_output = nm_output.splitlines() - map_str = "" + map_str = '' for line in nm_output: line = line.split() - map_str += line[2] + "\n" - map_str += line[0] + "\n" + map_str += line[2] + '\n' + map_str += line[0] + '\n' return map_str diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 848d7f57d1de..fd838c407617 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -181,6 +181,7 @@ def _run_debug(self): """ self.debug_datum._time_list = [ [float(t) * 1e-6] for t in self.run_individual(10, 1, 1) + #[float(t) * 1e-6] for t in self.run_individual(1, 1, 1) ] for i, node in enumerate(self.debug_datum.get_graph_nodes()): num_outputs = self.debug_datum.get_graph_node_output_num(node) diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index dbb690267e2a..a3c43583f44d 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -20,6 +20,7 @@ import argparse import ast +import json import multiprocessing import sys import logging @@ -75,8 
+76,8 @@ def init_utvm(args): dev_config = json.load(dev_conf_file) else: dev_config_args = ast.literal_eval(args.utvm_dev_config_args) - default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config'] - dev_config = default_config_func(*dev_config_args) + generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['generate_config'] + dev_config = generate_config_func(*dev_config_args) if args.utvm_dev_config or args.utvm_dev_id: # add MicroTVM overrides @@ -100,27 +101,33 @@ def server_shutdown(): parser.add_argument('--port-end', type=int, default=9199, help='The end search port of the RPC') parser.add_argument('--tracker', type=str, - help="The address of RPC tracker in host:port format. " - "e.g. (10.77.1.234:9190)") + help=('The address of RPC tracker in host:port format. ' + 'e.g. (10.77.1.234:9190)')) parser.add_argument('--key', type=str, default="", - help="The key used to identify the device type in tracker.") + help='The key used to identify the device type in tracker.') parser.add_argument('--silent', action='store_true', - help="Whether run in silent mode.") + help='Whether run in silent mode.') parser.add_argument('--load-library', type=str, - help="Additional library to load") + help='Additional library to load') parser.add_argument('--no-fork', dest='fork', action='store_false', - help="Use spawn mode to avoid fork. This option \ - is able to avoid potential fork problems with Metal, OpenCL \ - and ROCM compilers.") + help=('Use spawn mode to avoid fork. 
This option ' + 'is able to avoid potential fork problems with Metal, OpenCL ' + 'and ROCM compilers.')) parser.add_argument('--custom-addr', type=str, - help="Custom IP Address to Report to RPC Tracker") + help='Custom IP Address to Report to RPC Tracker') parser.add_argument('--utvm-dev-config', type=str, - help='JSON config file for the target device (if using MicroTVM)') - parser.add_argument('--utvm-dev-id', type=str, - help='Unique ID for the target device (if using MicroTVM)') + help=('JSON config file for the target device (if using MicroTVM). ' + 'This file should contain serialized output similar to that returned ' + "from the device module's generate_config. Can't be specified when " + '--utvm-dev-config-args is specified.')) parser.add_argument('--utvm-dev-config-args', type=str, - help=('Python list of literals required to generate a default' - ' MicroTVM config (if --utvm-dev-id is specified)')) + help=("Arguments to the device module's generate_config function. " + 'Must be a python literal parseable by literal_eval. If specified, the ' + "device configuration is generated using the device module's generate_config. " + "Can't be specified when --utvm-dev-config is specified.")) + parser.add_argument('--utvm-dev-id', type=str, + help=('Unique ID for the target device (if using MicroTVM). 
Should ' + 'match the name of a module underneath tvm.micro.device).')) parser.set_defaults(fork=True) args = parser.parse_args() diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 9e984c08fe2c..7c1389cc4eef 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -17,6 +17,7 @@ """MicroTVM module for bare-metal backends""" from ..contrib import binutil -from .base import Session, create_micro_mod, cross_compiler -from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir +from .base import DEVICE_SECTIONS +from .base import Session, create_micro_mod, cross_compiler, LibType +from .base import get_micro_host_driven_dir, get_micro_device_dir from . import device diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 9f50f9855303..5a1b71846630 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -25,9 +25,22 @@ import tvm import tvm._ffi +import tvm from tvm.contrib import util as _util from tvm.contrib import cc as _cc +# all sections that comprise a device's memory layout, in order from lowest +# starting address to highest +DEVICE_SECTIONS = [ + 'text', + 'rodata', + 'data', + 'bss', + 'args', + 'heap', + 'workspace', + 'stack', +] class LibType(Enum): """Enumeration of library types that can be compiled and loaded onto a device""" @@ -51,9 +64,9 @@ class Session: .. code-block:: python c_mod = ... 
# some module generated with "c" as the target - dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666) + dev_config = micro.device.arm.stm32f746xx.default_config('127.0.0.1', 6666) with tvm.micro.Session(dev_config) as sess: - micro_mod = create_micro_mod(c_mod, dev_config) + micro_mod = sess.create_micro_mod(c_mod) """ def __init__(self, config): @@ -61,69 +74,72 @@ def __init__(self, config): # TODO(weberlo): add config validation # grab a binutil instance from the ID in the config - dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"]) - self.create_micro_lib = dev_funcs["create_micro_lib"] - self.toolchain_prefix = config["toolchain_prefix"] - self.mem_layout = config["mem_layout"] - self.word_size = config["word_size"] - self.thumb_mode = config["thumb_mode"] - self.comms_method = config["comms_method"] + dev_funcs = tvm.micro.device.get_device_funcs(config['device_id']) + self.toolchain_prefix = config['toolchain_prefix'] + self.mem_layout = config['mem_layout'] + self.word_size = config['word_size'] + self.thumb_mode = config['thumb_mode'] + self.use_device_timer = config['use_device_timer'] + self.comms_method = config['comms_method'] # First, find and compile runtime library. 
- runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c") + runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c') tmp_dir = _util.tempdir() - runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") - self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME) - #input(f"check {runtime_obj_path}: ") - - comms_method = config["comms_method"] - if comms_method == "openocd": - server_addr = config["server_addr"] - server_port = config["server_port"] - elif comms_method == "host": - server_addr = "" + runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj') + dev_funcs['create_micro_lib'](runtime_obj_path, runtime_src_path, LibType.RUNTIME) + + comms_method = config['comms_method'] + if comms_method == 'openocd': + server_addr = config['server_addr'] + server_port = config['server_port'] + elif comms_method == 'host': + server_addr = '' server_port = 0 else: - raise RuntimeError(f"unknown communication method: f{self.comms_method}") + raise RuntimeError(f'unknown communication method: f{self.comms_method}') + assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), 'not all sections have an assigned memory layout' self.module = _CreateSession( comms_method, runtime_obj_path, self.toolchain_prefix, - self.mem_layout["text"].get("start", 0), - self.mem_layout["text"]["size"], - self.mem_layout["rodata"].get("start", 0), - self.mem_layout["rodata"]["size"], - self.mem_layout["data"].get("start", 0), - self.mem_layout["data"]["size"], - self.mem_layout["bss"].get("start", 0), - self.mem_layout["bss"]["size"], - self.mem_layout["args"].get("start", 0), - self.mem_layout["args"]["size"], - self.mem_layout["heap"].get("start", 0), - self.mem_layout["heap"]["size"], - self.mem_layout["workspace"].get("start", 0), - self.mem_layout["workspace"]["size"], - self.mem_layout["stack"].get("start", 0), - self.mem_layout["stack"]["size"], + self.mem_layout['text'].get('start', 0), + 
self.mem_layout['text']['size'], + self.mem_layout['rodata'].get('start', 0), + self.mem_layout['rodata']['size'], + self.mem_layout['data'].get('start', 0), + self.mem_layout['data']['size'], + self.mem_layout['bss'].get('start', 0), + self.mem_layout['bss']['size'], + self.mem_layout['args'].get('start', 0), + self.mem_layout['args']['size'], + self.mem_layout['heap'].get('start', 0), + self.mem_layout['heap']['size'], + self.mem_layout['workspace'].get('start', 0), + self.mem_layout['workspace']['size'], + self.mem_layout['stack'].get('start', 0), + self.mem_layout['stack']['size'], self.word_size, self.thumb_mode, + self.use_device_timer, server_addr, server_port) - self._enter = self.module["enter"] - self._exit = self.module["exit"] + self._enter = self.module['enter'] + self._exit = self.module['exit'] + self.get_last_batch_time = self.module['get_last_batch_time'] + self.get_last_batch_cycles = self.module['get_last_batch_cycles'] def _check_system(self): """Check if the user's system is supported by MicroTVM. Raises error if not supported. """ - if not sys.platform.startswith("linux"): - raise RuntimeError("MicroTVM is currently only supported on Linux hosts") + if not sys.platform.startswith('linux'): + raise RuntimeError('MicroTVM is currently only supported on Linux') # TODO(weberlo): Add 32-bit support. # It's primarily the compilation pipeline that isn't compatible. if sys.maxsize <= 2**32: - raise RuntimeError("MicroTVM is currently only supported on 64-bit host platforms") + raise RuntimeError('MicroTVM is currently only supported on 64-bit platforms') def __enter__(self): self._enter() @@ -133,44 +149,90 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self._exit() -def create_micro_mod(c_mod, dev_config): +def _calc_max_workspace_usage(src): + # TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size) + import re + alloc_re = re.compile(r'.*\* ?(.+) = (\(.+\))? 
TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*') + free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*') + max_usage = 0 + alloc_map = {} + for line in src.split('\n'): + if line.strip().startswith('//'): + continue + match = alloc_re.match(line) + if match is not None: + alloc_map[match.group(1)] = int(match.group(3)) + max_usage = max(max_usage, sum(alloc_map.values())) + else: + match = free_re.match(line) + if match is not None: + print(alloc_map) + del alloc_map[match.group(2)] + return max_usage + + +def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, lib_include_paths=None): """Produces a micro module from a given module. Parameters ---------- - c_mod : tvm.runtime.Module + c_mod : tvm.module.Module module with "c" as its target backend - dev_config : Dict[str, Any] - MicroTVM config dict for the target device + lib_src_paths: TODO + TODO + + lib_headers: TODO + TODO + + lib_include_paths: TODO + TODO Return ------ - micro_mod : tvm.runtim.Module + micro_mod : tvm.module.Module micro module for the target device """ + print('[create_micro_mod]') temp_dir = _util.tempdir() - lib_obj_path = temp_dir.relpath("dev_lib.obj") + lib_obj_path = temp_dir.relpath('dev_lib.obj') + # TODO use dev config to dispatch on the type of C codegen to run through + # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV) c_mod.export_library( - lib_obj_path, - fcompile=cross_compiler(dev_config, LibType.OPERATOR)) - micro_mod = tvm.runtime.load_module(lib_obj_path) + lib_obj_path, + fcompile=cross_compiler( + dev_config, + LibType.OPERATOR, + lib_src_paths=lib_src_paths, + lib_headers=lib_headers, + lib_include_paths=lib_include_paths)) + micro_mod = tvm.module.load(lib_obj_path) return micro_mod -def cross_compiler(dev_config, lib_type): - """Create a cross-compile function that wraps `create_lib` for a `Binutil` instance. 
+def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, lib_include_paths=None): + """Create a cross compile function that wraps `create_lib` for a `Binutil` instance. For use in `tvm.runtime.Module.export_library`. Parameters ---------- - dev_config : Dict[str, Any] - MicroTVM config dict for the target device + create_micro_lib : func + function for creating MicroTVM libraries for a specific device (e.g., + `tvm.micro.device.get_device_funcs('arm.stm32f746xx')['create_micro_lib']`) lib_type : micro.LibType whether to compile a MicroTVM runtime or operator library + lib_src_paths: TODO + TODO + + lib_headers: TODO + e.g., `['cmsis_gcc.h', 'arm_math.h']` + + lib_include_paths: TODO + TODO + Return ------ func : Callable[[str, str, Optional[str]], None] @@ -183,17 +245,47 @@ def cross_compiler(dev_config, lib_type): c_mod = ... # some module generated with "c" as the target fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR) - c_mod.export_library("dev_lib.obj", fcompile=fcompile) + c_mod.export_library('dev_lib.obj', fcompile=fcompile) """ - dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id']) - create_micro_lib = dev_funcs['create_micro_lib'] + assert (lib_headers is None) == (lib_include_paths is None), 'must specify both `lib_headers` and `lib_include_paths` or neither' + + if lib_src_paths is None: + lib_src_paths = [] + if lib_include_paths is None: + lib_include_paths = [] + include_options = [] + for include_path in lib_include_paths: + include_options.append('-I') + include_options.append(include_path) + create_micro_lib = tvm.micro.device.get_device_funcs(dev_config['device_id'])['create_micro_lib'] + mem_layout = dev_config['mem_layout'] + def compile_func(obj_path, src_path, **kwargs): if isinstance(obj_path, list): obj_path = obj_path[0] if isinstance(src_path, list): src_path = src_path[0] - create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None)) - return 
_cc.cross_compiler(compile_func, output_format="obj") + options = kwargs.get('options', []) + options += include_options + + # check that workspace allocations don't exceed available workspace memory + with open(src_path) as f: + src_contents = f.read() + max_ws_usage = _calc_max_workspace_usage(src_contents) + available_mem = mem_layout['workspace']['size'] + if max_ws_usage > available_mem: + raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) exceed available memory ({available_mem})') + # inject headers into new source path, if requested + if lib_headers: + headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n' + new_src_contents = headers_to_inject + src_contents + tmp_dir = _util.tempdir() + src_path = tmp_dir.relpath(os.path.basename(src_path)) + with open(src_path, 'w') as f: + f.write(new_src_contents) + + create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths) + return _cc.cross_compiler(compile_func, output_format='obj') def get_micro_host_driven_dir(): @@ -205,8 +297,8 @@ def get_micro_host_driven_dir(): directory path """ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_host_driven_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "host_driven") + micro_host_driven_dir = os.path.join(micro_dir, '..', '..', '..', + 'src', 'runtime', 'micro', 'host_driven') return micro_host_driven_dir @@ -219,9 +311,9 @@ def get_micro_device_dir(): directory path """ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") + micro_device_dir = os.path.join(micro_dir, '..', '..', '..', + 'src', 'runtime', 'micro', 'device') return micro_device_dir -tvm._ffi._init_api("tvm.micro", "tvm.micro.base") +tvm._ffi._init_api('tvm.micro', 'tvm.micro.base') diff --git a/python/tvm/micro/device/__init__.py 
b/python/tvm/micro/device/__init__.py index 1ccd6847edd8..3d2291c6a052 100644 --- a/python/tvm/micro/device/__init__.py +++ b/python/tvm/micro/device/__init__.py @@ -16,7 +16,7 @@ # under the License. """Device-specific configuration for MicroTVM""" -from .base import register_device, get_device_funcs, create_micro_lib_base +from .base import create_micro_lib_base, gen_mem_layout, MemConstraint, register_device, get_device_funcs from . import host from . import arm from . import riscv_spike diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index 31b44cf9d36b..8cd354738fe7 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -14,13 +14,31 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Compilation and config definitions for ARM STM32F746XX devices""" -from .. import create_micro_lib_base, register_device +"""Compilation and config definitions for Arm STM32F746XX devices""" +from .. 
import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = "arm.stm32f746xx" -TOOLCHAIN_PREFIX = "arm-none-eabi-" +DEVICE_ID = 'arm.stm32f746xx' +TOOLCHAIN_PREFIX = 'arm-none-eabi-' +WORD_SIZE = 4 +# +# [Device Memory Layout] +# RAM (rwx) : START = 0x20000000, LENGTH = 320K +# Flash (rx) : START = 0x8000000, LENGTH = 1024K +# +BASE_ADDR = 0x20000000 +AVAILABLE_MEM = 320000 +DEFAULT_SECTION_CONSTRAINTS = { + 'text': (18000, MemConstraint.ABSOLUTE_BYTES), + 'rodata': (100, MemConstraint.ABSOLUTE_BYTES), + 'data': (100, MemConstraint.ABSOLUTE_BYTES), + 'bss': (600, MemConstraint.ABSOLUTE_BYTES), + 'args': (4096, MemConstraint.ABSOLUTE_BYTES), + 'heap': (100.0, MemConstraint.WEIGHT), + 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), + 'stack': (32, MemConstraint.ABSOLUTE_BYTES), +} -def create_micro_lib(obj_path, src_path, lib_type, options=None): +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -36,23 +54,33 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ if options is None: options = [] options += [ - "-mcpu=cortex-m7", - "-mlittle-endian", - "-mfloat-abi=hard", - "-mfpu=fpv5-sp-d16", - "-mthumb", - "-gdwarf-5", + '-march=armv7e-m', + '-mcpu=cortex-m7', + '-mlittle-endian', + '-mfloat-abi=hard', + # TODO try this one? 
+ #'-mfpu=fpv5-d16', + '-mfpu=fpv5-sp-d16', + '-mthumb', + '-ffast-math', + '-gdwarf-5', + '-DARM_MATH_CM7', + '-D__FPU_PRESENT=1U', + '-DARM_MATH_DSP', ] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths) -def default_config(server_addr, server_port): - """Generates a default configuration for ARM STM32F746XX devices +def generate_config(server_addr, server_port, section_constraints=None): + """Generates a configuration for Arm STM32F746XX devices Parameters ---------- @@ -62,62 +90,30 @@ def default_config(server_addr, server_port): server_port : int port of OpenOCD server to connect to + section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]] + TODO correct type annotation? + Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS return { - "device_id": DEVICE_ID, - "toolchain_prefix": TOOLCHAIN_PREFIX, - # - # [Device Memory Layout] - # RAM (rwx) : START = 0x20000000, LENGTH = 320K - # FLASH (rx) : START = 0x8000000, LENGTH = 1024K - # - "mem_layout": { - "text": { - "start": 0x20000180, - "size": 20480, - }, - "rodata": { - "start": 0x20005180, - "size": 20480, - }, - "data": { - "start": 0x2000a180, - "size": 768, - }, - "bss": { - "start": 0x2000a480, - "size": 768, - }, - "args": { - "start": 0x2000a780, - "size": 1280, - }, - "heap": { - "start": 0x2000ac80, - "size": 262144, - }, - "workspace": { - "start": 0x2004ac80, - "size": 20480, - }, - "stack": { - "start": 0x2004fc80, - "size": 80, - }, - }, - "word_size": 4, - "thumb_mode": True, - "comms_method": "openocd", - "server_addr": server_addr, - "server_port": server_port, + 'device_id': DEVICE_ID, + 'toolchain_prefix': TOOLCHAIN_PREFIX, + 'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints), 
+ 'word_size': WORD_SIZE, + 'thumb_mode': True, + 'use_device_timer': False, + 'comms_method': 'openocd', + 'server_addr': server_addr, + 'server_port': server_port, } register_device(DEVICE_ID, { - "create_micro_lib": create_micro_lib, - "default_config": default_config, + 'create_micro_lib': create_micro_lib, + 'generate_config': generate_config, }) diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py index ae53b9cc539f..41d1a3ed1a95 100644 --- a/python/tvm/micro/device/base.py +++ b/python/tvm/micro/device/base.py @@ -17,12 +17,14 @@ """Base definitions for MicroTVM config""" import glob import os -from pathlib import Path +import enum +import pathlib +import operator from tvm.contrib import util as _util from tvm.contrib.binutil import run_cmd from tvm._ffi.libinfo import find_include_path -from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir +from tvm.micro import DEVICE_SECTIONS, LibType, get_micro_host_driven_dir, get_micro_device_dir _DEVICE_REGISTRY = {} @@ -38,7 +40,7 @@ def register_device(device_id, device_funcs): dictionary with compilation and config generation functions as values """ if device_id in _DEVICE_REGISTRY: - raise RuntimeError(f"\"{device_id}\" already exists in the device registry") + raise RuntimeError(f'"{device_id}" already exists in the device registry') _DEVICE_REGISTRY[device_id] = device_funcs @@ -56,7 +58,7 @@ def get_device_funcs(device_id): dictionary with compilation and config generation functions as values """ if device_id not in _DEVICE_REGISTRY: - raise RuntimeError(f"\"{device_id}\" does not exist in the binutil registry") + raise RuntimeError(f'"{device_id}" does not exist in the binutil registry') device_funcs = _DEVICE_REGISTRY[device_id] return device_funcs @@ -67,7 +69,9 @@ def create_micro_lib_base( toolchain_prefix, device_id, lib_type, - options=None): + options=None, + lib_src_paths=None, + ): """Compiles code into a binary for the target micro device. 
Parameters @@ -92,21 +96,35 @@ def create_micro_lib_base( options : List[str] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ + print('[MicroBinutil.create_lib]') + print(' EXTENDED OPTIONS') + print(f' {out_obj_path}') + print(f' {in_src_path}') + print(f' {lib_type}') + print(f' {options}') + # look at these (specifically `strip`): + # https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size base_compile_cmd = [ - f"{toolchain_prefix}gcc", - "-std=c11", - "-Wall", - "-Wextra", - "--pedantic", - "-c", - "-O0", - "-g", - "-nostartfiles", - "-nodefaultlibs", - "-nostdlib", - "-fdata-sections", - "-ffunction-sections", + f'{toolchain_prefix}gcc', + '-std=c11', + '-Wall', + '-Wextra', + '--pedantic', + '-c', + # TODO(weberlo): make a debug flag + '-O0', + # '-O2', + # '-Os', + '-g', + '-nostartfiles', + '-nodefaultlibs', + '-nostdlib', + '-fdata-sections', + '-ffunction-sections', ] if options is not None: base_compile_cmd += options @@ -114,49 +132,101 @@ def create_micro_lib_base( src_paths = [] include_paths = find_include_path() + [get_micro_host_driven_dir()] tmp_dir = _util.tempdir() - # we might transform the src path in one of the branches below + # we need to create a new src file in the operator branch new_in_src_path = in_src_path if lib_type == LibType.RUNTIME: dev_dir = _get_device_source_dir(device_id) - dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]") + + dev_src_paths = glob.glob(f'{dev_dir}/*.[csS]') # there needs to at least be a utvm_timer.c file assert dev_src_paths - assert "utvm_timer.c" in map(os.path.basename, dev_src_paths) + assert 'utvm_timer.c' in map(os.path.basename, dev_src_paths) + src_paths += dev_src_paths elif lib_type == LibType.OPERATOR: - # create a temporary copy of the source, so we can inject the dev lib + # create a temporary copy of the operator source, so we can inject the dev lib # header without modifying the original. 
- temp_src_path = tmp_dir.relpath("temp.c") - with open(in_src_path, "r") as f: + temp_src_path = tmp_dir.relpath('temp.c') + with open(in_src_path, 'r') as f: src_lines = f.read().splitlines() - src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") - with open(temp_src_path, "w") as f: - f.write("\n".join(src_lines)) + src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"') + with open(temp_src_path, 'w') as f: + f.write('\n'.join(src_lines)) new_in_src_path = temp_src_path - base_compile_cmd += ["-c"] else: - raise RuntimeError("unknown lib type") + raise RuntimeError('unknown lib type') src_paths += [new_in_src_path] + # add any src paths required by the operator + if lib_src_paths is not None: + src_paths += lib_src_paths + + print(f'include paths: {include_paths}') for path in include_paths: - base_compile_cmd += ["-I", path] + base_compile_cmd += ['-I', path] prereq_obj_paths = [] + print(src_paths) for src_path in src_paths: - curr_obj_path = Path(src_path).with_suffix(".o").name + curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name) assert curr_obj_path not in prereq_obj_paths prereq_obj_paths.append(curr_obj_path) - curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path] + curr_compile_cmd = base_compile_cmd + [src_path, '-o', curr_obj_path] + # TODO(weberlo): make compilation fail if there are any warnings run_cmd(curr_compile_cmd) - ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"] + ld_cmd = [f'{toolchain_prefix}ld', '-relocatable'] ld_cmd += prereq_obj_paths - ld_cmd += ["-o", out_obj_path] + ld_cmd += ['-o', out_obj_path] run_cmd(ld_cmd) +# TODO we shouldn't need an enum for this. too much bureaucracy. 
+class MemConstraint(enum.Enum): + """Represents a constraint on the device's memory layout""" + ABSOLUTE_BYTES = 0 + WEIGHT = 1 + + +def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): + print('[gen_mem_layout]') + byte_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.ABSOLUTE_BYTES, section_constraints.values()))) + weight_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.WEIGHT, section_constraints.values()))) + assert byte_sum <= available_mem + available_weight_mem = available_mem - byte_sum + + res = {} + curr_addr = base_addr + for section in DEVICE_SECTIONS: + (val, cons_type) = section_constraints[section] + if cons_type == MemConstraint.ABSOLUTE_BYTES: + assert val % word_size == 0, f'constraint {val} for {section} section is not word-aligned' + size = val + res[section] = { + 'start': curr_addr, + 'size': size, + } + else: + size = int((val / weight_sum) * available_weight_mem) + size = (size // word_size) * word_size + res[section] = { + 'start': curr_addr, + 'size': size, + } + curr_addr += size + + print(' result mem layout:') + for section in DEVICE_SECTIONS: + start = res[section]['start'] + size = res[section]['size'] + print(f' {section}: start={start:x}, size={size}') + # import pprint + # pprint.pprint(res) + return res + + def _get_device_source_dir(device_id): """Grabs the source directory for device-specific uTVM files""" - dev_subdir = "/".join(device_id.split(".")) - return get_micro_device_dir() + "/" + dev_subdir + dev_subdir = '/'.join(device_id.split('.')) + return get_micro_device_dir() + '/' + dev_subdir diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py index a5495b60cf99..737026dfd51a 100644 --- a/python/tvm/micro/device/host.py +++ b/python/tvm/micro/device/host.py @@ -17,12 +17,26 @@ """Compilation and config definitions for the host emulated device""" import sys -from . import create_micro_lib_base, register_device +from . 
import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = "host" -TOOLCHAIN_PREFIX = "" +DEVICE_ID = 'host' +TOOLCHAIN_PREFIX = '' +WORD_SIZE = 8 if sys.maxsize > 2**32 else 4 -def create_micro_lib(obj_path, src_path, lib_type, options=None): +# we pretend we only have 320kb in the default case, so we can use `gen_mem_layout` +DEFAULT_AVAILABLE_MEM = 3200000 +DEFAULT_SECTION_CONSTRAINTS = { + 'text': (20480, MemConstraint.ABSOLUTE_BYTES), + 'rodata': (20480, MemConstraint.ABSOLUTE_BYTES), + 'data': (768, MemConstraint.ABSOLUTE_BYTES), + 'bss': (4096, MemConstraint.ABSOLUTE_BYTES), + 'args': (4096, MemConstraint.ABSOLUTE_BYTES), + 'heap': (262144, MemConstraint.ABSOLUTE_BYTES), + 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), + 'stack': (80, MemConstraint.ABSOLUTE_BYTES), +} + +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -38,59 +52,62 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ if options is None: options = [] - if sys.maxsize > 2**32 and sys.platform.startswith("linux"): - options += ["-mcmodel=large"] + if sys.maxsize > 2**32 and sys.platform.startswith('linux'): + options += ['-mcmodel=large'] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths) -def default_config(): - """Generates a default configuration for the host emulated device +def generate_config(available_mem=None, section_constraints=None): + """Generates a configuration for the host emulated device + + Parameters + ---------- + TODO correct type annotation? + available_mem: int + TODO + + TODO correct type annotation? 
+ section_constraints: Optional[Dict[str, Dict[str, Number]]] + TODO Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ + if available_mem is None: + available_mem = DEFAULT_AVAILABLE_MEM + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS + mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE, section_constraints) + # TODO the host emulated device is an outlier, since we don't know how what + # its base address will be until we've created it in the C++. is there any + # way to change the infrastructure around this so it's not so much of an + # outlier? + + # need to zero out all start addresses, because they don't make sense for a + # host device (the memory region is allocated in the backend) + for section in mem_layout: + mem_layout[section]['start'] = 0 return { - "device_id": DEVICE_ID, - "toolchain_prefix": TOOLCHAIN_PREFIX, - "mem_layout": { - "text": { - "size": 20480, - }, - "rodata": { - "size": 20480, - }, - "data": { - "size": 768, - }, - "bss": { - "size": 768, - }, - "args": { - "size": 1280, - }, - "heap": { - "size": 262144, - }, - "workspace": { - "size": 20480, - }, - "stack": { - "size": 80, - }, - }, - "word_size": 8 if sys.maxsize > 2**32 else 4, - "thumb_mode": False, - "comms_method": "host", + 'device_id': DEVICE_ID, + 'toolchain_prefix': TOOLCHAIN_PREFIX, + 'mem_layout': mem_layout, + 'word_size': WORD_SIZE, + 'thumb_mode': False, + 'use_device_timer': False, + 'comms_method': 'host', } register_device(DEVICE_ID, { - "create_micro_lib": create_micro_lib, - "default_config": default_config, + 'create_micro_lib': create_micro_lib, + 'generate_config': generate_config, }) diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py index 923e5dfb23a2..b7beff3c5cb6 100644 --- a/python/tvm/micro/device/riscv_spike.py +++ b/python/tvm/micro/device/riscv_spike.py @@ -17,12 +17,24 @@ """Compilation and config definitions for Spike, a RISC-V functional 
ISA simulator""" from collections import OrderedDict -from . import create_micro_lib_base, register_device +from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = "riscv_spike" -TOOLCHAIN_PREFIX = "riscv64-unknown-elf-" +DEVICE_ID = 'riscv_spike' +TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-' +WORD_SIZE = 8 -def create_micro_lib(obj_path, src_path, lib_type, options=None): +DEFAULT_SECTION_CONSTRAINTS = { + 'text': (18000, MemConstraint.ABSOLUTE_BYTES), + 'rodata': (128, MemConstraint.ABSOLUTE_BYTES), + 'data': (128, MemConstraint.ABSOLUTE_BYTES), + 'bss': (2048, MemConstraint.ABSOLUTE_BYTES), + 'args': (4096, MemConstraint.ABSOLUTE_BYTES), + 'heap': (100.0, MemConstraint.WEIGHT), + 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), + 'stack': (32, MemConstraint.ABSOLUTE_BYTES), +} + +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -38,6 +50,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ create_micro_lib_base( obj_path, @@ -45,11 +60,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, - options=options) + options=options, + lib_src_paths=lib_src_paths + ) -def default_config(base_addr, server_addr, server_port): - """Generates a default configuration for Spike +def generate_config(base_addr, available_mem, server_addr, server_port, section_constraints=None): + """Generates a configuration for Spike Parameters ---------- @@ -62,56 +79,31 @@ def default_config(base_addr, server_addr, server_port): server_port : int port of OpenOCD server to connect to + TODO correct type annotation? 
+ section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]] + TODO + Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ - res = { - "device_id": DEVICE_ID, - "toolchain_prefix": TOOLCHAIN_PREFIX, - "mem_layout": OrderedDict([ - ("text", { - "size": 20480, - }), - ("rodata", { - "size": 20480, - }), - ("data", { - "size": 768, - }), - ("bss", { - "size": 768, - }), - ("args", { - "size": 1280, - }), - ("heap", { - "size": 262144, - }), - ("workspace", { - "size": 20480, - }), - ("stack", { - "size": 80, - }), - ]), - "word_size": 4, - "thumb_mode": True, - "comms_method": "openocd", - "server_addr": server_addr, - "server_port": server_port, + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS + return { + 'device_id': DEVICE_ID, + 'toolchain_prefix': TOOLCHAIN_PREFIX, + 'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE, section_constraints), + 'word_size': WORD_SIZE, + 'thumb_mode': False, + 'use_device_timer': False, + 'comms_method': 'openocd', + 'server_addr': server_addr, + 'server_port': server_port, } - # generate section start addresses from the given `base_addr` - curr_offset = 0 - mem_layout = res["mem_layout"] - for region_dict in mem_layout.values(): - region_dict["start"] = base_addr + curr_offset - curr_offset += region_dict["size"] - return res register_device(DEVICE_ID, { - "create_micro_lib": create_micro_lib, - "default_config": default_config, + 'create_micro_lib': create_micro_lib, + 'generate_config': generate_config, }) diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 4a73e572f924..42ced795f6ae 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -151,7 +151,9 @@ def __call__(self, args, attrs, type_args): "nn.dropout": op.nn.dropout_raw, "zeros": op.zeros, "split": op.split, - "cast": op.cast + "cast": op.cast, + "clip": op.clip, + "right_shift": op.right_shift, } TYPE_PREFIXES = [ @@ -340,7 +342,10 @@ 
def visitLocalVar(self, ctx): return local_var def visitGraphVar(self, ctx): - return self.graph_expr[int(ctx.NAT().getText())] + graph_var_idx = int(ctx.NAT().getText()) + if graph_var_idx >= len(self.graph_expr): + raise ParseError(f"graph var `%{graph_var_idx}` is unbound") + return self.graph_expr[graph_var_idx] def visit_list(self, ctx_list) -> List[Any]: """"Visit a list of contexts.""" @@ -625,6 +630,12 @@ def visitCallWithAttr(self, ctx: RelayParser.CallWithAttrContext): def call(self, func, args, attrs, type_args): if isinstance(func, OpWrapper): + #if hasattr(func.operator, '__name__') and func.operator.__name__ == 'clip': + # # TODO(wbelrlo) this big fucking hack yes + # import copy + # args = copy.deepcopy(args) + # args[1] = float(args[1].data.asnumpy()) + # args[2] = float(args[2].data.asnumpy()) return func(args, attrs, type_args) if isinstance(func, adt.Constructor): return func(*args) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 30c5971e32b9..459a4a588324 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -239,16 +239,26 @@ def build(mod, target=None, target_host=None, params=None): raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - # If current dispatch context is fallback context (the default root context), - # then load pre-tuned parameters from TopHub + # # If current dispatch context is fallback context (the default root context), + # # then load pre-tuned parameters from TopHub + # if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): + # tophub_context = autotvm.tophub.context(list(target.values())) + # else: + # tophub_context = autotvm.util.EmptyContext() + + # with tophub_context: + # bld_mod = BuildModule() + # graph_json, mod, params = bld_mod.build(func, target, target_host, params) + if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = 
autotvm.tophub.context(list(target.values())) + with tophub_context: + bld_mod = BuildModule() + graph_json, mod, params = bld_mod.build(func, target, target_host, params) else: - tophub_context = autotvm.util.EmptyContext() - - with tophub_context: bld_mod = BuildModule() - graph_json, mod, params = bld_mod.build(mod, target, target_host, params) + graph_json, mod, params = bld_mod.build(func, target, target_host, params) + return graph_json, mod, params diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 43065bef838a..d455f2229439 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -85,7 +85,7 @@ def _convert_activation(inexpr, keras_layer, _): return _op.sigmoid(inexpr) if act_type == 'tanh': return _op.tanh(inexpr) - if act_type == 'relu': + if act_type in ('relu', 'swish'): return _op.nn.relu(inexpr) if act_type == 'softplus': return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32'))) diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index ed57e0d4276d..f5a16941083b 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -70,6 +70,11 @@ def context(self, dev_type, dev_id=0): ctx: TVMContext The corresponding encoded remote context. """ + print('[RPCSession.context]') + print(f' dev_type: {dev_type}') + print(f' dev_id: {dev_id}') + if '-device=micro_dev' in dev_type: + dev_type = 'micro_dev' ctx = nd.context(dev_type, dev_id) encode = (self._tbl_index + 1) * base.RPC_SESS_MASK ctx.device_type += encode @@ -403,9 +408,14 @@ def connect(url, port, key="", session_timeout=0): The connected session. 
""" try: + print('[client.connect]') + #session_timeout = 0 + print(' hardcoding timeout to 0 (always keep alive)!') if session_timeout: key += " -timeout=%s" % str(session_timeout) + print(f' connecting to RPC server with {url}, {port}, {key}') sess = base._Connect(url, port, key) + print(f' finished connecting!') except NameError: raise RuntimeError("Please compile with USE_RPC=1") return RPCSession(sess) diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index 627d67a0a835..74296cc2ceaa 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -64,7 +64,9 @@ def get_workpath(path): def load_module(file_name): """Load module from remote side.""" path = temp.relpath(file_name) + print('[rpc.server] ABOUT TO LOAD MOD') m = _load_module(path) + print('[rpc.server] DONE LOADING MOD') logger.info("load_module %s", path) return m @@ -325,7 +327,10 @@ def __init__(self, key="", load_library=None, custom_addr=None, - silent=False): + silent=False, + utvm_dev_id=None, + utvm_dev_config_args=None, + ): try: if base._ServerLoop is None: raise RuntimeError("Please compile with USE_RPC=1") @@ -355,6 +360,10 @@ def __init__(self, cmd += ["--custom-addr", custom_addr] if silent: cmd += ["--silent"] + if utvm_dev_id is not None: + assert utvm_dev_config_args is not None + cmd += [f"--utvm-dev-id={utvm_dev_id}"] + cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"] # prexec_fn is not thread safe and may result in deadlock. 
# python 3.2 introduced the start_new_session parameter as diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index e3346b162aaf..001c76c76fef 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -297,8 +297,11 @@ def _event_handler(_, events): def _on_event(self, _): while True: try: + print('waiting for connection!') conn, addr = self._sock.accept() + print(f'got new conn: {conn}, {addr}') TCPEventHandler(self, conn, addr) + print(f'made event handler') except socket.error as err: if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK): break diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 7845a26bfca2..45dd79e50019 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -109,7 +109,6 @@ def __call__(self, *args): # pylint: disable=not-callable return self.entry_func(*args) - def __repr__(self): return "Module(%s, %x)" % (self.type_key, self.handle.value) @@ -212,6 +211,7 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): def evaluator(*args): """Internal wrapped evaluator.""" + print('[Module.time_evaluator.evaluator]') # Wrap feval so we can add more stats in future. blob = feval(*args) fmt = "@" + ("d" * repeat) diff --git a/src/ir/error.cc b/src/ir/error.cc index 9d498288d2ba..e700691f3e07 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -62,16 +62,23 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { CHECK(has_errs != this->node_to_error_.end()); - const auto& error_indicies = has_errs->second; + const auto& error_indices = has_errs->second; std::stringstream err_msg; - err_msg << rang::fg::red; - err_msg << " "; - for (auto index : error_indicies) { - err_msg << this->errors_[index].what() << "; "; + if (error_indices.size() != 0) { + err_msg << rang::fg::red; + err_msg << " "; + // TODO should fix this reverse problem further upstream (in the error reporter). 
+ // + // the errors are in reverse order, so print them with a reversed iteration + err_msg << this->errors_[error_indices[error_indices.size()-1]].what(); + for (int i = error_indices.size() - 2; i >= 0; i--) { + size_t err_idx = error_indices[i]; + err_msg << "; " << this->errors_[err_idx].what(); + } + err_msg << rang::fg::reset; } - err_msg << rang::fg::reset; // Setup error map. auto it = error_maps.find(global); diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s index 300deb8079a0..5861c0326dbf 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s @@ -18,6 +18,7 @@ */ /*! + * Copyright (c) 2019 by Contributors * \file utvm_init.s * \brief uTVM init definition for STM32F746XX-series boards */ diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c index 1b8376150fce..a5a12cf86ebd 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c @@ -18,6 +18,7 @@ */ /*! 
+ * Copyright (c) 2019 by Contributors * \file utvm_timer.c * \brief uTVM timer API definitions for STM32F746XX-series boards */ @@ -51,34 +52,31 @@ extern "C" { #define SYST_CALIB_NOREF 31 #define SYST_CALIB_SKEW 30 -uint32_t start_time = 0; -uint32_t stop_time = 0; +volatile uint32_t start_time = 0; +volatile uint32_t stop_time = 0; int32_t UTVMTimerStart() { + SYST_CSR = 0; + // maximum reload value (24-bit) + SYST_RVR = (~((uint32_t) 0)) >> 8; + SYST_CVR = 0; + SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE); // wait until timer starts while (SYST_CVR == 0) {} start_time = SYST_CVR; - return 0; + return UTVM_ERR_OK; } -void UTVMTimerStop() { - SYST_CSR = 0; +uint32_t UTVMTimerStop(int32_t *err) { + SYST_CSR &= ~((uint32_t) 1); stop_time = SYST_CVR; -} - -void UTVMTimerReset() { - SYST_CSR = 0; - // maximum reload value (24-bit) - SYST_RVR = (~((uint32_t) 0)) >> 8; - SYST_CVR = 0; -} - -uint32_t UTVMTimerRead() { - if (SYST_CSR & SYST_COUNTFLAG) { + if (SYST_CSR & (1 << SYST_COUNTFLAG)) { TVMAPISetLastError("timer overflowed"); - return -1; + *err = UTVM_ERR_TIMER_OVERFLOW; + return 0; } else { + *err = UTVM_ERR_OK; return start_time - stop_time; } } @@ -91,33 +89,33 @@ uint32_t UTVMTimerRead() { #define DWT_CTRL_NOCYCCNT 25 #define DWT_CTRL_CYCCNTENA 0 -uint32_t start_time = 0; -uint32_t stop_time = 0; +volatile uint32_t start_time = 0; +volatile uint32_t stop_time = 0; -void UTVMTimerReset() { +int32_t UTVMTimerStart() { + DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA); DWT_CYCCNT = 0; -} -int32_t UTVMTimerStart() { - if (DWT_CTRL & DWT_CTRL_NOCYCCNT) { + if (DWT_CTRL & (1 << DWT_CTRL_NOCYCCNT)) { TVMAPISetLastError("cycle counter not implemented on device"); - return -1; + return UTVM_ERR_TIMER_NOT_IMPLEMENTED; } start_time = DWT_CYCCNT; DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA); + return UTVM_ERR_OK; } -void UTVMTimerStop() { +uint32_t UTVMTimerStop(int32_t* err) { stop_time = DWT_CYCCNT; DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA); -} - -int32_t 
UTVMTimerRead() { - if (stop_time > stop_time) { + // even with this check, we can't know for sure if the timer has overflowed + // (it may have overflowed and gone past `start_time`). + if (stop_time > start_time) { + *err = UTVM_ERR_OK; return stop_time - start_time; } else { - uint32_t largest = ~0; - return (largest - start_time) + stop_time; + *err = UTVM_ERR_TIMER_OVERFLOW; + return 0; } } diff --git a/src/runtime/micro/device/host/utvm_init.c b/src/runtime/micro/device/host/utvm_init.c index 4fb43c11d20e..02888f04c48e 100644 --- a/src/runtime/micro/device/host/utvm_init.c +++ b/src/runtime/micro/device/host/utvm_init.c @@ -18,6 +18,7 @@ */ /*! + * Copyright (c) 2019 by Contributors * \file utvm_init.c * \brief uTVM init definition for the host emulated device */ diff --git a/src/runtime/micro/device/host/utvm_timer.c b/src/runtime/micro/device/host/utvm_timer.c index 56a36ebae86d..7b24aab473d1 100644 --- a/src/runtime/micro/device/host/utvm_timer.c +++ b/src/runtime/micro/device/host/utvm_timer.c @@ -20,28 +20,34 @@ /*! * \file utvm_timer.c * \brief uTVM timer API stubs for the host emulated device + * Copyright (c) 2019 by Contributors */ -#ifdef __cplusplus -extern "C" { -#endif - -#include "utvm_runtime.h" +#include +#include +#include // TODO(weberlo): use this? 
https://stackoverflow.com/questions/5141960/get-the-current-time-in-c int32_t UTVMTimerStart() { + return UTVM_ERR_OK; +} + +uint32_t UTVMTimerStop(int32_t* err) { + *err = UTVM_ERR_OK; return 0; } -void UTVMTimerStop() { } +extern void UTVMInit(); -void UTVMTimerReset() { } +extern void UTVMTimerReset(); -uint32_t UTVMTimerRead() { - return 1; -} +extern int32_t UTVMTimerStart(); + +extern void UTVMTimerStop(); + +extern uint32_t UTVMTimerRead(); + +void UTVMMain(); -#ifdef __cplusplus -} // TVM_EXTERN_C -#endif +void UTVMDone(); diff --git a/src/runtime/micro/device/riscv_spike/utvm_init.s b/src/runtime/micro/device/riscv_spike/utvm_init.s new file mode 100644 index 000000000000..a73641249165 --- /dev/null +++ b/src/runtime/micro/device/riscv_spike/utvm_init.s @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file utvm_init.s + * \brief uTVM init definition for Spike + */ + +UTVMInit: + /* set stack pointer */ + la sp, _utvm_stack_pointer_init + call UTVMMain diff --git a/src/runtime/micro/device/riscv_spike/utvm_timer.c b/src/runtime/micro/device/riscv_spike/utvm_timer.c new file mode 100644 index 000000000000..c4e0af2b230b --- /dev/null +++ b/src/runtime/micro/device/riscv_spike/utvm_timer.c @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file utvm_timer.c + * \brief uTVM timer API stubs for Spike + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "utvm_runtime.h" + +int32_t UTVMTimerStart() { + return UTVM_ERR_OK; +} + +uint32_t UTVMTimerStop(int32_t* err) { + *err = UTVM_ERR_OK; + return 0; +} + +#ifdef __cplusplus +} // TVM_EXTERN_C +#endif diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c index a8c600ed347b..970eb27a1ef3 100644 --- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c +++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c @@ -32,10 +32,11 @@ extern "C" { #include #include -void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = +// TODO compiler errors say volatile qualifier is discarded. should we just get rid of em? +volatile void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = (void *(*)(int, int, uint64_t, int, int)) NULL; -int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; -void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; +volatile int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; +volatile void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { @@ -51,6 +52,39 @@ void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } +void *memset(void *s, int c, size_t n) { + char *p = (char*) s; + while (n > 0) { + *p = (char) c; + p++; + n--; + } + return s; +} + +void *memmove(void *to, const void *from, size_t n) { + // TODO will need to factor memmove calls into workspace size calculation + char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8); + if (temp == NULL) { + return NULL; + } + + const char *from_pp = (char*) from; + for (size_t i = 0; i 
< n; i++) { + temp[i] = from_pp[i]; + } + char *to_pp = (char*) to; + for (size_t i = 0; i < n; i++) { + to_pp[i] = temp[i]; + } + + if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) { + return NULL; + } + + return to; +} + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h index c364ecf40792..b39309a784f5 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.h +++ b/src/runtime/micro/host_driven/utvm_runtime.h @@ -18,6 +18,7 @@ */ /*! + * Copyright (c) 2019 by Contributors * \file utvm_runtime.h * \brief uTVM runtime headers */ @@ -32,6 +33,22 @@ extern "C" { #include #include +/*! + * \brief TODO + */ +enum UTVMReturnCode { + UTVM_ERR_OK = 0, + UTVM_ERR_NOT_FINISHED = -1, + UTVM_ERR_TIMER_NOT_IMPLEMENTED = -2, + UTVM_ERR_TIMER_OVERFLOW = -3, + UTVM_ERR_WS_DOUBLE_FREE = -4, + UTVM_ERR_WS_OUT_OF_SPACE = -5, + UTVM_ERR_WS_TOO_MANY_ALLOCS = -6, + UTVM_ERR_WS_ZERO_SIZE_ALLOC = -7, + UTVM_ERR_WS_UNALIGNED_START = -8, + UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE = -9, +}; + /*! * \brief Task structure for uTVM */ @@ -46,20 +63,38 @@ typedef struct { int32_t num_args; } UTVMTask; +/*! + * \brief TODO + */ extern void UTVMInit(); -extern void UTVMTimerReset(); - +/*! + * \brief TODO + */ extern int32_t UTVMTimerStart(); -extern void UTVMTimerStop(); - -extern uint32_t UTVMTimerRead(); +/*! + * \brief TODO + */ +extern uint32_t UTVMTimerStop(int32_t* err); +/*! + * \brief TODO + */ void UTVMMain(); +/*! + * \brief TODO + */ void UTVMDone(); +// GCC -O3 begins to inject memset and memmove calls, so we provide impls in +// the runtime for this case and for general usage. 
+ +void *memset(void *s, int c, size_t n); + +void *memmove(void *to, const void *from, size_t n); + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 4a0189b3e89e..e6696f56f06c 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -60,20 +60,21 @@ union TargetVal { uint64_t val64; }; +// TODO just get rid of `DevPtr`. /*! \brief absolute device address */ -class DevPtr { +class TargetPtr { public: /*! \brief construct a device address with value `value` */ - explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {} + explicit TargetPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {} /*! \brief default constructor */ - DevPtr() : value_(TargetVal { .val64 = 0 }) {} + TargetPtr() : value_(TargetVal { .val64 = 0 }) {} /*! \brief construct a null address */ - explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {} + explicit TargetPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {} /*! \brief destructor */ - ~DevPtr() {} + ~TargetPtr() {} /*! * \brief get value of pointer @@ -95,23 +96,23 @@ class DevPtr { bool operator!=(std::nullptr_t) const { return value_.val64 != 0; } /*! \brief add an integer to this absolute address to get a larger absolute address */ - DevPtr operator+(size_t n) const { - return DevPtr(value_.val64 + n); + TargetPtr operator+(size_t n) const { + return TargetPtr(value_.val64 + n); } /*! \brief mutably add an integer to this absolute address */ - DevPtr& operator+=(size_t n) { + TargetPtr& operator+=(size_t n) { value_.val64 += n; return *this; } /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ - DevPtr operator-(size_t n) const { - return DevPtr(value_.val64 - n); + TargetPtr operator-(size_t n) const { + return TargetPtr(value_.val64 - n); } /*! 
\brief mutably subtract an integer from this absolute address */ - DevPtr& operator-=(size_t n) { + TargetPtr& operator-=(size_t n) { value_.val64 -= n; return *this; } @@ -152,7 +153,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = DevPtr(addr); + map_[name] = TargetPtr(addr); stream >> name; stream >> std::hex >> addr; } @@ -163,7 +164,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - DevPtr operator[](const std::string& name) const { + TargetPtr operator[](const std::string& name) const { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; @@ -175,13 +176,13 @@ class SymbolMap { private: /*! \brief backing map */ - std::unordered_map map_; + std::unordered_map map_; }; /*! \brief struct containing start and size of a device memory region */ struct DevMemRegion { /*! \brief section start offset */ - DevPtr start; + TargetPtr start; /*! \brief size of section */ size_t size; }; @@ -240,11 +241,11 @@ const char* SectionToString(SectionKind section); std::string RelocateBinarySections( const std::string& binary_path, size_t word_size, - DevPtr text_start, - DevPtr rodata_start, - DevPtr data_start, - DevPtr bss_start, - DevPtr stack_end, + TargetPtr text_start, + TargetPtr rodata_start, + TargetPtr data_start, + TargetPtr bss_start, + TargetPtr stack_end, const std::string& toolchain_prefix); /*! 
diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 3d0a6889c4f7..fbaef4af5b07 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -74,8 +74,10 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { + std::cout << "[MicroDeviceAPI::CopyDataFromTo]" << std::endl; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { + std::cout << " device to device" << std::endl; // Copying from the device to the device. MicroDevSpace* from_space = static_cast(const_cast(from)); @@ -87,6 +89,8 @@ class MicroDeviceAPI final : public DeviceAPI { CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; ObjectPtr& session = from_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); @@ -96,20 +100,27 @@ class MicroDeviceAPI final : public DeviceAPI { lld->Read(from_dev_addr, static_cast(buffer.data()), size); lld->Write(to_dev_addr, static_cast(buffer.data()), size); } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { + std::cout << " reading from device" << std::endl; + std::cout << " num_bytes: " << size << std::endl; // Reading from the device. 
MicroDevSpace* from_space = static_cast(const_cast(from)); ObjectPtr& session = from_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); - void* to_host_ptr = GetHostLoc(to, to_offset); lld->Read(from_dev_addr, to_host_ptr, size); } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { + std::cout << " writing to device" << std::endl; + std::cout << " num_bytes: " << size << std::endl; // Writing to the device. MicroDevSpace* to_space = static_cast(const_cast(to)); ObjectPtr& session = to_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); void* from_host_ptr = GetHostLoc(from, from_offset); @@ -121,9 +132,12 @@ class MicroDeviceAPI final : public DeviceAPI { } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { + std::cout << "[MicroDeviceAPI::StreamSync]" << std::endl; + MicroSession::Current()->FlushTaskQueue(); } void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { + CHECK(false) << "the on-device workspace allocator isn't aware of this function"; ObjectPtr& session = MicroSession::Current(); void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); @@ -135,6 +149,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void FreeWorkspace(TVMContext ctx, void* data) final { + CHECK(false) << "the on-device workspace allocator isn't aware of this function"; MicroDevSpace* dev_space = static_cast(data); ObjectPtr& session = dev_space->session; session->FreeInSection(SectionKind::kWorkspace, diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 50cee34be4a6..b27bd923c638 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -54,6 +54,8 @@ class 
MicroModuleNode final : public ModuleNode { * \param binary_path path of the binary to be loaded */ void InitMicroModule(const std::string& binary_path) { + std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl; + std::cout << " start" << std::endl; session_ = MicroSession::Current(); symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map; } @@ -73,7 +75,7 @@ class MicroWrappedFunc { } void operator()(TVMArgs args, TVMRetValue* rv) const { - *rv = session_->PushToExecQueue(func_ptr_, args); + *rv = session_->PushToTaskQueue(func_ptr_, args); } private: diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h index 5c75f92737ab..4e8f7201ad7a 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -38,8 +38,9 @@ class MicroSectionAllocator { * \brief constructor that specifies section boundaries * \param region location and size of the section on the device */ - explicit MicroSectionAllocator(DevMemRegion region, size_t word_size) - : start_addr_(region.start), + explicit MicroSectionAllocator(std::string section_name, DevMemRegion region, size_t word_size) + : section_name_(section_name), + start_addr_(region.start), size_(0), capacity_(region.size), word_size_(word_size) { @@ -56,7 +57,7 @@ class MicroSectionAllocator { /*! * \brief memory allocator - * \param size size of allocated memory in bytes + * \param alloc_size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ DevPtr Allocate(size_t size) { @@ -110,6 +111,8 @@ class MicroSectionAllocator { size_t capacity() const { return capacity_; } private: + /*! \brief name of the section (for debugging) */ + std::string section_name_; /*! \brief start address of the section */ DevPtr start_addr_; /*! 
\brief current size of the section */ diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 4bdc8ed69797..e6cdea5d1c88 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -79,11 +80,14 @@ MicroSession::MicroSession( size_t stack_size, size_t word_size, bool thumb_mode, + bool use_device_timer, const std::string& server_addr, int port) - : toolchain_prefix_(toolchain_prefix) - , word_size_(word_size) - , thumb_mode_(thumb_mode) { + : toolchain_prefix_(toolchain_prefix), + word_size_(word_size), + thumb_mode_(thumb_mode), + use_device_timer_(use_device_timer), + batch_args_encoder_(args_size, word_size) { CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_; if (comms_method == "host") { // TODO(weberlo): move checks to python @@ -105,84 +109,129 @@ MicroSession::MicroSession( << "base address not aligned to " << word_size_ << " bytes"; DevPtr curr_addr = DevPtr(reinterpret_cast(base_addr)); - section_allocators_[0] = std::make_shared(DevMemRegion { + section_allocators_[0] = std::make_shared( + "text", + DevMemRegion { .start = curr_addr, .size = text_size, - }, word_size_); + }, word_size_); curr_addr += text_size; - section_allocators_[1] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = rodata_size, - }, word_size_); + section_allocators_[1] = std::make_shared( + "rodata", + DevMemRegion { + .start = curr_addr, + .size = rodata_size, + }, word_size_); curr_addr += rodata_size; - section_allocators_[2] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = data_size, - }, word_size_); + section_allocators_[2] = std::make_shared( + "data", + DevMemRegion { + .start = curr_addr, + .size = data_size, + }, word_size_); curr_addr += data_size; - section_allocators_[3] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = bss_size, - }, word_size_); + 
section_allocators_[3] = std::make_shared( + "bss", + DevMemRegion { + .start = curr_addr, + .size = bss_size, + }, word_size_); curr_addr += bss_size; - section_allocators_[4] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = args_size, - }, word_size_); + section_allocators_[4] = std::make_shared( + "args", + DevMemRegion { + .start = curr_addr, + .size = args_size, + }, word_size_); curr_addr += args_size; - section_allocators_[5] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = heap_size, - }, word_size_); + section_allocators_[5] = std::make_shared( + "heap", + DevMemRegion { + .start = curr_addr, + .size = heap_size, + }, word_size_); curr_addr += heap_size; - section_allocators_[6] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = workspace_size, - }, word_size_); + section_allocators_[6] = std::make_shared( + "workspace", + DevMemRegion { + .start = curr_addr, + .size = workspace_size, + }, word_size_); curr_addr += workspace_size; - section_allocators_[7] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = stack_size, - }, word_size_); + section_allocators_[7] = std::make_shared( + "stack", + DevMemRegion { + .start = curr_addr, + .size = stack_size, + }, word_size_); curr_addr += stack_size; } else if (comms_method == "openocd") { low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port); - section_allocators_[0] = std::make_shared(DevMemRegion { - .start = DevPtr(text_start), - .size = text_size, - }, word_size_); - section_allocators_[1] = std::make_shared(DevMemRegion { - .start = DevPtr(rodata_start), - .size = rodata_size, - }, word_size_); - section_allocators_[2] = std::make_shared(DevMemRegion { - .start = DevPtr(data_start), - .size = data_size, - }, word_size_); - section_allocators_[3] = std::make_shared(DevMemRegion { - .start = DevPtr(bss_start), - .size = bss_size, - }, word_size_); - section_allocators_[4] = std::make_shared(DevMemRegion { - .start = DevPtr(args_start), 
- .size = args_size, - }, word_size_); - section_allocators_[5] = std::make_shared(DevMemRegion { - .start = DevPtr(heap_start), - .size = heap_size, - }, word_size_); - section_allocators_[6] = std::make_shared(DevMemRegion { - .start = DevPtr(workspace_start), - .size = workspace_size, - }, word_size_); - section_allocators_[7] = std::make_shared(DevMemRegion { - .start = DevPtr(stack_start), - .size = stack_size, - }, word_size_); + section_allocators_[0] = std::make_shared( + "text", + DevMemRegion { + .start = DevPtr(text_start), + .size = text_size, + }, word_size_); + section_allocators_[1] = std::make_shared( + "rodata", + DevMemRegion { + .start = DevPtr(rodata_start), + .size = rodata_size, + }, word_size_); + section_allocators_[2] = std::make_shared( + "data", + DevMemRegion { + .start = DevPtr(data_start), + .size = data_size, + }, word_size_); + section_allocators_[3] = std::make_shared( + "bss", + DevMemRegion { + .start = DevPtr(bss_start), + .size = bss_size, + }, word_size_); + section_allocators_[4] = std::make_shared( + "args", + DevMemRegion { + .start = DevPtr(args_start), + .size = args_size, + }, word_size_); + section_allocators_[5] = std::make_shared( + "heap", + DevMemRegion { + .start = DevPtr(heap_start), + .size = heap_size, + }, word_size_); + section_allocators_[6] = std::make_shared( + "workspace", + DevMemRegion { + .start = DevPtr(workspace_start), + .size = workspace_size, + }, word_size_); + section_allocators_[7] = std::make_shared( + "stack", + DevMemRegion { + .start = DevPtr(stack_start), + .size = stack_size, + }, word_size_); } else { LOG(FATAL) << "unsupported micro low-level device"; } + std::cout << "[Memory Layout]" << std::endl; + std::cout << " text (size = " << (section_allocators_[0]->capacity() / 1000.0) << " KB): " << section_allocators_[0]->start_addr().cast_to() << std::endl; + std::cout << " rodata (size = " << (section_allocators_[1]->capacity() / 1000.0) << " KB): " << 
section_allocators_[1]->start_addr().cast_to() << std::endl; + std::cout << " data (size = " << (section_allocators_[2]->capacity() / 1000.0) << " KB): " << section_allocators_[2]->start_addr().cast_to() << std::endl; + std::cout << " bss (size = " << (section_allocators_[3]->capacity() / 1000.0) << " KB): " << section_allocators_[3]->start_addr().cast_to() << std::endl; + std::cout << " args (size = " << (section_allocators_[4]->capacity() / 1000.0) << " KB): " << section_allocators_[4]->start_addr().cast_to() << std::endl; + std::cout << " heap (size = " << (section_allocators_[5]->capacity() / 1000.0) << " KB): " << section_allocators_[5]->start_addr().cast_to() << std::endl; + std::cout << " workspace (size = " << (section_allocators_[6]->capacity() / 1000.0) << " KB): " << section_allocators_[6]->start_addr().cast_to() << std::endl; + std::cout << " stack (size = " << (section_allocators_[7]->capacity() / 1000.0) << " KB): " << section_allocators_[7]->start_addr().cast_to() << std::endl; + + DevPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr(); + batch_args_encoder_.set_start_addr(args_start_addr); + runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map; // Patch pointers to define the bounds of the workspace section and the word @@ -209,54 +258,80 @@ MicroSession::~MicroSession() { low_level_device_ = nullptr; } -double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) { +void MicroSession::PushToTaskQueue(DevPtr func_ptr, const TVMArgs& args) { + std::cout << "[MicroSession::PushToTaskQueue]" << std::endl; + std::cout << " pushed func ptr: " << func_ptr.cast_to() << std::endl; if (thumb_mode_) { - func_ptr += 1; + func_ptr |= 1; } + DevVal func_dev_addr = func_ptr.value(); + + std::tuple arg_field_addrs = EncoderAppend(&batch_args_encoder_, args); + DevVal arg_values_dev_addr = { .val64 = std::get<0>(arg_field_addrs).value() }; + DevVal arg_type_codes_dev_addr = { .val64 = std::get<1>(arg_field_addrs).value() }; 
+ + task_queue_.push_back( + DevTask { + .func = func_dev_addr, + .arg_values = arg_values_dev_addr, + .arg_type_codes = arg_type_codes_dev_addr, + .num_args = args.num_args + }); + + if (task_queue_.size() == MicroSession::kTaskQueueCapacity) { + FlushTaskQueue(); + } +} - // Create an allocator stream for the memory region after the most recent - // allocation in the args section. - DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr(); - TargetDataLayoutEncoder encoder(args_addr, word_size_); - - std::tuple arg_field_addrs = EncoderAppend(&encoder, args); - - // Flush `stream` to device memory. - DevPtr stream_dev_addr = - GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); - low_level_device()->Write(stream_dev_addr, - reinterpret_cast(encoder.data()), - encoder.buf_size()); - - TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value(); - TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value(); +void MicroSession::FlushTaskQueue() { + if (task_queue_.size() == 0) { + // nothing to run + return; + } if (word_size_ == 4) { - UTVMTask32 task = { - .func = func_ptr.value().val32, - .arg_values = arg_values_dev_addr.val32, - .arg_type_codes = arg_type_codes_dev_addr.val32, - .num_args = args.num_args, - }; - // Write the task. - DevSymbolWrite(runtime_symbol_map_, "utvm_task", task); + FlushTaskQueuePriv(); } else if (word_size_ == 8) { - UTVMTask64 task = { - .func = func_ptr.value().val64, - .arg_values = arg_values_dev_addr.val64, - .arg_type_codes = arg_type_codes_dev_addr.val64, - .num_args = args.num_args, - }; - // Write the task. - DevSymbolWrite(runtime_symbol_map_, "utvm_task", task); + FlushTaskQueuePriv(); + } +} + +template +void MicroSession::FlushTaskQueuePriv() { + std::cout << "[MicroSession::FlushTaskQueue]" << std::endl; + std::vector prepped_tasks; + for (const auto& task : task_queue_) { + prepped_tasks.push_back(T(task)); } + // Flush `args` to device memory. 
+ low_level_device()->Write( + batch_args_encoder_.start_addr(), + reinterpret_cast(batch_args_encoder_.data()), + batch_args_encoder_.buf_size()); + + // Flush `tasks` to device memory. + DevPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"]; + low_level_device()->Write( + dev_tasks_addr, + reinterpret_cast(prepped_tasks.data()), + prepped_tasks.size() * sizeof(T)); + DevSymbolWrite(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size()); + DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; if (thumb_mode_) { - utvm_init_addr += 1; + utvm_init_addr |= 1; } + std::chrono::time_point< + std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend; + tbegin = std::chrono::high_resolution_clock::now(); + // std::cout << " do execution things: "; + // char tmp; + // std::cin >> tmp; low_level_device()->Execute(utvm_init_addr, utvm_done_addr); + tend = std::chrono::high_resolution_clock::now(); + // Check if there was an error during execution. If so, log it. 
CheckDeviceError(); uint32_t task_time = DevSymbolRead(runtime_symbol_map_, "utvm_task_time"); @@ -314,6 +389,88 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d PatchImplHole(symbol_map, "TVMAPISetLastError"); } + if (use_device_timer_) { + uint64_t sum = 0; + std::vector times; + times.resize(task_queue_.size()); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t)); + for (uint32_t time : times) { + sum += time; + } + last_batch_time_ += static_cast(sum); + } else { + last_batch_time_ += std::chrono::duration_cast > + (tend - tbegin).count() * 1000; + // TODO fukn hack + uint64_t sum = 0; + std::vector times; + times.resize(task_queue_.size()); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t)); + for (uint32_t time : times) { + sum += time; + } + last_batch_cycles_ += static_cast(sum); + } + + batch_args_encoder_.Clear(); + task_queue_.clear(); +} + +BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { + std::cout << "[MicroSession::LoadBinary]" << std::endl; + DevMemRegion text_section; + DevMemRegion rodata_section; + DevMemRegion data_section; + DevMemRegion bss_section; + + text_section.size = GetSectionSize( + binary_path, SectionKind::kText, toolchain_prefix_, word_size_); + rodata_section.size = GetSectionSize( + binary_path, SectionKind::kRodata, toolchain_prefix_, word_size_); + data_section.size = GetSectionSize( + binary_path, SectionKind::kData, toolchain_prefix_, word_size_); + bss_section.size = GetSectionSize( + binary_path, SectionKind::kBss, toolchain_prefix_, word_size_); + std::cout << " text_section.size: " << text_section.size << std::endl; + std::cout << " rodata_section.size: " << rodata_section.size << std::endl; + std::cout << " data_section.size: " << data_section.size << std::endl; + std::cout << " bss_section.size: " << 
bss_section.size << std::endl; + + text_section.start = AllocateInSection(SectionKind::kText, text_section.size); + rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); + data_section.start = AllocateInSection(SectionKind::kData, data_section.size); + bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); + CHECK(text_section.start != nullptr && rodata_section.start != nullptr && + data_section.start != nullptr && bss_section.start != nullptr) + << "not enough space to load module on device"; + + std::string relocated_bin = RelocateBinarySections( + binary_path, + word_size_, + text_section.start, + rodata_section.start, + data_section.start, + bss_section.start, + GetAllocator(SectionKind::kStack)->max_addr(), + toolchain_prefix_); + std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); + std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); + std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_); + std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_); + + low_level_device_->Write(text_section.start, &text_contents[0], text_section.size); + low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); + low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); + low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); + SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; + + if (patch_dylib_pointers) { + // Patch device lib pointers. 
+ PatchImplHole(symbol_map, "TVMBackendAllocWorkspace"); + PatchImplHole(symbol_map, "TVMBackendFreeWorkspace"); + PatchImplHole(symbol_map, "TVMAPISetLastError"); + } + return BinaryInfo { .text_section = text_section, .rodata_section = rodata_section, @@ -325,6 +482,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d std::tuple MicroSession::EncoderAppend( TargetDataLayoutEncoder* encoder, const TVMArgs& args) { + std::cout << "[MicroSession::EncoderAppend(TVMArgs)]" << std::endl; const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -404,21 +562,56 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen return tvm_arr_slot.start_addr(); } +// TODO(weberlo): switch over entirely to error codes that expand to error +// messages on the host side. void MicroSession::CheckDeviceError() { - int32_t return_code = DevSymbolRead(runtime_symbol_map_, "utvm_return_code"); - - if (return_code) { - std::uintptr_t last_error = - DevSymbolRead(runtime_symbol_map_, "utvm_last_error"); - std::string last_error_str; - if (last_error) { - DevPtr last_err_addr = DevPtr(last_error); - last_error_str = ReadString(last_err_addr); + int32_t last_error = DevSymbolRead(runtime_symbol_map_, "utvm_last_error"); + + if (last_error) { + if (!use_device_timer_ && + (last_error == UTVM_ERR_TIMER_OVERFLOW || + last_error == UTVM_ERR_TIMER_NOT_IMPLEMENTED)) { + // these errors don't matter if we're not using the on-device timer + return; + } + std::string err_msg; + switch(last_error) { + case UTVM_ERR_NOT_FINISHED: + err_msg = "execution timed out"; + break; + case UTVM_ERR_TIMER_NOT_IMPLEMENTED: + err_msg = "timer is not implemented for the target device"; + break; + case UTVM_ERR_TIMER_OVERFLOW: + // TODO this should be remedied by using interrupts to accumulate the + // timer into a larger datatype (ARM timers are only 24 bits) + err_msg = "timer overflowed during execution"; + break; + case 
UTVM_ERR_WS_DOUBLE_FREE: + err_msg = "free called with no active workspace allocations"; + break; + case UTVM_ERR_WS_OUT_OF_SPACE: + err_msg = "ran out of space in workspace section"; + break; + case UTVM_ERR_WS_TOO_MANY_ALLOCS: + err_msg = "exceeded number of allocs the runtime can keep track of"; + break; + case UTVM_ERR_WS_ZERO_SIZE_ALLOC: + err_msg = "attempt to allocate scratchpad of size zero"; + break; + case UTVM_ERR_WS_UNALIGNED_START: + err_msg = "start of workspace section is not word-aligned"; + break; + case UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE: + err_msg = "scratchpad allocation size is not a multiple of the word size"; + break; + default: + err_msg = "unknown error code"; + break; } LOG(FATAL) << "error during micro function execution:\n" - << " return code: " << std::dec << return_code << "\n" - << " dev str addr: 0x" << std::hex << last_error << "\n" - << " dev str data: " << last_error_str << std::endl; + << " error ID: " << std::dec << last_error << std::endl + << " error message: " << err_msg; } } @@ -455,10 +648,22 @@ std::string MicroSession::ReadString(DevPtr str_addr) { } DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { - return GetAllocator(type)->Allocate(size); + if (type == SectionKind::kHeap) { + std::cout << "[MicroSession::AllocateInSection(Heap)]" << std::endl; + std::cout << " allocating " << std::dec << size << " hex=" << (void*) size << " bytes" << std::endl; + } + DevPtr result = GetAllocator(type)->Allocate(size); + if (type == SectionKind::kHeap) { + std::cout << " allocated at addr " << result.cast_to() << std::endl; + } + return result; } void MicroSession::FreeInSection(SectionKind type, DevPtr addr) { + if (type == SectionKind::kHeap) { + std::cout << "[MicroSession::FreeInSection]" << std::endl; + std::cout << " freeing alloc at addr " << addr.cast_to() << std::endl; + } return GetAllocator(type)->Free(addr); } @@ -489,6 +694,16 @@ PackedFunc MicroSession::GetFunction( return 
PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { MicroSession::ExitWithScope(); }); + // TODO add a `clear_batch_timer` func + } else if (name == "get_last_batch_time") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetLastBatchTime(); + }); + // TODO remove this func + } else if (name == "get_last_batch_cycles") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetLastBatchCycles(); + }); } else { return PackedFunc(); } @@ -518,8 +733,9 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") size_t stack_size = args[18]; size_t word_size = args[19]; bool thumb_mode = args[20]; - const std::string& server_addr = args[21]; - int port = args[22]; + bool use_device_timer = args[21]; + const std::string& server_addr = args[22]; + int port = args[23]; ObjectPtr session = make_object( comms_method, binary_path, @@ -542,6 +758,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") stack_size, word_size, thumb_mode, + use_device_timer, server_addr, port); *rv = Module(session); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 9e844e8b2140..00e5bcdd7e01 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -52,6 +52,8 @@ namespace tvm { namespace runtime { +struct DevTask; + /*! * \brief session for facilitating micro device interaction */ @@ -66,6 +68,9 @@ class MicroSession : public ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + // todo having this decoupled from the value in utvm_runtime.c gives me stress dreams + static const size_t kTaskQueueCapacity = 20; + /*! * \return The type key of the executor. 
*/ @@ -121,6 +126,7 @@ class MicroSession : public ModuleNode { size_t stack_size, size_t word_size, bool thumb_mode, + bool use_device_timer, const std::string& server_addr, int port); @@ -137,7 +143,19 @@ class MicroSession : public ModuleNode { * \param args args to the packed function * \return elapsed time during function execution on the device */ - double PushToExecQueue(DevPtr func, const TVMArgs& args); + double PushToTaskQueue(DevPtr func, const TVMArgs& args); + + /*! + * \brief serialize runtime metadata to the device for enqueued tasks and execute + * \return elapsed time during function execution on the device + */ + void FlushTaskQueue(); + + /*! + * \brief TODO + */ + template + void FlushTaskQueuePriv(); /*! * \brief loads binary onto device @@ -196,6 +214,18 @@ class MicroSession : public ModuleNode { return low_level_device_; } + const double GetLastBatchTime() { + double result = last_batch_time_; + last_batch_time_ = 0.0; + return result; + } + + const double GetLastBatchCycles() { + double result = last_batch_cycles_; + last_batch_cycles_ = 0.0; + return result; + } + private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -204,6 +234,8 @@ class MicroSession : public ModuleNode { /*! \brief array of memory allocators for each on-device section */ std::shared_ptr section_allocators_[static_cast(SectionKind::kNumKinds)]; + /*! \brief total number of bytes of usable device memory for this session */ + size_t memory_size_; /*! \brief number of bytes in a word on the target device */ size_t word_size_; /*! \brief whether the target device requires a thumb-mode bit on function addresses @@ -213,8 +245,20 @@ class MicroSession : public ModuleNode { * results in more compact binaries. */ bool thumb_mode_; + /*! \brief TODO */ + bool use_device_timer_; /*! \brief symbol map for the device runtime */ SymbolMap runtime_symbol_map_; + /*! 
\brief TODO */ + std::vector task_queue_; + // TODO(weberlo): we don't even need an allocator mechanism for the args + // section. there's only ever one allocation. + /*! \brief TODO fukn hack */ + TargetDataLayoutEncoder batch_args_encoder_; + /*! \brief TODO fukn hack */ + double last_batch_time_; + /*! \brief TODO fukn hack */ + double last_batch_cycles_; /*! * \brief patches a function pointer in this module to an implementation @@ -237,7 +281,7 @@ class MicroSession : public ModuleNode { * \return device address of the allocated `DLTensor` */ template - DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr); + TargetPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr); /*! * \brief checks and logs if there was an error during the device's most recent execution @@ -302,7 +346,11 @@ struct TVMArray32 { byte_offset(byte_offset.val32), pad2(0) { } - /*! \brief opaque pointer to the allocated data */ + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ uint32_t data; /*! \brief The device context of the tensor */ DLContext ctx; @@ -345,8 +393,11 @@ struct TVMArray64 { shape(shape.val64), strides(strides.val64), byte_offset(byte_offset.val64) { } - - /*! \brief opaque pointer to the allocated data */ + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ uint64_t data; /*! \brief The device context of the tensor */ DLContext ctx; @@ -367,8 +418,26 @@ struct TVMArray64 { uint64_t byte_offset; }; +/*! \brief MicroTVM task to store in task queue before specializing to word size */ +struct DevTask { + /*! \brief Pointer to function to call for this task */ + DevVal func; + /*! \brief Array of argument values */ + DevVal arg_values; + /*! 
\brief Array of type codes for each argument value */ + DevVal arg_type_codes; + /*! \brief Number of arguments */ + int32_t num_args; +}; + /*! \brief MicroTVM task for serialization to 32-bit devices */ typedef struct StructUTVMTask32 { + StructUTVMTask32(DevTask task) + : func(task.func.val32), + arg_values(task.arg_values.val32), + arg_type_codes(task.arg_type_codes.val32), + num_args(task.num_args) { } + /*! \brief Pointer to function to call for this task */ uint32_t func; /*! \brief Array of argument values */ @@ -377,10 +446,16 @@ typedef struct StructUTVMTask32 { uint32_t arg_type_codes; /*! \brief Number of arguments */ int32_t num_args; -} UTVMTask32; +} StructUTVMTask32; /*! \brief MicroTVM task for serialization to 64-bit devices */ typedef struct StructUTVMTask64 { + StructUTVMTask64(DevTask task) + : func(task.func.val64), + arg_values(task.arg_values.val64), + arg_type_codes(task.arg_type_codes.val64), + num_args(task.num_args) { } + /*! \brief Pointer to function to call for this task */ uint64_t func; /*! \brief Array of argument values */ @@ -389,7 +464,7 @@ typedef struct StructUTVMTask64 { uint64_t arg_type_codes; /*! 
\brief Number of arguments */ int32_t num_args; -} UTVMTask64; +} StructUTVMTask64; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index e0275165e774..0abd269eedd4 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -96,9 +96,8 @@ class TargetDataLayoutEncoder { * \brief constructor * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size) - : buf_(std::vector()), curr_offset_(0), word_size_(word_size) { - start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_)); + explicit TargetDataLayoutEncoder(size_t capacity, size_t word_size) + : buf_(std::vector()), curr_offset_(0), capacity_(capacity), word_size_(word_size) { } /*! @@ -113,9 +112,15 @@ class TargetDataLayoutEncoder { if (curr_offset_ + size > buf_.size()) { buf_.resize(curr_offset_ + size); } + CHECK(buf_.size() < capacity_) << "out of space in data encoder"; size_t slot_start_offset = curr_offset_; curr_offset_ += size; - return Slot(this, slot_start_offset, size, start_addr_ + slot_start_offset); + return Slot(this, slot_start_offset, size, start_addr() + slot_start_offset); + } + + void Clear() { + buf_.clear(); + curr_offset_ = 0; } /*! @@ -130,10 +135,23 @@ class TargetDataLayoutEncoder { * \brief returns current size of the encoder's buffer * \return buffer size */ - size_t buf_size() { + size_t buf_size() const { return buf_.size(); } + /*! + * \brief TODO + */ + DevPtr start_addr() const { + CHECK_NE(start_addr_.value().val64, 0) << "start addr uninitialized"; + return start_addr_; + } + + void set_start_addr(DevPtr start_addr) { + CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty"; + start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_)); + } + private: /*! 
\brief in-memory backing buffer */ std::vector buf_; @@ -141,6 +159,8 @@ class TargetDataLayoutEncoder { size_t curr_offset_; /*! \brief start address of the encoder in device memory */ DevPtr start_addr_; + /*! \brief TODO */ + size_t capacity_; /*! \brief number of bytes in a word on the target device */ size_t word_size_; }; @@ -158,6 +178,7 @@ TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, template TargetDataLayoutEncoder::Slot::~Slot() { + // TODO this can mask the exception thrown by slot allocation... even though that doesn't make sense. CHECK(curr_offset_ == size_) << "unwritten space in slot"; } diff --git a/src/runtime/micro/tcl_socket.cc b/src/runtime/micro/tcl_socket.cc index 64dfbf218388..d0bb6007f815 100644 --- a/src/runtime/micro/tcl_socket.cc +++ b/src/runtime/micro/tcl_socket.cc @@ -42,9 +42,11 @@ void TclSocket::Connect(tvm::support::SockAddr addr) { } void TclSocket::SendCommand() { - const char terminate_token = kCommandTerminateToken; - cmd_builder_ << terminate_token; + //std::cout << "[TclSocket::SendCommand]" << std::endl; + //std::cout << " cmd: " << cmd_builder_.str() << std::endl; + cmd_builder_ << kCommandTerminateToken; std::string full_cmd = cmd_builder_.str(); + CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1) << "failed to send command"; cmd_builder_.str(std::string()); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 43ca630f9496..7556c4915b9c 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -36,12 +36,43 @@ #include #include "rpc_session.h" #include "../object_internal.h" -#include "../../support/ring_buffer.h" -#include "../../support/socket.h" +#include "../../common/ring_buffer.h" +#include "../../common/socket.h" +#include "../micro/micro_session.h" namespace tvm { namespace runtime { +std::string RPCCodeToString(RPCCode code) { + switch (code) { + case RPCCode::kNone: return "None"; + case RPCCode::kCallFunc: return 
"CallFunc"; + case RPCCode::kReturn: return "Return"; + case RPCCode::kException: return "Exception"; + case RPCCode::kShutdown: return "Shutdown"; + case RPCCode::kCopyFromRemote: return "CopyFromRemote"; + case RPCCode::kCopyToRemote: return "CopyToRemote"; + case RPCCode::kCopyAck: return "CopyAck"; + case RPCCode::kSystemFuncStart: return "SystemFuncStart"; + case RPCCode::kGetGlobalFunc: return "GetGlobalFunc"; + case RPCCode::kGetTimeEvaluator: return "GetTimeEvaluator"; + case RPCCode::kFreeFunc: return "FreeFunc"; + case RPCCode::kDevSetDevice: return "DevSetDevice"; + case RPCCode::kDevGetAttr: return "DevGetAttr"; + case RPCCode::kDevAllocData: return "DevAllocData"; + case RPCCode::kDevFreeData: return "DevFreeData"; + case RPCCode::kDevStreamSync: return "DevStreamSync"; + case RPCCode::kCopyAmongRemote: return "CopyAmongRemote"; + case RPCCode::kModuleLoad: return "ModuleLoad"; + case RPCCode::kModuleImport: return "ModuleImport"; + case RPCCode::kModuleFree: return "ModuleFree"; + case RPCCode::kModuleGetFunc: return "ModuleGetFunc"; + case RPCCode::kModuleGetSource: return "ModuleGetSource"; + case RPCCode::kNDArrayFree: return "NDArrayFree"; + default: CHECK(false) << "invalid RPC code"; + } +} + // Temp buffer for data array struct RPCByteArrayBuffer { TVMByteArray arr; @@ -898,6 +929,12 @@ void RPCSession::Init() { &reader_, &writer_, table_index_, name_, &remote_key_); // Quick function to call remote. 
call_remote_ = PackedFunc([this](TVMArgs args, TVMRetValue* rv) { + std::cout << "[RPCSession::call_remote_]" << std::endl; + if (args.type_codes[0] == kTVMContext) { + const TVMContext ctx = args[0]; + std::cout << " ctx.device_type: " << ctx.device_type << std::endl; + std::cout << " ctx.device_id: " << ctx.device_id << std::endl; + } handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true); RPCCode code = HandleUntilReturnEvent(rv, true, nullptr); CHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); @@ -1080,7 +1117,10 @@ void RPCDevSetDevice(TVMArgs args, TVMRetValue *rv) { } void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) { + std::cout << "[RPCDevGetAttr]" << std::endl; TVMContext ctx = args[0]; + std::cout << " ctx.device_type: " << ctx.device_type << std::endl; + std::cout << " ctx.device_id: " << ctx.device_id << std::endl; DeviceAttrKind kind = static_cast(args[1].operator int()); if (kind == kExist) { DeviceAPI* api = DeviceAPI::Get(ctx, true); @@ -1096,7 +1136,10 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) { } void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) { + std::cout << "[RPCDevAllocData]" << std::endl; TVMContext ctx = args[0]; + std::cout << " ctx.device_type: " << ctx.device_type << std::endl; + std::cout << " ctx.device_id: " << ctx.device_id << std::endl; uint64_t nbytes = args[1]; uint64_t alignment = args[2]; DLDataType type_hint = args[3]; @@ -1118,13 +1161,18 @@ void RPCDevStreamSync(TVMArgs args, TVMRetValue *rv) { } void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) { + std::cout << "[RPCCopyAmongRemote]" << std::endl; void* from = args[0]; uint64_t from_offset = args[1]; void* to = args[2]; uint64_t to_offset = args[3]; uint64_t size = args[4]; TVMContext ctx_from = args[5]; + std::cout << " ctx_from.device_type: " << ctx_from.device_type << std::endl; + std::cout << " ctx_from.device_id: " << ctx_from.device_type << std::endl; TVMContext ctx_to = args[6]; + std::cout << " 
ctx_to.device_type: " << ctx_to.device_type << std::endl; + std::cout << " ctx_to.device_id: " << ctx_to.device_type << std::endl; DLDataType type_hint = args[7]; TVMStreamHandle stream = args[8]; TVMContext ctx = ctx_from; @@ -1135,10 +1183,12 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) { ctx_to.device_type == ctx_from.device_type) << "Can not copy across different ctx types directly"; } + std::cout << " before CopyDataFromTo" << std::endl; DeviceAPI::Get(ctx)->CopyDataFromTo( from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, stream); + std::cout << " after CopyDataFromTo" << std::endl; } void RPCModuleLoad(TVMArgs args, TVMRetValue *rv) { @@ -1200,6 +1250,7 @@ void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { } void RPCSession::EventHandler::HandlePackedCall() { + std::cout << "[RPCSession::EventHandler::HandlePackedCall]" << std::endl; CHECK_EQ(pending_request_bytes_, 0U); if (code_ == RPCCode::kReturn) { state_ = kReturnReceived; return; @@ -1208,6 +1259,7 @@ void RPCSession::EventHandler::HandlePackedCall() { state_ = kRecvCode; this->RequestBytes(sizeof(RPCCode)); // Event handler sit at clean state at this point. 
+ std::cout << " RPC code is " << static_cast(code_) << "(" << RPCCodeToString(code_) << ")" << std::endl; switch (code_) { case RPCCode::kCallFunc: { PackedFunc* pf = reinterpret_cast(call_handle_); @@ -1222,6 +1274,7 @@ void RPCSession::EventHandler::HandlePackedCall() { std::ostringstream os; os << "Except caught from RPC call: " << arg_buf_->value[0].v_str; arg_buf_.reset(); + std::cout << os.str() << std::endl; throw dmlc::Error(os.str()); break; } @@ -1250,21 +1303,24 @@ PackedFunc MicroTimeEvaluator( PackedFunc pf, TVMContext ctx, int number, - int repeat) { - auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { + int repeat, + int min_repeat_ms) { + auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { TVMRetValue temp; std::ostringstream os; - // skip first time call, to activate lazy compilation components. - pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + for (int i = 0; i < repeat; ++i) { - double speed = 0.0; - for (int j = 0; j < number; ++j) { + // start timing + CHECK(number < MicroSession::kTaskQueueCapacity) + << "`number` must be less than uTVM task queue capacity"; + for (int i = 0; i < number; ++i) { pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - speed += (temp.operator double()) / number; } - os.write(reinterpret_cast(&speed), sizeof(speed)); + ObjectPtr session = MicroSession::Current(); + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + double time_per_batch = session->GetLastBatchTime() / number; + std::cout << "LAST AVERAGE BATCH TIME WAS " << time_per_batch << std::endl; + os.write(reinterpret_cast(&time_per_batch), sizeof(time_per_batch)); } std::string blob = os.str(); TVMByteArray arr; @@ -1281,9 +1337,12 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, int number, int repeat, int min_repeat_ms) { + std::cout << "[WrapTimeEvaluator]" << std::endl; if (static_cast(ctx.device_type) == 
static_cast(kDLMicroDev)) { - return MicroTimeEvaluator(pf, ctx, number, repeat); + std::cout << " USING MICRO TIME EVAL" << std::endl; + return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms); } + std::cout << " USING NORMAL TIME EVAL" << std::endl; auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { TVMRetValue temp; diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index db63be4be74d..1bb75357e030 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -63,31 +63,31 @@ struct RPCArgBuffer; /*! \brief The RPC code */ enum class RPCCode : int { - kNone, - kCallFunc, - kReturn, - kException, - kShutdown, - kCopyFromRemote, - kCopyToRemote, - kCopyAck, + kNone = 0, + kCallFunc = 1, + kReturn = 2, + kException = 3, + kShutdown = 4, + kCopyFromRemote = 5, + kCopyToRemote = 6, + kCopyAck = 7, // The following are code that can send over CallRemote - kSystemFuncStart, - kGetGlobalFunc, - kGetTimeEvaluator, - kFreeFunc, - kDevSetDevice, - kDevGetAttr, - kDevAllocData, - kDevFreeData, - kDevStreamSync, - kCopyAmongRemote, - kModuleLoad, - kModuleImport, - kModuleFree, - kModuleGetFunc, - kModuleGetSource, - kNDArrayFree + kSystemFuncStart = 8, + kGetGlobalFunc = 9, + kGetTimeEvaluator = 10, + kFreeFunc = 11, + kDevSetDevice = 12, + kDevGetAttr = 13, + kDevAllocData = 14, + kDevFreeData = 15, + kDevStreamSync = 16, + kCopyAmongRemote = 17, + kModuleLoad = 18, + kModuleImport = 19, + kModuleFree = 20, + kModuleGetFunc = 21, + kModuleGetSource = 22, + kNDArrayFree = 23 }; /*! 
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 6461908f1c4a..0c4404f515c0 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -840,6 +840,12 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) { const VarNode* v = op->node.as(); CHECK(v); volatile_buf_.insert(v); + } else if (op->attr_key == ir::attr::pragma_import_c) { + const StringImm* value = op->value.as(); + CHECK(value != nullptr); + decl_stream << value->value; + //this->HandleImport(value->value); + //this->VisitStmt(op->body); } this->PrintStmt(op->body); } diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index cbdec6201742..c356ac816957 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -29,6 +29,7 @@ namespace tvm { namespace codegen { +// TODO rename to CodeGenCMicro? CodeGenCHost::CodeGenCHost() { module_name_ = GetUniqueName("__tvm_module_ctx"); } @@ -254,7 +255,7 @@ runtime::Module BuildCHost(IRModule mod) { TVM_REGISTER_GLOBAL("target.build.c") .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildCHost(args[0]); - }); + *rv = BuildCHost(args[0]); + }); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index 4f9a0a74511f..f854b7abf68c 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -36,6 +36,8 @@ class CodeGenCHost final : public CodeGenC { public: CodeGenCHost(); void Init(bool output_ssa, bool emit_asserts); + void AddFunction(LoweredFunc f); + std::string Finish(); void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) void PrintFuncPrefix() final; // NOLINT(*) diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 28fdb11c3de4..1ab324e50228 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -25,8 +25,25 
@@ from tvm.micro import create_micro_mod from tvm.relay.testing import resnet -# Use the host emulated micro device. -DEV_CONFIG = micro.device.host.default_config() +# # Use the host emulated micro device. +# DEV_CONFIG_A = micro.device.host.generate_config() +# DEV_CONFIG_B = micro.device.host.generate_config() +# TARGET = 'c -device=micro_dev' + +# # TODO why do spike examples have memory that starts at 0x10000000, but you +# # should set the base addr as 0x10010000? should somehow help the user to be +# # aware of that. +# # are there always 0x10000 bytes reserved at the beginning of the address space? +# BASE_ADDR = 0x10010000 + +# AVAILABLE_MEM = 0x200000 +# DEV_CONFIG_A = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6666) +# DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667) +# TARGET = 'c -device=micro_dev' + +DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666) +DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667) +TARGET = 'c -device=micro_dev' def relay_micro_build(func, dev_config, params=None): """Create a graph runtime module with a micro device context from a Relay function. 
@@ -47,22 +64,41 @@ def relay_micro_build(func, dev_config, params=None): mod : tvm.runtime.Module graph runtime module for the target device """ - with tvm.target.build_config(disable_vectorize=True): - graph, c_mod, params = relay.build(func, target="c", params=params) - micro_mod = create_micro_mod(c_mod, dev_config) + disable_vectorize = tvm.build_config(disable_vectorize=True) + disable_fusion = relay.build_config(disabled_pass={'FuseOps'}) + with disable_vectorize, disable_fusion: + graph, c_mod, params = relay.build(func, target=TARGET, params=params) + print(c_mod.get_source()) + micro_mod = micro.create_micro_mod(c_mod, dev_config) ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_mod, ctx) mod.set_input(**params) return mod +GDB_INIT_TEMPLATE = """ +layout asm +target remote localhost:{gdb_port} +set $pc = UTVMInit +break UTVMDone +""" + +def reset_gdbinit(): + if 'server_port' not in DEV_CONFIG_A: + return + gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR'] + with open(f'{gdb_init_dir}/.gdbinit', 'w') as f: + gdb_port = DEV_CONFIG_A['server_port'] - 3333 + f.write(GDB_INIT_TEMPLATE.format(gdb_port=gdb_port)) + + def test_alloc(): """Test tensor allocation on the device.""" if not tvm.runtime.enabled("micro_dev"): return shape = (1024,) dtype = "float32" - with micro.Session(DEV_CONFIG): + with micro.Session(DEV_CONFIG_A): ctx = tvm.micro_dev(0) np_tensor = np.random.uniform(size=shape).astype(dtype) micro_tensor = tvm.nd.array(np_tensor, ctx) @@ -76,6 +112,8 @@ def test_add(): shape = (1024,) dtype = "float32" + reset_gdbinit() + # Construct TVM expression. 
tvm_shape = tvm.runtime.convert(shape) A = te.placeholder(tvm_shape, name="A", dtype=dtype) @@ -86,14 +124,24 @@ def test_add(): func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - with micro.Session(DEV_CONFIG): - micro_mod = create_micro_mod(c_mod, DEV_CONFIG) + with micro.Session(DEV_CONFIG_A) as sess: + micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) - a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + + a_np = np.random.uniform(size=shape).astype(dtype) + a = tvm.nd.array(a_np, ctx) + b_np = np.random.uniform(size=shape).astype(dtype) + b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) micro_func(a, b, c) + + # ensure inputs weren't corrupted + tvm.testing.assert_allclose( + a.asnumpy(), a_np) + tvm.testing.assert_allclose( + b.asnumpy(), b_np) + # ensure output is correct tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -105,6 +153,8 @@ def test_workspace_add(): shape = (1024,) dtype = "float32" + reset_gdbinit() + # Construct TVM expression. 
tvm_shape = tvm.runtime.convert(shape) A = te.placeholder(tvm_shape, name="A", dtype=dtype) @@ -116,14 +166,19 @@ def test_workspace_add(): func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) - with micro.Session(DEV_CONFIG): - micro_mod = create_micro_mod(c_mod, DEV_CONFIG) + with micro.Session(DEV_CONFIG_A) as sess: + micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) - a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + a_np = np.random.uniform(size=shape).astype(dtype) + a = tvm.nd.array(a_np, ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) micro_func(a, c) + # ensure input wasn't corrupted + tvm.testing.assert_allclose( + a.asnumpy(), a_np) + # ensure output is correct tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 2.0) @@ -141,17 +196,68 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - with micro.Session(DEV_CONFIG): - mod = relay_micro_build(func, DEV_CONFIG) + with micro.Session(DEV_CONFIG_A): + mod = relay_micro_build(func, DEV_CONFIG_A) x_in = np.random.uniform(size=shape[0]).astype(dtype) mod.run(x=x_in) result = mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + mod.get_input(0).asnumpy(), x_in) tvm.testing.assert_allclose( result, x_in * x_in + 1.0) +def test_conv2d(): + if not tvm.module.enabled("micro_dev"): + return + + from tvm.relay import create_executor + from tvm.relay import transform + + dshape = (1, 4, 16, 16) + dtype = 'int8' + func_name = 'fused_nn_conv2d' + + reset_gdbinit() + + # Construct Relay program. 
+ x = relay.var("x", shape=dshape, dtype=dtype) + conv_expr = relay.nn.conv2d( + x, relay.var("w"), + kernel_size=(3, 3), + padding=(1, 1), + channels=4) + func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr) + mod = relay.Module.from_expr(func) + mod = transform.InferType()(mod) + + x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape)) + w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape)) + out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape)) + + with tvm.build_config(disable_vectorize=True): + graph, c_mod, params = relay.build(mod, target="c") + + with micro.Session(DEV_CONFIG_A): + micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A) + micro_func = micro_mod[func_name] + ctx = tvm.micro_dev(0) + + x_data = tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx) + w_data = tvm.nd.array(np.random.uniform(size=w_shape).astype(dtype), ctx) + result = tvm.nd.array(np.zeros(shape=out_shape, dtype=dtype), ctx) + micro_func(x_data, w_data, result) + + out_data = np.zeros(out_shape, dtype=dtype) + params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() } + intrp = create_executor('debug') + expected_result = intrp.evaluate(mod['main'])(x_data, w_data).data + + tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy()) + + def test_multiple_modules(): """Test loading multiple modules on the device simultaneously.""" if not tvm.runtime.enabled("micro_dev"): @@ -168,9 +274,9 @@ def test_multiple_modules(): ret = relay.subtract(x, relay.const(1.0)) sub_const_func = relay.Function([x], ret) - with micro.Session(DEV_CONFIG): - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) - sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG) + with micro.Session(DEV_CONFIG_A): + add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A) + sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG_A) x_in = 
np.random.uniform(size=shape[0]).astype(dtype) add_const_mod.run(x=x_in) @@ -196,8 +302,8 @@ def test_interleave_sessions(): ret = relay.add(x, relay.const(1.0)) add_const_func = relay.Function([x], ret) - sess_a = micro.Session(DEV_CONFIG) - sess_b = micro.Session(DEV_CONFIG) + sess_a = micro.Session(DEV_CONFIG_A) + sess_b = micro.Session(DEV_CONFIG_B) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) @@ -205,13 +311,13 @@ def test_interleave_sessions(): np_tensor_b = np.random.uniform(size=shape).astype(dtype) micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) with sess_a: - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) + add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A) add_const_mod.run(x=micro_tensor_a) add_result = add_const_mod.get_output(0).asnumpy() tvm.testing.assert_allclose( add_result, np_tensor_a + 1.0) with sess_b: - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) + add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_B) add_const_mod.run(x=micro_tensor_b) add_result = add_const_mod.get_output(0).asnumpy() tvm.testing.assert_allclose( @@ -230,15 +336,15 @@ def test_nested_sessions(): ret = relay.add(x, relay.const(1.0)) add_const_func = relay.Function([x], ret) - sess_a = micro.Session(DEV_CONFIG) - sess_b = micro.Session(DEV_CONFIG) + sess_a = micro.Session(DEV_CONFIG_A) + sess_b = micro.Session(DEV_CONFIG_B) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) with sess_b: np_tensor_b = np.random.uniform(size=shape).astype(dtype) micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) + add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A) add_const_mod.run(x=micro_tensor_a) add_result = add_const_mod.get_output(0).asnumpy() 
tvm.testing.assert_allclose( @@ -257,12 +363,12 @@ def test_inactive_session_use(): ret = relay.add(x, relay.const(1.0)) add_const_func = relay.Function([x], ret) - sess_a = micro.Session(DEV_CONFIG) - sess_b = micro.Session(DEV_CONFIG) + sess_a = micro.Session(DEV_CONFIG_A) + sess_b = micro.Session(DEV_CONFIG_B) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) + add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A) with sess_b: # These objects belong to `sess_a`. @@ -272,12 +378,42 @@ def test_inactive_session_use(): add_result, np_tensor_a + 1.0) +# TODO add workspace alloc/free stress test + if __name__ == "__main__": test_alloc() + print() + print('finished alloc test') + input('[press enter to continue]') test_add() + print() + print('finished add test') + input('[press enter to continue]') test_workspace_add() + print() + print('finished workspace add test') + input('[press enter to continue]') test_graph_runtime() + print() + print('finished graph runtime test') + input('[press enter to continue]') + test_conv2d() + print() + print('finished conv2d test') + input('[press enter to continue]') test_multiple_modules() + print() + print('finished multiple modules test') + input('[press enter to continue]') test_interleave_sessions() + print() + print('finished interleaved sessions test') + input('[press enter to continue]') test_nested_sessions() + print() + print('finished nested sessions test') + input('[press enter to continue]') test_inactive_session_use() + print() + print('finished use inactive session test') + input('[press enter to continue]') diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index 3bb9dc73e2db..9f9785e834d7 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ 
-74,7 +74,8 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, [n, co, oh, ow, ci, kh, kw, vc, vh, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') - cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') + #cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') + cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll') # fallback support if cfg.is_fallback: @@ -158,7 +159,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, axis_lens=[cfg['tile_oh'].size[-1], cfg['tile_ow'].size[-1], cfg['tile_co'].size[-1]], - max_unroll=16, + max_unroll=None, cfg=cfg) # schedule fusion @@ -173,18 +174,18 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, axis_lens=[cfg['tile_oh'].size[-1], cfg['tile_ow'].size[-1], cfg['tile_co'].size[-1]], - max_unroll=16, + max_unroll=None, cfg=cfg) s[conv].compute_at(s[last], ow) - # mark parallel - s[last].parallel(co) + ## mark parallel + #s[last].parallel(co) if data_vec.op.name == 'data_vec_undilated': _, h, _, _, _, _, _, _ = s[data_vec].op.axis else: _, h, _, _, _, _ = s[data_vec].op.axis - s[data_vec].parallel(h) + #s[data_vec].parallel(h) if kernel_vec.op.name == 'kernel_vec': co, _, _, _, _ = s[kernel_vec].op.axis @@ -193,10 +194,12 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, # this part to make tuning records correct s[kernel_vec].pragma(co, 'debug_skip_region') else: - s[kernel_vec].parallel(co) + #s[kernel_vec].parallel(co) + pass elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose - co, _, _, _, _ = s[kernel_vec].op.axis - s[kernel_vec].parallel(co) + #co, _, _, _, _ = s[kernel_vec].op.axis + #s[kernel_vec].parallel(co) + pass return s diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 966520088bc7..8cf3fd5f11bf 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ 
b/topi/python/topi/arm_cpu/injective.py @@ -20,6 +20,7 @@ from tvm import te from ..util import is_empty_shape +@generic.schedule_injective_from_existing.register(["arm_cpu", "micro_dev"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -45,6 +46,7 @@ def schedule_injective_from_existing(sch, out): sch[out].parallel(sch[out].op.axis[0]) return sch +@generic.schedule_injective.register(["arm_cpu", "micro_dev"]) def schedule_injective(outs): """ARM CPU schedule for injective op. @@ -72,6 +74,7 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s +@generic.schedule_concatenate.register(["arm_cpu", "micro_dev"]) def schedule_concatenate(outs): """Schedule for concatenate op. diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py index d8713110056a..7c021785544c 100644 --- a/topi/python/topi/testing/conv2d_nhwc_python.py +++ b/topi/python/topi/testing/conv2d_nhwc_python.py @@ -35,10 +35,8 @@ def _conv2d_nhwc_python(a_np, w_np, stride, padding): stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] - padding : int or str or a list/tuple of 2 or 4 ints - Padding size, or ['VALID', 'SAME'], or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 2 ints + padding : int or str or a list/tuple of two ints + Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width] Returns ------- From 91e132d56503b3f93a5343656274048e7e61e37c Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 7 Apr 2020 19:13:44 -0700 Subject: [PATCH 02/11] Cleanup and sync micro tvm prototype. 
--- 3rdparty/dmlc-core | 2 +- Makefile | 5 +- include/tvm/tir/stmt.h | 2 + python/tvm/autotvm/measure/measure_methods.py | 13 +- python/tvm/autotvm/task/relay_integration.py | 1 + python/tvm/autotvm/task/space.py | 6 +- python/tvm/autotvm/tuner/callback.py | 2 +- python/tvm/autotvm/tuner/ga_tuner.py | 8 +- python/tvm/autotvm/tuner/model_based_tuner.py | 1 - python/tvm/autotvm/tuner/tuner.py | 12 +- python/tvm/contrib/binutil.py | 107 ++------ python/tvm/exec/rpc_server.py | 7 +- python/tvm/micro/base.py | 47 ++-- python/tvm/micro/device/__init__.py | 3 +- python/tvm/micro/device/arm/stm32f746xx.py | 21 +- python/tvm/micro/device/base.py | 52 ++-- python/tvm/micro/device/host.py | 17 +- python/tvm/micro/device/riscv_spike.py | 1 - python/tvm/relay/_parser.py | 6 - python/tvm/relay/build_module.py | 22 +- python/tvm/relay/frontend/keras.py | 2 +- python/tvm/relay/op/strategy/arm_cpu.py | 36 ++- python/tvm/rpc/client.py | 10 - python/tvm/rpc/server.py | 2 - python/tvm/rpc/tracker.py | 3 - python/tvm/runtime/module.py | 1 - python/tvm/runtime/ndarray.py | 2 +- python/tvm/target/arm_isa.py | 34 +++ src/driver/driver_api.cc | 2 +- src/ir/error.cc | 2 - .../micro/device/arm/stm32f746xx/utvm_init.s | 6 - .../micro/device/arm/stm32f746xx/utvm_timer.c | 107 +++----- src/runtime/micro/device/host/utvm_init.c | 1 - src/runtime/micro/device/host/utvm_timer.c | 18 +- .../micro/device/riscv_spike/utvm_init.s | 6 - .../micro/device/riscv_spike/utvm_timer.c | 1 - .../host_driven/utvm_device_dylib_redirect.c | 9 +- src/runtime/micro/host_driven/utvm_runtime.c | 149 +++++++---- src/runtime/micro/host_driven/utvm_runtime.h | 1 - src/runtime/micro/host_low_level_device.cc | 8 +- src/runtime/micro/low_level_device.h | 6 +- src/runtime/micro/micro_common.cc | 10 +- src/runtime/micro/micro_common.h | 85 +++++-- src/runtime/micro/micro_device_api.cc | 30 +-- src/runtime/micro/micro_module.cc | 12 +- src/runtime/micro/micro_section_allocator.h | 28 ++- src/runtime/micro/micro_session.cc | 
237 +++++++----------- src/runtime/micro/micro_session.h | 47 ++-- src/runtime/micro/openocd_low_level_device.cc | 12 +- .../micro/target_data_layout_encoder.h | 29 ++- src/runtime/micro/tcl_socket.cc | 5 +- src/runtime/rpc/rpc_session.cc | 66 +---- src/runtime/rpc/rpc_session.h | 48 ++-- src/target/source/codegen_c.cc | 6 +- src/target/source/codegen_c_host.cc | 25 +- src/target/source/codegen_c_host.h | 9 +- src/target/target.cc | 2 +- tests/python/unittest/test_runtime_micro.py | 66 ++--- topi/python/topi/arm_cpu/__init__.py | 1 + topi/python/topi/arm_cpu/conv2d.py | 13 + .../topi/arm_cpu/conv2d_spatial_pack.py | 19 +- .../python/topi/arm_cpu/cortex_m7/__init__.py | 20 ++ .../topi/arm_cpu/cortex_m7/conv2d/__init__.py | 19 ++ .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 177 +++++++++++++ .../arm_cpu/cortex_m7/conv2d/direct_simd.py | 163 ++++++++++++ .../cortex_m7/micro_kernel/__init__.py | 0 .../arm_cpu/cortex_m7/micro_kernel/gemm.py | 221 ++++++++++++++++ topi/python/topi/arm_cpu/injective.py | 3 - topi/python/topi/generic/default.py | 2 +- 69 files changed, 1277 insertions(+), 819 deletions(-) create mode 100644 python/tvm/target/arm_isa.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 981b1c32f916..808f485387f9 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 981b1c32f91668e669ee376856f92f36cfd2a351 +Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28 diff --git a/Makefile b/Makefile index 757b3300f7d5..64a127346e53 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,10 
@@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc cpplint: python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include; - python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \ + # Note: exclude src/runtime/micro/host_driven because it contains C99 files. + python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \ + --exclude_path=src/runtime/micro/host_driven \ + include src \ examples/extension/src examples/graph_executor/src pylint: diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index aed8b5c77ae5..0d3cf42d8190 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope"; constexpr const char* reduce_scope = "reduce_scope"; /*! \brief Mark region is guarded by the pragma extension */ constexpr const char* pragma_scope_prefix = "pragma_"; +/*! \brief Import C source or file into the final code gen module */ +constexpr const char* pragma_import_c = "pragma_import_c"; /*! \brief Import llvm source or file into the final code gen module */ constexpr const char* pragma_import_llvm = "pragma_import_llvm"; /*! 
\brief Try to modify the AST to support Tensor Core */ diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 00ecd2e98a06..185ed7d05019 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -245,11 +245,12 @@ def get_build_kwargs(self): if 'cuda' in self.task.target.keys: kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.')) + if self.task.target.device_name == 'micro_dev': + kwargs.setdefault('build_option', {})['disable_vectorize'] = True return kwargs def run(self, measure_inputs, build_results): - print('[RPCRunner.run]') results = [] remote_args = (self.key, self.host, self.port, self.priority, self.timeout) @@ -274,10 +275,9 @@ def run(self, measure_inputs, build_results): if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time())) - #raise Exception(f'encountered exception during measurement: {results}') - else: - print(f' got a result: {res}') - results.append(res) + raise Exception(f'encountered exception during measurement: {results}') + + results.append(res) return results @@ -511,8 +511,7 @@ def run_through_rpc(measure_input, build_result, msg = msg[:msg.index("Stack trace returned")] if "CUDA Source" in msg: msg = msg[:msg.index("CUDA Source")] - #costs = (RuntimeError(msg[:1024]),) - costs = (RuntimeError(msg),) + costs = (RuntimeError(msg[:1024]),) errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index de183db41e2c..f3edfb01dc07 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -48,6 +48,7 @@ def _lower(mod, grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(mod["main"]) return + # default case # 
Try graph codegen first to extract autotvm tasks. # If failed to compile, then fallback to use VM compiler. diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index f8de1fbf32d5..fbf474fc4df7 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -544,10 +544,8 @@ def apply(self, sch, op, axes, axis_lens=None, if ann == 'none': pass elif ann == 'unroll': - #if max_unroll and axis_lens[i] > max_unroll: - # cfg.raise_error("Too large factor for unrolling") - #if max_unroll and axis_lens[i] < max_unroll: - # cfg.raise_error("Too large factor for unrolling") + if max_unroll and axis_lens[i] > max_unroll: + cfg.raise_error("Too large factor for unrolling") sch[op].unroll(axes[i]) elif ann == 'vec': if vec_size and axis_lens[i] not in vec_size: diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index eede450eaeaf..f2e608088510 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -149,7 +149,7 @@ def _callback(tuner, inputs, results): if res.error_no == 0: flops = inp.task.flop / np.mean(res.costs) - if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + if logger.level > logging.DEBUG: # only print progress bar in non-debug mode ctx.cur_flops = flops ctx.best_flops = tuner.best_flops diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index a4c36bcd385e..da10f73d5a53 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -50,7 +50,11 @@ def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1): # space info self.space = task.config_space - self.dims = [len(x) for x in self.space.space_map.values()] + self.dim_keys = [] + self.dims = [] + for k, v in self.space.space_map.items(): + self.dim_keys.append(k) + self.dims.append(len(v)) self.visited = set([]) @@ -123,7 +127,7 @@ def update(self, inputs, results): if 
len(self.visited) < len(self.space): while knob2point(tmp_gene, self.dims) in self.visited: j = np.random.randint(len(self.dims)) - tmp_gene[j] = np.random.randint(self.dims[j]) + tmp_gene[j] = np.random.randint(self.dims[j]) # pylint: disable=invalid-sequence-index next_genes.append(tmp_gene) self.visited.add(knob2point(tmp_gene, self.dims)) else: diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py index 56fe5b4f3f72..432f7070c349 100644 --- a/python/tvm/autotvm/tuner/model_based_tuner.py +++ b/python/tvm/autotvm/tuner/model_based_tuner.py @@ -263,7 +263,6 @@ def update(self, inputs, results): # if we have enough new training samples if len(self.xs) >= self.plan_size * (self.train_ct + 1) \ and self.flops_max > 1e-6: - import pdb; pdb.set_trace() self.cost_model.fit(self.xs, self.ys, self.plan_size) if self.diversity_filter_ratio: candidate = self.model_optimizer.find_maximums( diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 52d745104e63..0d66d34ac316 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -150,13 +150,15 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr i + k + 1, si_prefix, format_si_prefix(flops, si_prefix), format_si_prefix(self.best_flops, si_prefix), res, config) + num_successes = 0 for result in results: if isinstance(result.costs[0], float): - i += 1 - else: - print('[Tuner.tune]') - print(' not counting failure towards trial count') - #i += len(results) + num_successes += 1 + if num_successes != len(results): + logger.debug('not counting %d failures towards trial count', + len(results) - num_successes) + i += num_successes + self.ttl = min(early_stopping + self.best_iter, n_trial) - i self.update(inputs, results) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 9d6d469ba7cd..2b18d645dbc0 100644 --- a/python/tvm/contrib/binutil.py +++ 
b/python/tvm/contrib/binutil.py @@ -72,80 +72,6 @@ }} """ -def run_cmd(cmd): - """Runs `cmd` in a subprocess and awaits its completion. - - Parameters - ---------- - cmd : List[str] - list of command-line arguments - - Returns - ------- - output : str - resulting stdout capture from the subprocess - """ - proc = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - (output, _) = proc.communicate() - output = output.decode('utf-8') - if proc.returncode != 0: - cmd_str = ' '.join(cmd) - msg = f'error while running command \"{cmd_str}\":\n{output}' - raise RuntimeError(msg) - return output - - -RELOCATION_LD_SCRIPT_TEMPLATE = """ -/* linker symbol for use in UTVMInit */ -_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; - -SECTIONS -{{ - . = 0x{text_start:x}; - . = ALIGN({word_size}); - .text : - {{ - . = ALIGN({word_size}); - KEEP(*(.text)) - KEEP(*(.text*)) - . = ALIGN({word_size}); - }} - - . = 0x{rodata_start:x}; - . = ALIGN({word_size}); - .rodata : - {{ - . = ALIGN({word_size}); - KEEP(*(.rodata)) - KEEP(*(.rodata*)) - . = ALIGN({word_size}); - }} - - . = 0x{data_start:x}; - . = ALIGN({word_size}); - .data : - {{ - . = ALIGN({word_size}); - KEEP(*(.data)) - KEEP(*(.data*)) - . = ALIGN({word_size}); - }} - - . = 0x{bss_start:x}; - . = ALIGN({word_size}); - .bss : - {{ - . = ALIGN({word_size}); - KEEP(*(.bss)) - KEEP(*(.bss*)) - . = ALIGN({word_size}); - }} -}} -""" - def run_cmd(cmd): """Runs `cmd` in a subprocess and awaits its completion. @@ -236,11 +162,11 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # padding for most cases, but symbols can be arbitrarily large, so this # isn't bulletproof. return section_size + 32 - # TODO remove this arbitrary addition once we figure out why section sizes - # are being undercalculated. - # maybe stop relying on `*size` to give us the size and instead read the - # section with `*objcopy` and count the bytes. 
- return section_size + 8 + + # NOTE: in the past, section_size has been wrong on x86. it may be + # inconsistent. TODO: maybe stop relying on `*size` to give us the size and + # instead read the section with `*objcopy` and count the bytes. + return section_size @tvm._ffi.register_func("tvm_callback_relocate_binary") @@ -315,17 +241,18 @@ def tvm_callback_relocate_binary( with open(rel_obj_path, 'rb') as f: rel_bin = bytearray(f.read()) - gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR'] - gdb_init_path = f'{gdb_init_dir}/.gdbinit' - with open(gdb_init_path, 'r') as f: - gdbinit_contents = f.read().split('\n') - new_contents = [] - for line in gdbinit_contents: - new_contents.append(line) - if line.startswith('target'): - new_contents.append(f'add-symbol-file {rel_obj_path}') - with open(gdb_init_path, 'w') as f: - f.write('\n'.join(new_contents)) + gdb_init_dir = os.environ.get('MICRO_GDB_INIT_DIR') + if gdb_init_dir is not None: + gdb_init_path = f'{gdb_init_dir}/.gdbinit' + with open(gdb_init_path, 'r') as f: + gdbinit_contents = f.read().split('\n') + new_contents = [] + for line in gdbinit_contents: + new_contents.append(line) + if line.startswith('target'): + new_contents.append(f'add-symbol-file {rel_obj_path}') + with open(gdb_init_path, 'w') as f: + f.write('\n'.join(new_contents)) return rel_bin diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index a3c43583f44d..dd275a3bad4d 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -122,9 +122,10 @@ def server_shutdown(): '--utvm-dev-config-args is specified.')) parser.add_argument('--utvm-dev-config-args', type=str, help=("Arguments to the device module's generate_config function. " - 'Must be a python literal parseable by literal_eval. If specified, the ' - "device configuration is generated using the device module's generate_config. " - "Can't be specified when --utvm-dev-config is specified.")) + 'Must be a python literal parseable by literal_eval. 
If specified, ' + "the device configuration is generated using the device module's " + "generate_config. Can't be specified when --utvm-dev-config is " + "specified.")) parser.add_argument('--utvm-dev-id', type=str, help=('Unique ID for the target device (if using MicroTVM). Should ' 'match the name of a module underneath tvm.micro.device).')) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 5a1b71846630..d2ec7d398c46 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -19,13 +19,13 @@ from __future__ import absolute_import import os +import re import sys from enum import Enum import tvm import tvm._ffi -import tvm from tvm.contrib import util as _util from tvm.contrib import cc as _cc @@ -86,7 +86,9 @@ def __init__(self, config): runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c') tmp_dir = _util.tempdir() runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj') - dev_funcs['create_micro_lib'](runtime_obj_path, runtime_src_path, LibType.RUNTIME) + options = ['-I{}'.format(get_micro_host_driven_dir())] + dev_funcs['create_micro_lib']( + runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options) comms_method = config['comms_method'] if comms_method == 'openocd': @@ -98,7 +100,8 @@ def __init__(self, config): else: raise RuntimeError(f'unknown communication method: f{self.comms_method}') - assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), 'not all sections have an assigned memory layout' + assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \ + 'not all sections have an assigned memory layout' self.module = _CreateSession( comms_method, runtime_obj_path, @@ -139,7 +142,7 @@ def _check_system(self): # TODO(weberlo): Add 32-bit support. # It's primarily the compilation pipeline that isn't compatible. 
if sys.maxsize <= 2**32: - raise RuntimeError('MicroTVM is currently only supported on 64-bit platforms') + raise RuntimeError('MicroTVM is currently only supported on 64-bit host platforms') def __enter__(self): self._enter() @@ -151,8 +154,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback): def _calc_max_workspace_usage(src): # TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size) - import re - alloc_re = re.compile(r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*') + alloc_re = re.compile( + r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*') free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*') max_usage = 0 alloc_map = {} @@ -171,7 +174,8 @@ def _calc_max_workspace_usage(src): return max_usage -def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, lib_include_paths=None): +def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, + lib_include_paths=None): """Produces a micro module from a given module. 
Parameters @@ -193,24 +197,24 @@ def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, li micro_mod : tvm.module.Module micro module for the target device """ - print('[create_micro_mod]') temp_dir = _util.tempdir() lib_obj_path = temp_dir.relpath('dev_lib.obj') # TODO use dev config to dispatch on the type of C codegen to run through # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV) c_mod.export_library( - lib_obj_path, - fcompile=cross_compiler( - dev_config, - LibType.OPERATOR, - lib_src_paths=lib_src_paths, - lib_headers=lib_headers, - lib_include_paths=lib_include_paths)) - micro_mod = tvm.module.load(lib_obj_path) + lib_obj_path, + fcompile=cross_compiler( + dev_config, + LibType.OPERATOR, + lib_src_paths=lib_src_paths, + lib_headers=lib_headers, + lib_include_paths=lib_include_paths)) + micro_mod = tvm.runtime.load_module(lib_obj_path) return micro_mod -def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, lib_include_paths=None): +def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, + lib_include_paths=None): """Create a cross compile function that wraps `create_lib` for a `Binutil` instance. For use in `tvm.runtime.Module.export_library`. 
@@ -247,7 +251,8 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, l fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR) c_mod.export_library('dev_lib.obj', fcompile=fcompile) """ - assert (lib_headers is None) == (lib_include_paths is None), 'must specify both `lib_headers` and `lib_include_paths` or neither' + assert (lib_headers is None) == (lib_include_paths is None), \ + 'must specify both `lib_headers` and `lib_include_paths` or neither' if lib_src_paths is None: lib_src_paths = [] @@ -257,7 +262,8 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, l for include_path in lib_include_paths: include_options.append('-I') include_options.append(include_path) - create_micro_lib = tvm.micro.device.get_device_funcs(dev_config['device_id'])['create_micro_lib'] + create_micro_lib = tvm.micro.device.get_device_funcs( + dev_config['device_id'])['create_micro_lib'] mem_layout = dev_config['mem_layout'] def compile_func(obj_path, src_path, **kwargs): @@ -274,7 +280,8 @@ def compile_func(obj_path, src_path, **kwargs): max_ws_usage = _calc_max_workspace_usage(src_contents) available_mem = mem_layout['workspace']['size'] if max_ws_usage > available_mem: - raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) exceed available memory ({available_mem})') + raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) ' + f'exceed available memory ({available_mem})') # inject headers into new source path, if requested if lib_headers: headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n' diff --git a/python/tvm/micro/device/__init__.py b/python/tvm/micro/device/__init__.py index 3d2291c6a052..89731b9aa797 100644 --- a/python/tvm/micro/device/__init__.py +++ b/python/tvm/micro/device/__init__.py @@ -16,7 +16,8 @@ # under the License. 
"""Device-specific configuration for MicroTVM""" -from .base import create_micro_lib_base, gen_mem_layout, MemConstraint, register_device, get_device_funcs +from .base import create_micro_lib_base, gen_mem_layout +from .base import MemConstraint, register_device, get_device_funcs from . import host from . import arm from . import riscv_spike diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index 8cd354738fe7..f85a34e4e0a2 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Compilation and config definitions for Arm STM32F746XX devices""" +import os from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint DEVICE_ID = 'arm.stm32f746xx' @@ -60,13 +61,16 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N """ if options is None: options = [] + else: + options = list(options) + options += [ + # TODO(weberlo): make a debug flag + '-O2', '-march=armv7e-m', '-mcpu=cortex-m7', '-mlittle-endian', '-mfloat-abi=hard', - # TODO try this one? 
- #'-mfpu=fpv5-d16', '-mfpu=fpv5-sp-d16', '-mthumb', '-ffast-math', @@ -74,9 +78,14 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N '-DARM_MATH_CM7', '-D__FPU_PRESENT=1U', '-DARM_MATH_DSP', + '-Wno-unused-variable', + '-Wno-unused-parameter', + '-I{}'.format(os.environ['CMSIS_ST_PATH']), + '-I{}/Core/Include'.format(os.environ['CMSIS_ST_PATH']) ] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, + lib_src_paths=lib_src_paths) def generate_config(server_addr, server_port, section_constraints=None): @@ -90,8 +99,8 @@ def generate_config(server_addr, server_port, section_constraints=None): server_port : int port of OpenOCD server to connect to - section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]] - TODO correct type annotation? + section_constraints: Optional[Dict[str, [Number, MemConstraint]]] + maps section name to the quantity of available memory Return ------ @@ -106,7 +115,7 @@ def generate_config(server_addr, server_port, section_constraints=None): 'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints), 'word_size': WORD_SIZE, 'thumb_mode': True, - 'use_device_timer': False, + 'use_device_timer': True, 'comms_method': 'openocd', 'server_addr': server_addr, 'server_port': server_port, diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py index 41d1a3ed1a95..1621c69d1a77 100644 --- a/python/tvm/micro/device/base.py +++ b/python/tvm/micro/device/base.py @@ -19,7 +19,6 @@ import os import enum import pathlib -import operator from tvm.contrib import util as _util from tvm.contrib.binutil import run_cmd @@ -98,14 +97,8 @@ def create_micro_lib_base( additional options to pass to GCC lib_src_paths : Optional[List[str]] - TODO + paths to additional source files to be compiled into the library """ - 
print('[MicroBinutil.create_lib]') - print(' EXTENDED OPTIONS') - print(f' {out_obj_path}') - print(f' {in_src_path}') - print(f' {lib_type}') - print(f' {options}') # look at these (specifically `strip`): # https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size base_compile_cmd = [ @@ -115,10 +108,6 @@ def create_micro_lib_base( '-Wextra', '--pedantic', '-c', - # TODO(weberlo): make a debug flag - '-O0', - # '-O2', - # '-Os', '-g', '-nostartfiles', '-nodefaultlibs', @@ -162,12 +151,12 @@ def create_micro_lib_base( if lib_src_paths is not None: src_paths += lib_src_paths - print(f'include paths: {include_paths}') + # print(f'include paths: {include_paths}') for path in include_paths: base_compile_cmd += ['-I', path] prereq_obj_paths = [] - print(src_paths) + # print(src_paths) for src_path in src_paths: curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name) assert curr_obj_path not in prereq_obj_paths @@ -190,9 +179,28 @@ class MemConstraint(enum.Enum): def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): - print('[gen_mem_layout]') - byte_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.ABSOLUTE_BYTES, section_constraints.values()))) - weight_sum = sum(map(operator.itemgetter(0), filter(lambda x: x[1] == MemConstraint.WEIGHT, section_constraints.values()))) + """Template function to generate memory layout for devices. + + Parameters + ---------- + base_addr: Number + The address where usable memory begins on this device. + + available_mem: Number + Available memory at base_addr, given in bytes. + + word_size: Number + Number of bytes in one word on this device. 
+ + section_constraints: Optional[Dict[str, [Number, MemConstraint]]] + maps section name to the quantity of available memory + """ + byte_sum = sum(x[0] + for x in section_constraints.values() + if x[1] == MemConstraint.ABSOLUTE_BYTES) + weight_sum = sum(x[0] + for x in section_constraints.values() + if x[1] == MemConstraint.WEIGHT) assert byte_sum <= available_mem available_weight_mem = available_mem - byte_sum @@ -201,7 +209,8 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): for section in DEVICE_SECTIONS: (val, cons_type) = section_constraints[section] if cons_type == MemConstraint.ABSOLUTE_BYTES: - assert val % word_size == 0, f'constraint {val} for {section} section is not word-aligned' + assert val % word_size == 0, \ + f'constraint {val} for {section} section is not word-aligned' size = val res[section] = { 'start': curr_addr, @@ -216,13 +225,6 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): } curr_addr += size - print(' result mem layout:') - for section in DEVICE_SECTIONS: - start = res[section]['start'] - size = res[section]['size'] - print(f' {section}: start={start:x}, size={size}') - # import pprint - # pprint.pprint(res) return res diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py index 737026dfd51a..f6d0dadc5f43 100644 --- a/python/tvm/micro/device/host.py +++ b/python/tvm/micro/device/host.py @@ -54,14 +54,19 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N additional options to pass to GCC lib_src_paths : Optional[List[str]] - TODO + paths to additional source files to be compiled into the library """ if options is None: options = [] + else: + options = list(options) + # Cannot increase optimization level on host due to code loading method. 
+ options.append('-O0') if sys.maxsize > 2**32 and sys.platform.startswith('linux'): options += ['-mcmodel=large'] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, + lib_src_paths=lib_src_paths) def generate_config(available_mem=None, section_constraints=None): @@ -69,13 +74,11 @@ def generate_config(available_mem=None, section_constraints=None): Parameters ---------- - TODO correct type annotation? available_mem: int - TODO + number of RW bytes available for use on device - TODO correct type annotation? - section_constraints: Optional[Dict[str, Dict[str, Number]]] - TODO + section_constraints: Optional[Dict[str, Dict[Number, MemConstraint]]] + maps section name to the quantity of available memory Return ------ diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py index b7beff3c5cb6..f26f04604cac 100644 --- a/python/tvm/micro/device/riscv_spike.py +++ b/python/tvm/micro/device/riscv_spike.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """Compilation and config definitions for Spike, a RISC-V functional ISA simulator""" -from collections import OrderedDict from . 
import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 42ced795f6ae..6a1d928352f4 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -630,12 +630,6 @@ def visitCallWithAttr(self, ctx: RelayParser.CallWithAttrContext): def call(self, func, args, attrs, type_args): if isinstance(func, OpWrapper): - #if hasattr(func.operator, '__name__') and func.operator.__name__ == 'clip': - # # TODO(wbelrlo) this big fucking hack yes - # import copy - # args = copy.deepcopy(args) - # args[1] = float(args[1].data.asnumpy()) - # args[2] = float(args[2].data.asnumpy()) return func(args, attrs, type_args) if isinstance(func, adt.Constructor): return func(*args) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 459a4a588324..30c5971e32b9 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -239,26 +239,16 @@ def build(mod, target=None, target_host=None, params=None): raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - # # If current dispatch context is fallback context (the default root context), - # # then load pre-tuned parameters from TopHub - # if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): - # tophub_context = autotvm.tophub.context(list(target.values())) - # else: - # tophub_context = autotvm.util.EmptyContext() - - # with tophub_context: - # bld_mod = BuildModule() - # graph_json, mod, params = bld_mod.build(func, target, target_host, params) - + # If current dispatch context is fallback context (the default root context), + # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) - with tophub_context: - bld_mod = BuildModule() - graph_json, mod, params = bld_mod.build(func, target, target_host, 
params) else: - bld_mod = BuildModule() - graph_json, mod, params = bld_mod.build(func, target, target_host, params) + tophub_context = autotvm.util.EmptyContext() + with tophub_context: + bld_mod = BuildModule() + graph_json, mod, params = bld_mod.build(mod, target, target_host, params) return graph_json, mod, params diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index d455f2229439..43065bef838a 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -85,7 +85,7 @@ def _convert_activation(inexpr, keras_layer, _): return _op.sigmoid(inexpr) if act_type == 'tanh': return _op.tanh(inexpr) - if act_type in ('relu', 'swish'): + if act_type == 'relu': return _op.nn.relu(inexpr) if act_type == 'softplus': return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32'))) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 942d4c7f86af..2ad75533e807 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -20,24 +20,25 @@ import logging import topi +from ....target import arm_isa from .generic import * from .. 
import op as _op logger = logging.getLogger('strategy') -@schedule_injective.register("arm_cpu") +@schedule_injective.register(["arm_cpu", "micro_dev"]) def schedule_injective_arm_cpu(_, outs, target): """schedule injective ops for arm cpu""" with target: return topi.arm_cpu.schedule_injective(outs) -@schedule_concatenate.register("arm_cpu") +@schedule_concatenate.register(["arm_cpu", "micro_dev"]) def schedule_concatenate_arm_cpu(_, outs, target): """schedule concatenate for arm cpu""" with target: return topi.arm_cpu.schedule_concatenate(outs) -@conv2d_strategy.register("arm_cpu") +@conv2d_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d arm cpu strategy""" strategy = _op.OpStrategy() @@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") + isa = arm_isa.IsaAnalyzer(target) + if groups == 1: if layout == "NCHW": if kernel_layout == "OIHW": @@ -102,11 +105,22 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic") elif layout == "NHWC": - assert kernel_layout == "HWIO" - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + channels = data.shape[3] + if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), + name='conv2d_direct_simd.micro_dev') + elif kernel_layout == "HWIO": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") + 
else: + raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC". + format(kernel_layout)) + + else: raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out format(layout)) return strategy -@conv2d_transpose_strategy.register("arm_cpu") +@conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d_transpose arm cpu strategy""" layout = attrs.data_layout @@ -248,7 +262,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_tranpose_nchw.arm_cpu") return strategy -@bitserial_conv2d_strategy.register("arm_cpu") +@bitserial_conv2d_strategy.register(["arm_cpu", "micro_dev"]) def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """bitserial_conv2d x86 strategy""" strategy = _op.OpStrategy() @@ -267,7 +281,7 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): raise ValueError("Data layout {} not supported.".format(layout)) return strategy -@bitserial_dense_strategy.register("arm_cpu") +@bitserial_dense_strategy.register(["arm_cpu", "micro_dev"]) def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): """bitserial_dense arm cpu strategy""" strategy = _op.OpStrategy() diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index f5a16941083b..ed57e0d4276d 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -70,11 +70,6 @@ def context(self, dev_type, dev_id=0): ctx: TVMContext The corresponding encoded remote context. 
""" - print('[RPCSession.context]') - print(f' dev_type: {dev_type}') - print(f' dev_id: {dev_id}') - if '-device=micro_dev' in dev_type: - dev_type = 'micro_dev' ctx = nd.context(dev_type, dev_id) encode = (self._tbl_index + 1) * base.RPC_SESS_MASK ctx.device_type += encode @@ -408,14 +403,9 @@ def connect(url, port, key="", session_timeout=0): The connected session. """ try: - print('[client.connect]') - #session_timeout = 0 - print(' hardcoding timeout to 0 (always keep alive)!') if session_timeout: key += " -timeout=%s" % str(session_timeout) - print(f' connecting to RPC server with {url}, {port}, {key}') sess = base._Connect(url, port, key) - print(f' finished connecting!') except NameError: raise RuntimeError("Please compile with USE_RPC=1") return RPCSession(sess) diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index 74296cc2ceaa..03749c1c17e4 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -64,9 +64,7 @@ def get_workpath(path): def load_module(file_name): """Load module from remote side.""" path = temp.relpath(file_name) - print('[rpc.server] ABOUT TO LOAD MOD') m = _load_module(path) - print('[rpc.server] DONE LOADING MOD') logger.info("load_module %s", path) return m diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index 001c76c76fef..e3346b162aaf 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -297,11 +297,8 @@ def _event_handler(_, events): def _on_event(self, _): while True: try: - print('waiting for connection!') conn, addr = self._sock.accept() - print(f'got new conn: {conn}, {addr}') TCPEventHandler(self, conn, addr) - print(f'made event handler') except socket.error as err: if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK): break diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 45dd79e50019..716f87f33fc1 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -211,7 +211,6 @@ def time_evaluator(self, 
func_name, ctx, number=10, repeat=1, min_repeat_ms=0): def evaluator(*args): """Internal wrapped evaluator.""" - print('[Module.time_evaluator.evaluator]') # Wrap feval so we can add more stats in future. blob = feval(*args) fmt = "@" + ("d" * repeat) diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 10bbb6ef54c2..9b7e7c52f351 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -219,7 +219,7 @@ def context(dev_type, dev_id=0): """ if isinstance(dev_type, string_types): if '-device=micro_dev' in dev_type: - dev_type = 'micro_dev' + dev_type = TVMContext.STR2MASK['micro_dev'] else: dev_type = dev_type.split()[0] if dev_type not in TVMContext.STR2MASK: diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py new file mode 100644 index 000000000000..6b2eda8f050e --- /dev/null +++ b/python/tvm/target/arm_isa.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Defines functions to analyze available opcodes in the ARM ISA.""" + + +ARM_ISA_MAP = { + 'armv7e-m': ['SMLAD'], +} + + +class IsaAnalyzer(object): + + def __init__(self, target): + self.target = target + # TODO: actually parse -mcpu + arch = 'armv7e-m' + self._isa_map = ARM_ISA_MAP[arch] + + def __contains__(self, instruction): + return instruction in self._isa_map diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 849c74028d63..8231c1b79d38 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -272,7 +272,7 @@ runtime::Module build(const Map& inputs, Target target_host_val = target_host; if (!target_host.defined()) { for (const auto& it : inputs) { - if (it.first->device_type == kDLCPU) { + if (it.first->device_type == kDLCPU || it.first->device_type == kDLMicroDev) { target_host_val = it.first; break; } diff --git a/src/ir/error.cc b/src/ir/error.cc index e700691f3e07..67694342db45 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -69,8 +69,6 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { if (error_indices.size() != 0) { err_msg << rang::fg::red; err_msg << " "; - // TODO should fix this reverse problem further upstream (in the error reporter). - // // the errors are in reverse order, so print them with a reversed iteration err_msg << this->errors_[error_indices[error_indices.size()-1]].what(); for (int i = error_indices.size() - 2; i >= 0; i--) { diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s index 5861c0326dbf..f5720f4d7b28 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s @@ -17,12 +17,6 @@ * under the License. */ -/*!
- * Copyright (c) 2019 by Contributors - * \file utvm_init.s - * \brief uTVM init definition for STM32F746XX-series boards - */ - .syntax unified .cpu cortex-m7 .fpu softvfp diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c index a5a12cf86ebd..0f13a7dede88 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file utvm_timer.c * \brief uTVM timer API definitions for STM32F746XX-series boards */ @@ -30,97 +29,51 @@ extern "C" { #include #include "utvm_runtime.h" +// NOTE: This expects ST CMSIS to be in your include path. +// Download STM32CubeF7 here: +// https://www.st.com/content/st_com/en/products/embedded-software/mcu-mpu-embedded-software/stm32-embedded-software/stm32cube-mcu-mpu-packages/stm32cubef7.html +// and add Drivers/CMSIS to your C include path. +#include "Device/ST/STM32F7xx/Include/stm32f746xx.h" -// There are two implementations of cycle counters on the STM32F7X: SysTick and -// CYCCNT. SysTick is preferred, as it gives better error handling, but the -// counter is only 24 bits wide. If a larger timer is needed, use the CYCCNT -// implementation, which has a 32-bit counter. 
-#define USE_SYSTICK -#ifdef USE_SYSTICK - -#define SYST_CSR (*((volatile uint32_t *) 0xE000E010)) -#define SYST_RVR (*((volatile uint32_t *) 0xE000E014)) -#define SYST_CVR (*((volatile uint32_t *) 0xE000E018)) -#define SYST_CALIB (*((volatile uint32_t *) 0xE000E01C)) - -#define SYST_CSR_ENABLE 0 -#define SYST_CSR_TICKINT 1 -#define SYST_CSR_CLKSOURCE 2 -#define SYST_COUNTFLAG 16 - -#define SYST_CALIB_NOREF 31 -#define SYST_CALIB_SKEW 30 - -volatile uint32_t start_time = 0; -volatile uint32_t stop_time = 0; +#define utvm_SystemCoreClock 216000000UL int32_t UTVMTimerStart() { - SYST_CSR = 0; - // maximum reload value (24-bit) - SYST_RVR = (~((uint32_t) 0)) >> 8; - SYST_CVR = 0; - - SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE); - // wait until timer starts - while (SYST_CVR == 0) {} - start_time = SYST_CVR; + UTVMTimerReset(); + TIM2->CR1 = + TIM_CR1_CEN; // Start counter return UTVM_ERR_OK; } -uint32_t UTVMTimerStop(int32_t *err) { - SYST_CSR &= ~((uint32_t) 1); - stop_time = SYST_CVR; - if (SYST_CSR & (1 << SYST_COUNTFLAG)) { - TVMAPISetLastError("timer overflowed"); +uint32_t UTVMTimerStop(int32_t* err) { + TIM2->CR1 &= ~TIM_CR1_CEN; // Clear CEN so the counter stops before CNT is read + if (TIM2->SR & TIM_SR_UIF_Msk) { *err = UTVM_ERR_TIMER_OVERFLOW; return 0; - } else { - *err = UTVM_ERR_OK; - return start_time - stop_time; - } -} - -#else // !USE_SYSTICK - -#define DWT_CTRL (*((volatile uint32_t *) 0xE0001000)) -#define DWT_CYCCNT (*((volatile uint32_t *) 0xE0001004)) - -#define DWT_CTRL_NOCYCCNT 25 -#define DWT_CTRL_CYCCNTENA 0 - -volatile uint32_t start_time = 0; -volatile uint32_t stop_time = 0; - -int32_t UTVMTimerStart() { - DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA); - DWT_CYCCNT = 0; - - if (DWT_CTRL & (1 << DWT_CTRL_NOCYCCNT)) { - TVMAPISetLastError("cycle counter not implemented on device"); - return UTVM_ERR_TIMER_NOT_IMPLEMENTED; } - start_time = DWT_CYCCNT; - DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA); - return UTVM_ERR_OK; + *err = UTVM_ERR_OK; + uint32_t tim_cnt = TIM2->CNT; + uint32_t millis
= tim_cnt / (utvm_SystemCoreClock / 1000); + uint32_t micros = + (tim_cnt - (millis * (utvm_SystemCoreClock / 1000))) / + (utvm_SystemCoreClock / 1000000); + return millis * 1000 + micros; } -uint32_t UTVMTimerStop(int32_t* err) { - stop_time = DWT_CYCCNT; - DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA); - // even with this check, we can't know for sure if the timer has overflowed - // (it may have overflowed and gone past `start_time`). - if (stop_time > start_time) { - *err = UTVM_ERR_OK; - return stop_time - start_time; - } else { - *err = UTVM_ERR_TIMER_OVERFLOW; - return 0; +void UTVMTimerReset() { + RCC->APB1RSTR |= RCC_APB1RSTR_TIM2RST; // Hold TIM2 in reset + RCC->DCKCFGR1 = (RCC->DCKCFGR1 & ~RCC_DCKCFGR1_TIMPRE_Msk); // disable 2x clock boost to TIM2 + RCC->CFGR = (RCC->CFGR & ~RCC_CFGR_PPRE1_Msk); // No AHB clock division to APB1 (1:1). + RCC->APB1ENR |= RCC_APB1ENR_TIM2EN; // Enable TIM2 clock. + RCC->APB1RSTR &= ~RCC_APB1RSTR_TIM2RST; // Exit TIM2 reset. + + DBGMCU->APB1FZ |= DBGMCU_APB1_FZ_DBG_TIM2_STOP; // stop TIM2 clock during debug halt. + TIM2->ARR = 0xffffffff; + if (TIM2->SR & TIM_SR_UIF_Msk) { + for (;;) ; } } -#endif // USE_SYSTICK - #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/device/host/utvm_init.c b/src/runtime/micro/device/host/utvm_init.c index 02888f04c48e..4fb43c11d20e 100644 --- a/src/runtime/micro/device/host/utvm_init.c +++ b/src/runtime/micro/device/host/utvm_init.c @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file utvm_init.c * \brief uTVM init definition for the host emulated device */ diff --git a/src/runtime/micro/device/host/utvm_timer.c b/src/runtime/micro/device/host/utvm_timer.c index 7b24aab473d1..6ab585a88f24 100644 --- a/src/runtime/micro/device/host/utvm_timer.c +++ b/src/runtime/micro/device/host/utvm_timer.c @@ -20,12 +20,10 @@ /*! 
* \file utvm_timer.c * \brief uTVM timer API stubs for the host emulated device - * Copyright (c) 2019 by Contributors */ #include -#include -#include +#include "utvm_runtime.h" // TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c @@ -37,17 +35,3 @@ uint32_t UTVMTimerStop(int32_t* err) { *err = UTVM_ERR_OK; return 0; } - -extern void UTVMInit(); - -extern void UTVMTimerReset(); - -extern int32_t UTVMTimerStart(); - -extern void UTVMTimerStop(); - -extern uint32_t UTVMTimerRead(); - -void UTVMMain(); - -void UTVMDone(); diff --git a/src/runtime/micro/device/riscv_spike/utvm_init.s b/src/runtime/micro/device/riscv_spike/utvm_init.s index a73641249165..68662cce97e7 100644 --- a/src/runtime/micro/device/riscv_spike/utvm_init.s +++ b/src/runtime/micro/device/riscv_spike/utvm_init.s @@ -17,12 +17,6 @@ * under the License. */ -/*! - * Copyright (c) 2019 by Contributors - * \file utvm_init.s - * \brief uTVM init definition for Spike - */ - UTVMInit: /* set stack pointer */ la sp, _utvm_stack_pointer_init diff --git a/src/runtime/micro/device/riscv_spike/utvm_timer.c b/src/runtime/micro/device/riscv_spike/utvm_timer.c index c4e0af2b230b..5cf38559feab 100644 --- a/src/runtime/micro/device/riscv_spike/utvm_timer.c +++ b/src/runtime/micro/device/riscv_spike/utvm_timer.c @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file utvm_timer.c * \brief uTVM timer API stubs for Spike */ diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c index 970eb27a1ef3..3a840e7a7861 100644 --- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c +++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c @@ -32,11 +32,10 @@ extern "C" { #include #include -// TODO compiler errors say volatile qualifier is discarded. should we just get rid of em? 
-volatile void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = - (void *(*)(int, int, uint64_t, int, int)) NULL; -volatile int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; -volatile void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; +// TODO(areusch): compiler errors say volatile qualifier is discarded. should we just get rid of em? +void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL; +int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL; +void (* volatile TVMAPISetLastError_)(const char*) = NULL; void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { diff --git a/src/runtime/micro/host_driven/utvm_runtime.c b/src/runtime/micro/host_driven/utvm_runtime.c index a4de495a185c..f05fdb5fe2c1 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.c +++ b/src/runtime/micro/host_driven/utvm_runtime.c @@ -34,89 +34,148 @@ extern "C" { #include "utvm_runtime.h" -// Task pointers must be patched before calling a function. -UTVMTask utvm_task = { - .func = NULL, - .arg_values = NULL, - .arg_type_codes = NULL, - .num_args = 0, -}; - -size_t utvm_word_size = 0; // NOLINT(*) +// TODO(areusch): move defines into header +#define TASK_QUEUE_SIZE 20 +volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { }; +volatile uint32_t utvm_num_tasks = 0; +volatile uint32_t utvm_task_times[TASK_QUEUE_SIZE] = { }; // These pointers are patched at load time to point to the workspace section. 
-char* utvm_workspace_start = NULL; // NOLINT(*) -char* utvm_workspace_end = NULL; // NOLINT(*) -char* utvm_workspace_curr = NULL; // NOLINT(*) +volatile char* utvm_workspace_start = NULL; // NOLINT(*) +volatile char* utvm_workspace_end = NULL; // NOLINT(*) +volatile char* utvm_workspace_curr = NULL; // NOLINT(*) +#define MAX_WS_ALLOCS 10 +volatile char* utvm_alloc_ends[MAX_WS_ALLOCS] = {}; // NOLINT(*) +volatile uint32_t utvm_alloc_idx = 0; // Keep track of how many active allocations there are on the workspace. -size_t utvm_num_active_allocs = 0; +volatile uint32_t utvm_num_active_allocs = 0; + +volatile uint32_t utvm_word_size = 0; -const char* utvm_last_error = NULL; // NOLINT(*) -int32_t utvm_return_code = 0; // NOLINT(*) +volatile int32_t utvm_last_error = 0; // NOLINT(*) -uint32_t utvm_task_time = 0; +volatile uint32_t utvm_done = 0; // Gets called by UTVMInit, after device-specific initialization is finished. void UTVMMain() { + utvm_done = 0; + // loss of precision should be fine here, since we only care about the lower bits + if (((uint32_t) utvm_workspace_start) % utvm_word_size) { + utvm_last_error = UTVM_ERR_WS_UNALIGNED_START; + UTVMDone(); + return; + } utvm_workspace_curr = utvm_workspace_start; utvm_num_active_allocs = 0; - utvm_last_error = NULL; // NOLINT(*) - utvm_return_code = 0; - utvm_task_time = 0; - UTVMTimerReset(); - int32_t err = UTVMTimerStart(); - if (err < 0) { - utvm_return_code = err; - UTVMDone(); + utvm_alloc_idx = 0; + utvm_last_error = UTVM_ERR_NOT_FINISHED; + for (uint32_t i = 0; i < utvm_num_tasks; i++) { + int32_t err = UTVM_ERR_OK; + utvm_task_times[i] = 0; + err = UTVMTimerStart(); + if (err < 0) { + utvm_last_error = err; + UTVMDone(); + return; + } + err = utvm_tasks[i].func( + (void*) utvm_tasks[i].arg_values, // NOLINT(*) + (void*) utvm_tasks[i].arg_type_codes, // NOLINT(*) + utvm_tasks[i].num_args); + if (err < 0) { + UTVMDone(); + return; + } + utvm_task_times[i] = UTVMTimerStop(&err); + if (err < 0) { + 
utvm_last_error = err; + UTVMDone(); + return; + } + } + if (utvm_last_error == UTVM_ERR_NOT_FINISHED) { + utvm_last_error = UTVM_ERR_OK; } - utvm_return_code = utvm_task.func( - (void*) utvm_task.arg_values, // NOLINT(*) - (void*) utvm_task.arg_type_codes, // NOLINT(*) - utvm_task.num_args); - UTVMTimerStop(); - utvm_task_time = UTVMTimerRead(); UTVMDone(); } // We use a dummy function to signal execution is finished for device // backends which require breakpoints. -void UTVMDone() { } +void __attribute__((noinline)) UTVMDone() { + utvm_done = 1; +} + +#define ALIGNED_UP(x, word_size) \ + ((((word_size) - (((uintptr_t) (x)) % (word_size))) % (word_size)) + (x)) void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { - // Align up to 8 bytes. - utvm_workspace_curr += - (utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size; // NOLINT(*) - if (utvm_workspace_curr + size > utvm_workspace_end) { + if (size == 0) { + utvm_last_error = UTVM_ERR_WS_ZERO_SIZE_ALLOC; + return NULL; + } + size_t alloc_requested_bytes = size; + size_t alloc_size_words = (alloc_requested_bytes + utvm_word_size - 1) / utvm_word_size; + size_t alloc_size_bytes = alloc_size_words * utvm_word_size; + + // Align up to the target word size. + if (utvm_workspace_curr + alloc_size_bytes > utvm_workspace_end) { // Out of space in workspace. + utvm_last_error = UTVM_ERR_WS_OUT_OF_SPACE; + return NULL; + } + if (utvm_alloc_idx == MAX_WS_ALLOCS - 1) { + // Exceeded number of allocs we can keep track of. 
+ utvm_last_error = UTVM_ERR_WS_TOO_MANY_ALLOCS; return NULL; } void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) - utvm_workspace_curr += size; + utvm_workspace_curr = utvm_workspace_curr + alloc_size_bytes; + // store the *end* of the alloc, so we can restore the WS pointer when freeing + utvm_alloc_ends[utvm_alloc_idx] = utvm_workspace_curr; + utvm_alloc_idx++; utvm_num_active_allocs++; return ret_ptr; } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - utvm_num_active_allocs--; - if (utvm_num_active_allocs < 0) { + // TODO(areusch): add dev type check + if (utvm_num_active_allocs == 0) { TVMAPISetLastError("free called with no active workspace allocations"); // Reset allocations and workspace (for future task executions). utvm_num_active_allocs = 0; utvm_workspace_curr = utvm_workspace_start; + utvm_last_error = UTVM_ERR_WS_DOUBLE_FREE; return -1; - } else if (utvm_num_active_allocs == 0) { - // No more allocations. Reset workspace. - utvm_workspace_curr = utvm_workspace_start; - return 0; } else { + utvm_num_active_allocs--; + if (ptr == utvm_workspace_start) { + // it's the first allocation + utvm_alloc_ends[0] = NULL; + } else { + // TODO(areusch): reverse loop iteration since usually it's the last alloc being freed + for (uint32_t i = utvm_alloc_idx; i > 0; i--) { // i is unsigned: stop at 0, index with i - 1 + if (utvm_alloc_ends[i - 1] == ptr) { + utvm_alloc_ends[i] = NULL; + break; + } + } + } + while (utvm_alloc_idx > 0 && utvm_alloc_ends[utvm_alloc_idx - 1] == NULL) { + utvm_alloc_idx--; + } + if (utvm_alloc_idx == 0) { + utvm_workspace_curr = utvm_workspace_start; + } else { + // TODO(areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in this + // branch?
+ utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1]; + } return 0; } } -void TVMAPISetLastError(const char* msg) { - utvm_last_error = msg; -} +void TVMAPISetLastError(const char* msg) { } #ifdef __cplusplus } // TVM_EXTERN_C diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h index b39309a784f5..42ca025fc2ab 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.h +++ b/src/runtime/micro/host_driven/utvm_runtime.h @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file utvm_runtime.h * \brief uTVM runtime headers */ diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index a24994a2a0e5..13f68f295fa6 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -60,16 +60,16 @@ class HostLowLevelDevice final : public LowLevelDevice { munmap(base_addr_, size_); } - void Read(DevPtr addr, void* buf, size_t num_bytes) { + void Read(TargetPtr addr, void* buf, size_t num_bytes) { std::memcpy(buf, addr.cast_to(), num_bytes); } - void Write(DevPtr addr, const void* buf, size_t num_bytes) { + void Write(TargetPtr addr, const void* buf, size_t num_bytes) { std::memcpy(addr.cast_to(), buf, num_bytes); } - void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { - reinterpret_cast(func_addr.value().val64)(); + void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) { + reinterpret_cast(func_addr.value().uint64())(); } const char* device_type() const final { diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 3158e2fe20de..666b08199a6b 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -45,7 +45,7 @@ class LowLevelDevice { * \param buffer on-host buffer to be read into * \param num_bytes number of bytes to read */ - virtual void Read(DevPtr addr, + virtual void Read(TargetPtr addr, void* buffer, size_t num_bytes) = 
0; @@ -55,7 +55,7 @@ class LowLevelDevice { * \param buffer host buffer to write from * \param num_bytes number of bytes to write */ - virtual void Write(DevPtr addr, + virtual void Write(TargetPtr addr, const void* buffer, size_t num_bytes) = 0; @@ -64,7 +64,7 @@ class LowLevelDevice { * \param func_addr offset of the init stub function * \param breakpoint_addr address at which to stop function execution */ - virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0; + virtual void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) = 0; /*! * \brief getter function for low-level device type diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 632b6048b182..3e322c639edd 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -52,11 +52,11 @@ const char* SectionToString(SectionKind section) { std::string RelocateBinarySections( const std::string& binary_path, size_t word_size, - DevPtr text_start, - DevPtr rodata_start, - DevPtr data_start, - DevPtr bss_start, - DevPtr stack_end, + TargetPtr text_start, + TargetPtr rodata_start, + TargetPtr data_start, + TargetPtr bss_start, + TargetPtr stack_end, const std::string& toolchain_prefix) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index e6696f56f06c..7ba62d3f30df 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -52,26 +52,65 @@ enum class SectionKind : size_t { kNumKinds, }; -/*! \brief union for storing values on varying target word sizes */ -union TargetVal { - /*! \brief 32-bit pointer */ - uint32_t val32; - /*! \brief 64-bit pointer */ - uint64_t val64; +/*! 
\brief class for storing values on varying target word sizes */ +class TargetVal { + private: + size_t width_bits_; + uint64_t value_; + + public: + template <typename T, typename U = typename std::enable_if<std::is_integral<T>::value, T>::type> + explicit constexpr TargetVal(T value) : + width_bits_{sizeof(T) * 8}, value_{value} {} + + TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} { + CHECK(width_bits != 0 && (width_bits & (width_bits - 1)) == 0) + << "width_bits must be a power of 2, got " << width_bits; + *this = value; + } + + size_t width_bits() const { return width_bits_; } + uint64_t bitmask() const { + if (width_bits_ == 64) { + return ~uint64_t{0}; // all 64 bits set; a shift by 64 would be undefined + } else { + return (uint64_t{1} << width_bits_) - 1; // 64-bit shift; 1 << 32 overflows int + } + } + + uint32_t uint32() const { + CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is " + << width_bits_; + return uint32_t(value_ & bitmask()); + } + + uint64_t uint64() const { + return value_; + } + + TargetVal& operator=(const uint64_t& value) { + if (width_bits_ == 64) { + value_ = value; + } else { + CHECK((value & ~bitmask()) == 0) << "bits above " << width_bits_ << " are non-zero"; + value_ = value & bitmask(); + } + return *this; + } }; -// TODO just get rid of `DevPtr`. +// TODO(areusch): just get rid of `TargetPtr`. /*! \brief absolute device address */ class TargetPtr { public: - /*! \brief construct a device address with value `value` */ - explicit TargetPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {} + /*! \brief construct a device address with val64 `value` */ + explicit TargetPtr(std::uint64_t value) : value_(TargetVal(64, value)) {} - /*! \brief default constructor */ - TargetPtr() : value_(TargetVal { .val64 = 0 }) {} + /*! \brief default constructor (val64 0) */ + TargetPtr() : value_(TargetVal(64, 0)) {} - /*! \brief construct a null address */ - explicit TargetPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {} + /*!
\brief construct a null address (stored in val64) */ + explicit TargetPtr(std::nullptr_t value) : value_{TargetVal(64, 0)} {} /*! \brief destructor */ ~TargetPtr() {} @@ -87,33 +126,33 @@ class TargetPtr { * \return casted result */ template - T cast_to() const { return reinterpret_cast(value_.val64); } + T cast_to() const { return reinterpret_cast(value_.uint64()); } /*! \brief check if location is null */ - bool operator==(std::nullptr_t) const { return value_.val64 == 0; } + bool operator==(std::nullptr_t) const { return value_.uint64() == 0; } /*! \brief check if location is not null */ - bool operator!=(std::nullptr_t) const { return value_.val64 != 0; } + bool operator!=(std::nullptr_t) const { return value_.uint64() != 0; } /*! \brief add an integer to this absolute address to get a larger absolute address */ TargetPtr operator+(size_t n) const { - return TargetPtr(value_.val64 + n); + return TargetPtr(value_.uint64() + n); } /*! \brief mutably add an integer to this absolute address */ TargetPtr& operator+=(size_t n) { - value_.val64 += n; + value_ = value_.uint64() + n; return *this; } /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ TargetPtr operator-(size_t n) const { - return TargetPtr(value_.val64 - n); + return TargetPtr(value_.uint64() - n); } /*! \brief mutably subtract an integer from this absolute address */ TargetPtr& operator-=(size_t n) { - value_.val64 -= n; + value_ = value_.uint64() - n; return *this; } @@ -174,6 +213,12 @@ class SymbolMap { return map_.find(name) != map_.end(); } + void Dump(std::ostream& stream) const { + for (auto e : map_) { + stream << "Entry:" << e.first << std::endl; + } + } + private: /*! 
\brief backing map */ std::unordered_map map_; diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index fbaef4af5b07..619c31a8b1ba 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -61,7 +61,7 @@ class MicroDeviceAPI final : public DeviceAPI { void FreeDataSpace(TVMContext ctx, void* ptr) final { MicroDevSpace* dev_space = static_cast(ptr); dev_space->session->FreeInSection( - SectionKind::kHeap, DevPtr(reinterpret_cast(dev_space->data))); + SectionKind::kHeap, TargetPtr(reinterpret_cast(dev_space->data))); delete dev_space; } @@ -74,12 +74,9 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { - std::cout << "[MicroDeviceAPI::CopyDataFromTo]" << std::endl; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { - std::cout << " device to device" << std::endl; // Copying from the device to the device. - MicroDevSpace* from_space = static_cast(const_cast(from)); MicroDevSpace* to_space = static_cast(const_cast(to)); CHECK(from_space->session == to_space->session) @@ -93,30 +90,27 @@ class MicroDeviceAPI final : public DeviceAPI { session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); - DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); - DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); + TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset); + TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset); std::vector buffer(size); lld->Read(from_dev_addr, static_cast(buffer.data()), size); lld->Write(to_dev_addr, static_cast(buffer.data()), size); + } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { - std::cout << " reading from device" << std::endl; - std::cout << " num_bytes: " << size << std::endl; // Reading from the device. 
- MicroDevSpace* from_space = static_cast(const_cast(from)); ObjectPtr& session = from_space->session; // flush all pending tasks to ensure data is consistent session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); - DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); + TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset); + void* to_host_ptr = GetHostLoc(to, to_offset); lld->Read(from_dev_addr, to_host_ptr, size); + } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { - std::cout << " writing to device" << std::endl; - std::cout << " num_bytes: " << size << std::endl; // Writing to the device. - MicroDevSpace* to_space = static_cast(const_cast(to)); ObjectPtr& session = to_space->session; // flush all pending tasks to ensure data is consistent @@ -124,15 +118,15 @@ class MicroDeviceAPI final : public DeviceAPI { const std::shared_ptr& lld = session->low_level_device(); void* from_host_ptr = GetHostLoc(from, from_offset); - DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); + TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset); lld->Write(to_dev_addr, from_host_ptr, size); + } else { LOG(FATAL) << "Expect copy from/to micro device or between micro device\n"; } } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - std::cout << "[MicroDeviceAPI::StreamSync]" << std::endl; MicroSession::Current()->FlushTaskQueue(); } @@ -153,7 +147,7 @@ class MicroDeviceAPI final : public DeviceAPI { MicroDevSpace* dev_space = static_cast(data); ObjectPtr& session = dev_space->session; session->FreeInSection(SectionKind::kWorkspace, - DevPtr(reinterpret_cast(dev_space->data))); + TargetPtr(reinterpret_cast(dev_space->data))); delete dev_space; } @@ -167,8 +161,8 @@ class MicroDeviceAPI final : public DeviceAPI { } private: - DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { - return DevPtr(reinterpret_cast(dev_space->data) + offset); + TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { + 
return TargetPtr(reinterpret_cast(dev_space->data) + offset); } void* GetHostLoc(const void* ptr, size_t offset) { diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index b27bd923c638..01056deeeb18 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -54,8 +54,8 @@ class MicroModuleNode final : public ModuleNode { * \param binary_path path of the binary to be loaded */ void InitMicroModule(const std::string& binary_path) { - std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl; - std::cout << " start" << std::endl; + // std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl; + // std::cout << " start" << std::endl; session_ = MicroSession::Current(); symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map; } @@ -69,26 +69,26 @@ class MicroModuleNode final : public ModuleNode { class MicroWrappedFunc { public: MicroWrappedFunc(ObjectPtr session, - DevPtr func_ptr) { + TargetPtr func_ptr) { session_ = session; func_ptr_ = func_ptr; } void operator()(TVMArgs args, TVMRetValue* rv) const { - *rv = session_->PushToTaskQueue(func_ptr_, args); + session_->PushToTaskQueue(func_ptr_, args); } private: /*! \brief reference to the session for this function (to keep the session alive) */ ObjectPtr session_; /*! 
\brief offset of the function to be called */ - DevPtr func_ptr_; + TargetPtr func_ptr_; }; PackedFunc MicroModuleNode::GetFunction( const std::string& name, const ObjectPtr& sptr_to_self) { - DevPtr func_ptr; + TargetPtr func_ptr; if (name == tvm::runtime::symbol::tvm_module_main) { if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) { func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main]; diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h index 4e8f7201ad7a..8fff39b61d19 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -23,6 +23,7 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ #define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ +#include #include #include "micro_common.h" @@ -44,7 +45,7 @@ class MicroSectionAllocator { size_(0), capacity_(region.size), word_size_(word_size) { - CHECK_EQ(start_addr_.value().val64 % word_size, 0) + CHECK_EQ(start_addr_.value().uint64() % word_size, 0) << "micro section start not aligned to " << word_size << " bytes"; CHECK_EQ(capacity_ % word_size, 0) << "micro section end not aligned to " << word_size << " bytes"; @@ -60,14 +61,15 @@ class MicroSectionAllocator { * \param alloc_size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - DevPtr Allocate(size_t size) { + TargetPtr Allocate(size_t size) { size_ = UpperAlignValue(size_, word_size_); CHECK(size_ + size < capacity_) - << "cannot alloc " << size << " bytes in section with start_addr " << - start_addr_.cast_to(); - DevPtr alloc_addr = start_addr_ + size_; + << "cannot alloc " << size << " bytes in section \"" + << section_name_ << "\" (start_addr=" << start_addr_.cast_to() + << ", used=" << size_ << ", capacity=" << capacity_ << ")"; + TargetPtr alloc_addr = start_addr_ + size_; size_ += size; - alloc_map_[alloc_addr.value().val64] = size; + 
alloc_map_[alloc_addr.value().uint64()] = size; return alloc_addr; } @@ -76,10 +78,10 @@ class MicroSectionAllocator { * \param offs offset to allocated memory * \note simple allocator scheme, more complex versions will be implemented later */ - void Free(DevPtr addr) { - CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end()) + void Free(TargetPtr addr) { + CHECK(alloc_map_.find(addr.value().uint64()) != alloc_map_.end()) << "freed pointer was never allocated"; - alloc_map_.erase(addr.value().val64); + alloc_map_.erase(addr.value().uint64()); if (alloc_map_.empty()) { size_ = 0; } @@ -88,17 +90,17 @@ class MicroSectionAllocator { /*! * \brief start offset of the memory region managed by this allocator */ - DevPtr start_addr() const { return start_addr_; } + TargetPtr start_addr() const { return start_addr_; } /*! * \brief current end addr of the space being used in this memory region */ - DevPtr curr_end_addr() const { return start_addr_ + size_; } + TargetPtr curr_end_addr() const { return start_addr_ + size_; } /*! * \brief end addr of the memory region managed by this allocator */ - DevPtr max_addr() const { return start_addr_ + capacity_; } + TargetPtr max_addr() const { return start_addr_ + capacity_; } /*! * \brief size of the section @@ -114,7 +116,7 @@ class MicroSectionAllocator { /*! \brief name of the section (for debugging) */ std::string section_name_; /*! \brief start address of the section */ - DevPtr start_addr_; + TargetPtr start_addr_; /*! \brief current size of the section */ size_t size_; /*! 
\brief total storage capacity of the section */ diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index e6cdea5d1c88..309fea45cd31 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -107,7 +108,7 @@ MicroSession::MicroSession( low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr); CHECK_EQ(reinterpret_cast(base_addr) % word_size_, 0) << "base address not aligned to " << word_size_ << " bytes"; - DevPtr curr_addr = DevPtr(reinterpret_cast(base_addr)); + TargetPtr curr_addr = TargetPtr(reinterpret_cast(base_addr)); section_allocators_[0] = std::make_shared( "text", @@ -170,66 +171,56 @@ MicroSession::MicroSession( section_allocators_[0] = std::make_shared( "text", DevMemRegion { - .start = DevPtr(text_start), + .start = TargetPtr(text_start), .size = text_size, }, word_size_); section_allocators_[1] = std::make_shared( "rodata", DevMemRegion { - .start = DevPtr(rodata_start), + .start = TargetPtr(rodata_start), .size = rodata_size, }, word_size_); section_allocators_[2] = std::make_shared( "data", DevMemRegion { - .start = DevPtr(data_start), + .start = TargetPtr(data_start), .size = data_size, }, word_size_); section_allocators_[3] = std::make_shared( "bss", DevMemRegion { - .start = DevPtr(bss_start), + .start = TargetPtr(bss_start), .size = bss_size, }, word_size_); section_allocators_[4] = std::make_shared( "args", DevMemRegion { - .start = DevPtr(args_start), + .start = TargetPtr(args_start), .size = args_size, }, word_size_); section_allocators_[5] = std::make_shared( "heap", DevMemRegion { - .start = DevPtr(heap_start), + .start = TargetPtr(heap_start), .size = heap_size, }, word_size_); section_allocators_[6] = std::make_shared( "workspace", DevMemRegion { - .start = DevPtr(workspace_start), + .start = TargetPtr(workspace_start), .size = workspace_size, }, word_size_); 
section_allocators_[7] = std::make_shared( "stack", DevMemRegion { - .start = DevPtr(stack_start), + .start = TargetPtr(stack_start), .size = stack_size, }, word_size_); } else { LOG(FATAL) << "unsupported micro low-level device"; } - std::cout << "[Memory Layout]" << std::endl; - std::cout << " text (size = " << (section_allocators_[0]->capacity() / 1000.0) << " KB): " << section_allocators_[0]->start_addr().cast_to() << std::endl; - std::cout << " rodata (size = " << (section_allocators_[1]->capacity() / 1000.0) << " KB): " << section_allocators_[1]->start_addr().cast_to() << std::endl; - std::cout << " data (size = " << (section_allocators_[2]->capacity() / 1000.0) << " KB): " << section_allocators_[2]->start_addr().cast_to() << std::endl; - std::cout << " bss (size = " << (section_allocators_[3]->capacity() / 1000.0) << " KB): " << section_allocators_[3]->start_addr().cast_to() << std::endl; - std::cout << " args (size = " << (section_allocators_[4]->capacity() / 1000.0) << " KB): " << section_allocators_[4]->start_addr().cast_to() << std::endl; - std::cout << " heap (size = " << (section_allocators_[5]->capacity() / 1000.0) << " KB): " << section_allocators_[5]->start_addr().cast_to() << std::endl; - std::cout << " workspace (size = " << (section_allocators_[6]->capacity() / 1000.0) << " KB): " << section_allocators_[6]->start_addr().cast_to() << std::endl; - std::cout << " stack (size = " << (section_allocators_[7]->capacity() / 1000.0) << " KB): " << section_allocators_[7]->start_addr().cast_to() << std::endl; - - DevPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr(); + TargetPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr(); batch_args_encoder_.set_start_addr(args_start_addr); runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map; @@ -237,17 +228,17 @@ MicroSession::MicroSession( // Patch pointers to define the bounds of the workspace section and the word // size (for allocation alignment). 
std::shared_ptr ws_allocator = GetAllocator(SectionKind::kWorkspace); - TargetVal ws_start = ws_allocator->start_addr().value(); - TargetVal ws_end = ws_allocator->max_addr().value(); - TargetVal target_word_size { .val64 = word_size_ }; + TargetVal ws_start(word_size_ * 8, ws_allocator->start_addr().value().uint64()); + TargetVal ws_end(word_size_ * 8, ws_allocator->max_addr().value().uint64()); + TargetVal target_word_size(word_size_ * 8, word_size_); if (word_size_ == 4) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint32()); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint32()); + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint32()); } else if (word_size_ == 8) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint64()); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint64()); + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint64()); } } @@ -258,17 +249,16 @@ MicroSession::~MicroSession() { low_level_device_ = nullptr; } -void MicroSession::PushToTaskQueue(DevPtr func_ptr, const TVMArgs& args) { - std::cout << "[MicroSession::PushToTaskQueue]" << std::endl; - std::cout << " pushed func ptr: " << func_ptr.cast_to() << std::endl; +void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) { if (thumb_mode_) { - func_ptr |= 1; + // TODO(areusch): should be |= + func_ptr += 1; } - DevVal func_dev_addr = func_ptr.value(); + TargetVal 
func_dev_addr = func_ptr.value(); - std::tuple arg_field_addrs = EncoderAppend(&batch_args_encoder_, args); - DevVal arg_values_dev_addr = { .val64 = std::get<0>(arg_field_addrs).value() }; - DevVal arg_type_codes_dev_addr = { .val64 = std::get<1>(arg_field_addrs).value() }; + std::tuple arg_field_addrs = EncoderAppend(&batch_args_encoder_, args); + TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).cast_to()}; + TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).cast_to()}; task_queue_.push_back( DevTask { @@ -289,15 +279,15 @@ void MicroSession::FlushTaskQueue() { return; } if (word_size_ == 4) { - FlushTaskQueuePriv(); + FlushTaskQueuePriv(); } else if (word_size_ == 8) { - FlushTaskQueuePriv(); + FlushTaskQueuePriv(); } } template void MicroSession::FlushTaskQueuePriv() { - std::cout << "[MicroSession::FlushTaskQueue]" << std::endl; + // std::cout << "[MicroSession::FlushTaskQueue]" << std::endl; std::vector prepped_tasks; for (const auto& task : task_queue_) { prepped_tasks.push_back(T(task)); @@ -310,102 +300,65 @@ void MicroSession::FlushTaskQueuePriv() { batch_args_encoder_.buf_size()); // Flush `tasks` to device memory. 
- DevPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"]; +// runtime_symbol_map_.Dump(std::cout); + TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"]; low_level_device()->Write( dev_tasks_addr, reinterpret_cast(prepped_tasks.data()), prepped_tasks.size() * sizeof(T)); DevSymbolWrite(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size()); - DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; - DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; + TargetPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; + TargetPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; if (thumb_mode_) { - utvm_init_addr |= 1; + // TODO(areusch): should be |= + utvm_init_addr += 1; } std::chrono::time_point< std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend; tbegin = std::chrono::high_resolution_clock::now(); - // std::cout << " do execution things: "; - // char tmp; - // std::cin >> tmp; + // std::string tmp; + // while (tmp[0] != 'd' && tmp[0] != 'e') { + // std::cout << "How to proceed? [Debug / Execute] "; + // getline(std::cin, tmp); + // CHECK(std::cin.good()) << "Stdin closed"; + // tmp[0] = std::tolower(tmp[0]); + // } + // if (tmp[0] == 'd') { + // std::cout << "Launch debugger; [Enter] to resume automated execution"; + // getline(std::cin, tmp); + // } else { low_level_device()->Execute(utvm_init_addr, utvm_done_addr); + // } tend = std::chrono::high_resolution_clock::now(); // Check if there was an error during execution. If so, log it. 
CheckDeviceError(); - uint32_t task_time = DevSymbolRead(runtime_symbol_map_, "utvm_task_time"); - GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr); - return static_cast(task_time); -} - -BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { - DevMemRegion text_section; - DevMemRegion rodata_section; - DevMemRegion data_section; - DevMemRegion bss_section; - - text_section.size = GetSectionSize( - binary_path, SectionKind::kText, toolchain_prefix_, word_size_); - rodata_section.size = GetSectionSize( - binary_path, SectionKind::kRodata, toolchain_prefix_, word_size_); - data_section.size = GetSectionSize( - binary_path, SectionKind::kData, toolchain_prefix_, word_size_); - bss_section.size = GetSectionSize( - binary_path, SectionKind::kBss, toolchain_prefix_, word_size_); - - text_section.start = AllocateInSection(SectionKind::kText, text_section.size); - rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); - data_section.start = AllocateInSection(SectionKind::kData, data_section.size); - bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); - CHECK(text_section.start != nullptr && rodata_section.start != nullptr && - data_section.start != nullptr && bss_section.start != nullptr) - << "not enough space to load module on device"; - - std::string relocated_bin = RelocateBinarySections( - binary_path, - word_size_, - text_section.start, - rodata_section.start, - data_section.start, - bss_section.start, - GetAllocator(SectionKind::kStack)->max_addr(), - toolchain_prefix_); - std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); - std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); - std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_); - std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_); - - 
low_level_device_->Write(text_section.start, &text_contents[0], text_section.size); - low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); - low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); - low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); - SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; - - if (patch_dylib_pointers) { - // Patch device lib pointers. - PatchImplHole(symbol_map, "TVMBackendAllocWorkspace"); - PatchImplHole(symbol_map, "TVMBackendFreeWorkspace"); - PatchImplHole(symbol_map, "TVMAPISetLastError"); - } if (use_device_timer_) { uint64_t sum = 0; std::vector times; times.resize(task_queue_.size()); - low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t)); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], + times.data(), + task_queue_.size() * sizeof(uint32_t)); + int i = 0; for (uint32_t time : times) { + LOG(INFO) << "Time " << i++ << ": " << time; sum += time; } - last_batch_time_ += static_cast(sum); + last_batch_time_ += static_cast(sum) / 1e3; } else { last_batch_time_ += std::chrono::duration_cast > (tend - tbegin).count() * 1000; - // TODO fukn hack + // TODO(weberlo): Reading internal data structure is hacky. 
uint64_t sum = 0; std::vector times; times.resize(task_queue_.size()); - low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], times.data(), task_queue_.size() * sizeof(uint32_t)); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], + times.data(), + task_queue_.size() * sizeof(uint32_t)); for (uint32_t time : times) { sum += time; } @@ -417,7 +370,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d } BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { - std::cout << "[MicroSession::LoadBinary]" << std::endl; DevMemRegion text_section; DevMemRegion rodata_section; DevMemRegion data_section; @@ -431,10 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d binary_path, SectionKind::kData, toolchain_prefix_, word_size_); bss_section.size = GetSectionSize( binary_path, SectionKind::kBss, toolchain_prefix_, word_size_); - std::cout << " text_section.size: " << text_section.size << std::endl; - std::cout << " rodata_section.size: " << rodata_section.size << std::endl; - std::cout << " data_section.size: " << data_section.size << std::endl; - std::cout << " bss_section.size: " << bss_section.size << std::endl; text_section.start = AllocateInSection(SectionKind::kText, text_section.size); rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); @@ -480,9 +428,8 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d }; } -std::tuple MicroSession::EncoderAppend( +std::tuple MicroSession::EncoderAppend( TargetDataLayoutEncoder* encoder, const TVMArgs& args) { - std::cout << "[MicroSession::EncoderAppend(TVMArgs)]" << std::endl; const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -529,7 +476,7 @@ std::tuple MicroSession::EncoderAppend( } template -DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) { +TargetPtr 
MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) { auto tvm_arr_slot = encoder->Alloc(); auto shape_slot = encoder->Alloc(arr.ndim); @@ -537,8 +484,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen // the device first. The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. shape_slot.WriteArray(arr.shape, arr.ndim); - DevPtr shape_dev_addr = shape_slot.start_addr(); - DevPtr strides_dev_addr = DevPtr(nullptr); + TargetPtr shape_dev_addr = shape_slot.start_addr(); + TargetPtr strides_dev_addr = TargetPtr(nullptr); if (arr.strides != nullptr) { auto stride_slot = encoder->Alloc(arr.ndim); stride_slot.WriteArray(arr.strides, arr.ndim); @@ -546,13 +493,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen } T dev_arr( - TargetVal { .val64 = reinterpret_cast(arr.data) }, + TargetVal { word_size_ * 8, reinterpret_cast(arr.data) }, arr.ctx, arr.ndim, arr.dtype, shape_dev_addr.value(), strides_dev_addr.value(), - TargetVal { .val64 = arr.byte_offset }); + TargetVal { word_size_ * 8, arr.byte_offset }); CHECK(dev_arr.ctx.device_type == static_cast(kDLMicroDev)) << "attempt to write DLTensor with non-micro device type"; // Update the device type to CPU, because from the microcontroller's @@ -575,7 +522,7 @@ void MicroSession::CheckDeviceError() { return; } std::string err_msg; - switch(last_error) { + switch (last_error) { case UTVM_ERR_NOT_FINISHED: err_msg = "execution timed out"; break; @@ -583,7 +530,7 @@ void MicroSession::CheckDeviceError() { err_msg = "timer is not implemented for the target device"; break; case UTVM_ERR_TIMER_OVERFLOW: - // TODO this should be remedied by using interrupts to accumulate the + // TODO(weberlo): this should be remedied by using interrupts to accumulate the // timer into a larger datatype (ARM timers are only 24 bits) err_msg = "timer overflowed during execution"; break; @@ -616,20 
+563,20 @@ void MicroSession::CheckDeviceError() { } void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) { - DevPtr runtime_impl_addr = runtime_symbol_map_[func_name]; + TargetPtr runtime_impl_addr = runtime_symbol_map_[func_name]; if (thumb_mode_) { runtime_impl_addr += 1; } std::ostringstream func_name_underscore; func_name_underscore << func_name << "_"; if (word_size_ == 4) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32); + DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint32()); } else if (word_size_ == 8) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64); + DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint64()); } } -std::string MicroSession::ReadString(DevPtr str_addr) { +std::string MicroSession::ReadString(TargetPtr str_addr) { std::ostringstream result; const size_t buf_size = 256; std::vector buf(buf_size, 0); @@ -647,29 +594,17 @@ std::string MicroSession::ReadString(DevPtr str_addr) { return result.str(); } -DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { - if (type == SectionKind::kHeap) { - std::cout << "[MicroSession::AllocateInSection(Heap)]" << std::endl; - std::cout << " allocating " << std::dec << size << " hex=" << (void*) size << " bytes" << std::endl; - } - DevPtr result = GetAllocator(type)->Allocate(size); - if (type == SectionKind::kHeap) { - std::cout << " allocated at addr " << result.cast_to() << std::endl; - } - return result; +TargetPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { + return GetAllocator(type)->Allocate(size); } -void MicroSession::FreeInSection(SectionKind type, DevPtr addr) { - if (type == SectionKind::kHeap) { - std::cout << "[MicroSession::FreeInSection]" << std::endl; - std::cout << " freeing alloc at addr " << addr.cast_to() << std::endl; - } +void 
MicroSession::FreeInSection(SectionKind type, TargetPtr addr) { return GetAllocator(type)->Free(addr); } template T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { - DevPtr sym_addr = symbol_map[symbol]; + TargetPtr sym_addr = symbol_map[symbol]; T result; low_level_device()->Read(sym_addr, &result, sizeof(T)); return result; @@ -679,7 +614,7 @@ template void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, const std::string& symbol, const T& value) { - DevPtr sym_addr = symbol_map[symbol]; + TargetPtr sym_addr = symbol_map[symbol]; low_level_device()->Write(sym_addr, &value, sizeof(T)); } @@ -694,12 +629,12 @@ PackedFunc MicroSession::GetFunction( return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { MicroSession::ExitWithScope(); }); - // TODO add a `clear_batch_timer` func + // TODO(weberlo): add a `clear_batch_timer` func } else if (name == "get_last_batch_time") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetLastBatchTime(); }); - // TODO remove this func + // TODO(weberlo): remove this func } else if (name == "get_last_batch_cycles") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetLastBatchCycles(); @@ -716,22 +651,22 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") const std::string& binary_path = args[1]; const std::string& toolchain_prefix = args[2]; uint64_t text_start = args[3]; - size_t text_size = args[4]; + size_t text_size = uint64_t(args[4]); uint64_t rodata_start = args[5]; - size_t rodata_size = args[6]; + size_t rodata_size = uint64_t(args[6]); uint64_t data_start = args[7]; - size_t data_size = args[8]; + size_t data_size = uint64_t(args[8]); uint64_t bss_start = args[9]; - size_t bss_size = args[10]; + size_t bss_size = uint64_t(args[10]); uint64_t args_start = args[11]; - size_t args_size = args[12]; + size_t args_size = uint64_t(args[12]); uint64_t heap_start = args[13]; - size_t heap_size = args[14]; + 
size_t heap_size = uint64_t(args[14]); uint64_t workspace_start = args[15]; - size_t workspace_size = args[16]; + size_t workspace_size = uint64_t(args[16]); uint64_t stack_start = args[17]; - size_t stack_size = args[18]; - size_t word_size = args[19]; + size_t stack_size = uint64_t(args[18]); + size_t word_size = uint64_t(args[19]); bool thumb_mode = args[20]; bool use_device_timer = args[21]; const std::string& server_addr = args[22]; diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 00e5bcdd7e01..f7b05ae211e9 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -143,7 +143,7 @@ class MicroSession : public ModuleNode { * \param args args to the packed function * \return elapsed time during function execution on the device */ - double PushToTaskQueue(DevPtr func, const TVMArgs& args); + void PushToTaskQueue(TargetPtr func, const TVMArgs& args); /*! * \brief serialize runtime metadata to the device for enqueued tasks and execute @@ -171,21 +171,21 @@ class MicroSession : public ModuleNode { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - DevPtr AllocateInSection(SectionKind type, size_t size); + TargetPtr AllocateInSection(SectionKind type, size_t size); /*! * \brief free prior allocation from section * \param type type of section to allocate in * \param addr device address of allocated memory */ - void FreeInSection(SectionKind type, DevPtr addr); + void FreeInSection(SectionKind type, TargetPtr addr); /*! * \brief read string from device to host * \param str_addr device address of first character of string * \return host copy of device string that was read */ - std::string ReadString(DevPtr str_addr); + std::string ReadString(TargetPtr str_addr); /*! * \brief read value of symbol from device memory @@ -234,8 +234,6 @@ class MicroSession : public ModuleNode { /*! 
\brief array of memory allocators for each on-device section */ std::shared_ptr section_allocators_[static_cast(SectionKind::kNumKinds)]; - /*! \brief total number of bytes of usable device memory for this session */ - size_t memory_size_; /*! \brief number of bytes in a word on the target device */ size_t word_size_; /*! \brief whether the target device requires a thumb-mode bit on function addresses @@ -272,7 +270,8 @@ class MicroSession : public ModuleNode { * \param args args to be appended * \return device address of the allocated args */ - std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); + std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, + const TVMArgs& args); /*! * \brief appends a `DLTensor` to the host-side buffer of `encoder` @@ -335,15 +334,15 @@ struct TVMArray32 { TargetVal shape, TargetVal strides, TargetVal byte_offset) - : data(data.val32), + : data(data.uint32()), ctx(ctx), ndim(ndim), pad0(0), dtype(dtype), - shape(shape.val32), - strides(strides.val32), + shape(shape.uint32()), + strides(strides.uint32()), pad1(0), - byte_offset(byte_offset.val32), + byte_offset(byte_offset.uint32()), pad2(0) { } /*! @@ -385,14 +384,14 @@ struct TVMArray64 { TargetVal shape, TargetVal strides, TargetVal byte_offset) - : data(data.val64), + : data(data.uint64()), ctx(ctx), ndim(ndim), pad0(0), dtype(dtype), - shape(shape.val64), - strides(strides.val64), - byte_offset(byte_offset.val64) { } + shape(shape.uint64()), + strides(strides.uint64()), + byte_offset(byte_offset.uint64()) { } /*! * \brief The opaque data pointer points to the allocated data. * This will be CUDA device pointer or cl_mem handle in OpenCL. @@ -421,11 +420,11 @@ struct TVMArray64 { /*! \brief MicroTVM task to store in task queue before specializing to word size */ struct DevTask { /*! \brief Pointer to function to call for this task */ - DevVal func; + TargetVal func; /*! 
\brief Array of argument values */ - DevVal arg_values; + TargetVal arg_values; /*! \brief Array of type codes for each argument value */ - DevVal arg_type_codes; + TargetVal arg_type_codes; /*! \brief Number of arguments */ int32_t num_args; }; @@ -433,9 +432,9 @@ struct DevTask { /*! \brief MicroTVM task for serialization to 32-bit devices */ typedef struct StructUTVMTask32 { StructUTVMTask32(DevTask task) - : func(task.func.val32), - arg_values(task.arg_values.val32), - arg_type_codes(task.arg_type_codes.val32), + : func(task.func.uint32()), + arg_values(task.arg_values.uint32()), + arg_type_codes(task.arg_type_codes.uint32()), num_args(task.num_args) { } /*! \brief Pointer to function to call for this task */ @@ -451,9 +450,9 @@ typedef struct StructUTVMTask32 { /*! \brief MicroTVM task for serialization to 64-bit devices */ typedef struct StructUTVMTask64 { StructUTVMTask64(DevTask task) - : func(task.func.val64), - arg_values(task.arg_values.val64), - arg_type_codes(task.arg_type_codes.val64), + : func(task.func.uint64()), + arg_values(task.arg_values.uint64()), + arg_type_codes(task.arg_type_codes.uint64()), num_args(task.num_args) { } /*! 
\brief Pointer to function to call for this task */ diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index e5c83e590c36..d1f279f6120f 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { socket_.SendCommand(); } - void Read(DevPtr addr, void* buf, size_t num_bytes) { + void Read(TargetPtr addr, void* buf, size_t num_bytes) { if (num_bytes == 0) { return; } @@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } { - socket_.cmd_builder() << "ocd_echo $output"; + socket_.cmd_builder() << "return $output"; socket_.SendCommand(); const std::string& reply = socket_.last_reply(); @@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Write(DevPtr addr, const void* buf, size_t num_bytes) { + void Write(TargetPtr addr, const void* buf, size_t num_bytes) { if (num_bytes == 0) { return; } @@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { + void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) { socket_.cmd_builder() << "halt 0"; socket_.SendCommand(); @@ -210,9 +210,9 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { // NOTE: OpenOCD will call any request larger than this constant an "absurd // request". /*! \brief maximum number of bytes allowed in a single memory transfer */ - static const constexpr ssize_t kMemTransferLimit = 64000; + static const constexpr ssize_t kMemTransferLimit = 8000; /*! 
\brief number of milliseconds to wait for function execution to halt */ - static const constexpr int kWaitTime = 10000; + static const constexpr int kWaitTime = 30000; }; const std::shared_ptr OpenOCDLowLevelDeviceCreate(const std::string& server_addr, diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 0abd269eedd4..ea682e261348 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -50,7 +50,7 @@ class TargetDataLayoutEncoder { * \param size size (in bytes) of the memory region allocated for this slot * \param start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, TargetPtr start_addr); ~Slot(); @@ -71,7 +71,7 @@ class TargetDataLayoutEncoder { * \brief returns start address of the slot in device memory * \return device start address */ - DevPtr start_addr(); + TargetPtr start_addr(); /*! * \brief returns number of bytes allocated for this slot @@ -89,7 +89,7 @@ class TargetDataLayoutEncoder { /*! \brief size (in bytes) of the memory region allocated for this slot */ size_t size_; /*! \brief start address of the slot in the device's memory */ - DevPtr start_addr_; + TargetPtr start_addr_; }; /*! @@ -139,17 +139,14 @@ class TargetDataLayoutEncoder { return buf_.size(); } - /*! 
- * \brief TODO - */ - DevPtr start_addr() const { - CHECK_NE(start_addr_.value().val64, 0) << "start addr uninitialized"; + TargetPtr start_addr() const { + CHECK_NE(start_addr_.value().uint64(), 0) << "start addr uninitialized"; return start_addr_; } - void set_start_addr(DevPtr start_addr) { + void set_start_addr(TargetPtr start_addr) { CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty"; - start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_)); + start_addr_ = TargetPtr(UpperAlignValue(start_addr.value().uint64(), word_size_)); } private: @@ -158,7 +155,7 @@ class TargetDataLayoutEncoder { /*! \brief current offset */ size_t curr_offset_; /*! \brief start address of the encoder in device memory */ - DevPtr start_addr_; + TargetPtr start_addr_; /*! \brief TODO */ size_t capacity_; /*! \brief number of bytes in a word on the target device */ @@ -169,7 +166,7 @@ template TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, - DevPtr start_addr) + TargetPtr start_addr) : parent_(parent), start_offset_(start_offset), curr_offset_(0), @@ -178,8 +175,10 @@ TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, template TargetDataLayoutEncoder::Slot::~Slot() { - // TODO this can mask the exception thrown by slot allocation... even though that doesn't make sense. - CHECK(curr_offset_ == size_) << "unwritten space in slot"; + // TODO(areusch): this can mask the exception thrown by slot allocation... even though that + // doesn't make sense. 
+ CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset=" + << curr_offset_ << ", size=" << size_; } template @@ -198,7 +197,7 @@ void TargetDataLayoutEncoder::Slot::WriteValue(const T& val) { } template -DevPtr TargetDataLayoutEncoder::Slot::start_addr() { +TargetPtr TargetDataLayoutEncoder::Slot::start_addr() { return start_addr_; } diff --git a/src/runtime/micro/tcl_socket.cc b/src/runtime/micro/tcl_socket.cc index d0bb6007f815..24abe42f786f 100644 --- a/src/runtime/micro/tcl_socket.cc +++ b/src/runtime/micro/tcl_socket.cc @@ -42,9 +42,8 @@ void TclSocket::Connect(tvm::support::SockAddr addr) { } void TclSocket::SendCommand() { - //std::cout << "[TclSocket::SendCommand]" << std::endl; - //std::cout << " cmd: " << cmd_builder_.str() << std::endl; - cmd_builder_ << kCommandTerminateToken; + const char terminate_token = kCommandTerminateToken; + cmd_builder_ << terminate_token; std::string full_cmd = cmd_builder_.str(); CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1) diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 7556c4915b9c..9b5612efc690 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -36,43 +36,13 @@ #include #include "rpc_session.h" #include "../object_internal.h" -#include "../../common/ring_buffer.h" -#include "../../common/socket.h" +#include "../../support/ring_buffer.h" +#include "../../support/socket.h" #include "../micro/micro_session.h" namespace tvm { namespace runtime { -std::string RPCCodeToString(RPCCode code) { - switch (code) { - case RPCCode::kNone: return "None"; - case RPCCode::kCallFunc: return "CallFunc"; - case RPCCode::kReturn: return "Return"; - case RPCCode::kException: return "Exception"; - case RPCCode::kShutdown: return "Shutdown"; - case RPCCode::kCopyFromRemote: return "CopyFromRemote"; - case RPCCode::kCopyToRemote: return "CopyToRemote"; - case RPCCode::kCopyAck: return "CopyAck"; - case RPCCode::kSystemFuncStart: return 
"SystemFuncStart"; - case RPCCode::kGetGlobalFunc: return "GetGlobalFunc"; - case RPCCode::kGetTimeEvaluator: return "GetTimeEvaluator"; - case RPCCode::kFreeFunc: return "FreeFunc"; - case RPCCode::kDevSetDevice: return "DevSetDevice"; - case RPCCode::kDevGetAttr: return "DevGetAttr"; - case RPCCode::kDevAllocData: return "DevAllocData"; - case RPCCode::kDevFreeData: return "DevFreeData"; - case RPCCode::kDevStreamSync: return "DevStreamSync"; - case RPCCode::kCopyAmongRemote: return "CopyAmongRemote"; - case RPCCode::kModuleLoad: return "ModuleLoad"; - case RPCCode::kModuleImport: return "ModuleImport"; - case RPCCode::kModuleFree: return "ModuleFree"; - case RPCCode::kModuleGetFunc: return "ModuleGetFunc"; - case RPCCode::kModuleGetSource: return "ModuleGetSource"; - case RPCCode::kNDArrayFree: return "NDArrayFree"; - default: CHECK(false) << "invalid RPC code"; - } -} - // Temp buffer for data array struct RPCByteArrayBuffer { TVMByteArray arr; @@ -929,12 +899,6 @@ void RPCSession::Init() { &reader_, &writer_, table_index_, name_, &remote_key_); // Quick function to call remote. 
call_remote_ = PackedFunc([this](TVMArgs args, TVMRetValue* rv) { - std::cout << "[RPCSession::call_remote_]" << std::endl; - if (args.type_codes[0] == kTVMContext) { - const TVMContext ctx = args[0]; - std::cout << " ctx.device_type: " << ctx.device_type << std::endl; - std::cout << " ctx.device_id: " << ctx.device_id << std::endl; - } handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true); RPCCode code = HandleUntilReturnEvent(rv, true, nullptr); CHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); @@ -1117,10 +1081,7 @@ void RPCDevSetDevice(TVMArgs args, TVMRetValue *rv) { } void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) { - std::cout << "[RPCDevGetAttr]" << std::endl; TVMContext ctx = args[0]; - std::cout << " ctx.device_type: " << ctx.device_type << std::endl; - std::cout << " ctx.device_id: " << ctx.device_id << std::endl; DeviceAttrKind kind = static_cast(args[1].operator int()); if (kind == kExist) { DeviceAPI* api = DeviceAPI::Get(ctx, true); @@ -1136,10 +1097,7 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) { } void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) { - std::cout << "[RPCDevAllocData]" << std::endl; TVMContext ctx = args[0]; - std::cout << " ctx.device_type: " << ctx.device_type << std::endl; - std::cout << " ctx.device_id: " << ctx.device_id << std::endl; uint64_t nbytes = args[1]; uint64_t alignment = args[2]; DLDataType type_hint = args[3]; @@ -1161,18 +1119,13 @@ void RPCDevStreamSync(TVMArgs args, TVMRetValue *rv) { } void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) { - std::cout << "[RPCCopyAmongRemote]" << std::endl; void* from = args[0]; uint64_t from_offset = args[1]; void* to = args[2]; uint64_t to_offset = args[3]; uint64_t size = args[4]; TVMContext ctx_from = args[5]; - std::cout << " ctx_from.device_type: " << ctx_from.device_type << std::endl; - std::cout << " ctx_from.device_id: " << ctx_from.device_type << std::endl; TVMContext ctx_to = args[6]; - std::cout << " 
ctx_to.device_type: " << ctx_to.device_type << std::endl; - std::cout << " ctx_to.device_id: " << ctx_to.device_type << std::endl; DLDataType type_hint = args[7]; TVMStreamHandle stream = args[8]; TVMContext ctx = ctx_from; @@ -1183,12 +1136,10 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) { ctx_to.device_type == ctx_from.device_type) << "Can not copy across different ctx types directly"; } - std::cout << " before CopyDataFromTo" << std::endl; DeviceAPI::Get(ctx)->CopyDataFromTo( from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, stream); - std::cout << " after CopyDataFromTo" << std::endl; } void RPCModuleLoad(TVMArgs args, TVMRetValue *rv) { @@ -1250,7 +1201,6 @@ void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { } void RPCSession::EventHandler::HandlePackedCall() { - std::cout << "[RPCSession::EventHandler::HandlePackedCall]" << std::endl; CHECK_EQ(pending_request_bytes_, 0U); if (code_ == RPCCode::kReturn) { state_ = kReturnReceived; return; @@ -1259,7 +1209,6 @@ void RPCSession::EventHandler::HandlePackedCall() { state_ = kRecvCode; this->RequestBytes(sizeof(RPCCode)); // Event handler sit at clean state at this point. 
- std::cout << " RPC code is " << static_cast(code_) << "(" << RPCCodeToString(code_) << ")" << std::endl; switch (code_) { case RPCCode::kCallFunc: { PackedFunc* pf = reinterpret_cast(call_handle_); @@ -1274,7 +1223,6 @@ void RPCSession::EventHandler::HandlePackedCall() { std::ostringstream os; os << "Except caught from RPC call: " << arg_buf_->value[0].v_str; arg_buf_.reset(); - std::cout << os.str() << std::endl; throw dmlc::Error(os.str()); break; } @@ -1302,10 +1250,10 @@ void RPCSession::EventHandler::HandlePackedCall() { PackedFunc MicroTimeEvaluator( PackedFunc pf, TVMContext ctx, - int number, + size_t number, int repeat, int min_repeat_ms) { - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { + auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { TVMRetValue temp; std::ostringstream os; @@ -1313,13 +1261,12 @@ PackedFunc MicroTimeEvaluator( // start timing CHECK(number < MicroSession::kTaskQueueCapacity) << "`number` must be less than uTVM task queue capacity"; - for (int i = 0; i < number; ++i) { + for (unsigned int j = 0; j < number; ++j) { pf.CallPacked(args, &temp); } ObjectPtr session = MicroSession::Current(); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); double time_per_batch = session->GetLastBatchTime() / number; - std::cout << "LAST AVERAGE BATCH TIME WAS " << time_per_batch << std::endl; os.write(reinterpret_cast(&time_per_batch), sizeof(time_per_batch)); } std::string blob = os.str(); @@ -1337,12 +1284,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, int number, int repeat, int min_repeat_ms) { - std::cout << "[WrapTimeEvaluator]" << std::endl; if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { - std::cout << " USING MICRO TIME EVAL" << std::endl; return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms); } - std::cout << " USING NORMAL TIME EVAL" << std::endl; auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) 
mutable { TVMRetValue temp; diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 1bb75357e030..db63be4be74d 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -63,31 +63,31 @@ struct RPCArgBuffer; /*! \brief The RPC code */ enum class RPCCode : int { - kNone = 0, - kCallFunc = 1, - kReturn = 2, - kException = 3, - kShutdown = 4, - kCopyFromRemote = 5, - kCopyToRemote = 6, - kCopyAck = 7, + kNone, + kCallFunc, + kReturn, + kException, + kShutdown, + kCopyFromRemote, + kCopyToRemote, + kCopyAck, // The following are code that can send over CallRemote - kSystemFuncStart = 8, - kGetGlobalFunc = 9, - kGetTimeEvaluator = 10, - kFreeFunc = 11, - kDevSetDevice = 12, - kDevGetAttr = 13, - kDevAllocData = 14, - kDevFreeData = 15, - kDevStreamSync = 16, - kCopyAmongRemote = 17, - kModuleLoad = 18, - kModuleImport = 19, - kModuleFree = 20, - kModuleGetFunc = 21, - kModuleGetSource = 22, - kNDArrayFree = 23 + kSystemFuncStart, + kGetGlobalFunc, + kGetTimeEvaluator, + kFreeFunc, + kDevSetDevice, + kDevGetAttr, + kDevAllocData, + kDevFreeData, + kDevStreamSync, + kCopyAmongRemote, + kModuleLoad, + kModuleImport, + kModuleFree, + kModuleGetFunc, + kModuleGetSource, + kNDArrayFree }; /*! 
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 0c4404f515c0..adb84e498e5d 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -840,12 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) { const VarNode* v = op->node.as(); CHECK(v); volatile_buf_.insert(v); - } else if (op->attr_key == ir::attr::pragma_import_c) { - const StringImm* value = op->value.as(); + } else if (op->attr_key == tir::attr::pragma_import_c) { + const StringImmNode* value = op->value.as(); CHECK(value != nullptr); decl_stream << value->value; - //this->HandleImport(value->value); - //this->VisitStmt(op->body); } this->PrintStmt(op->body); } diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index c356ac816957..4cca1efea2ca 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -20,25 +20,25 @@ /*! * \file codegen_c_host.cc */ -#include +#include "codegen_c_host.h" #include #include -#include "codegen_c_host.h" +#include "tvm/target/codegen.h" #include "../build_common.h" namespace tvm { namespace codegen { -// TODO rename to CodeGenCMicro? 
CodeGenCHost::CodeGenCHost() { module_name_ = GetUniqueName("__tvm_module_ctx"); } void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) { emit_asserts_ = emit_asserts; + declared_globals_.clear(); decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - decl_stream << "extern void* " << module_name_ << " = NULL;\n"; + decl_stream << "void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); } @@ -183,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT( int64_t num_args = end - begin; CHECK_GE(num_args, 0); std::string func_name = s->value; - std::string packed_func_name = GetUniqueName(func_name + "_packed"); - decl_stream << "static void* " << packed_func_name << " = NULL;\n"; + // NOTE: cannot rely on GetUnique for global decl_stream declarations + // because it is reset between AddFunction(). + std::string packed_func_name = func_name + "_packed"; + if (declared_globals_.insert(packed_func_name).second) { + // Still reserve the name among unique names. 
+ CHECK(GetUniqueName(packed_func_name) == packed_func_name) << + "Expected name " << packed_func_name << " to not be taken"; + decl_stream << "static void* " << packed_func_name << " = NULL;\n"; + } this->PrintGetFuncFromBackend(func_name, packed_func_name); this->PrintFuncCall(packed_func_name, num_args); } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { @@ -242,7 +249,7 @@ runtime::Module BuildCHost(IRModule mod) { CodeGenCHost cg; cg.Init(output_ssa, emit_asserts); - for (auto kv : mod->functions) { + for (auto kv : mod->functions) { CHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); @@ -255,7 +262,7 @@ runtime::Module BuildCHost(IRModule mod) { TVM_REGISTER_GLOBAL("target.build.c") .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildCHost(args[0]); - }); + *rv = BuildCHost(args[0]); +}); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index f854b7abf68c..bec96860ad47 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -24,9 +24,10 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ #define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ -#include -#include +#include #include +#include "tvm/target/codegen.h" +#include "tvm/tir/expr.h" #include "codegen_c.h" namespace tvm { @@ -36,8 +37,6 @@ class CodeGenCHost final : public CodeGenC { public: CodeGenCHost(); void Init(bool output_ssa, bool emit_asserts); - void AddFunction(LoweredFunc f); - std::string Finish(); void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) void PrintFuncPrefix() final; // NOLINT(*) @@ -55,6 +54,8 @@ class CodeGenCHost final : public CodeGenC { private: std::string module_name_; + /* \brief tracks declared global variables which live despite GetUniqueName */ + std::set declared_globals_; /*! 
\brief whether to emit asserts in the resulting C code */ bool emit_asserts_; diff --git a/src/target/target.cc b/src/target/target.cc index a72ce1c5b3e4..2cb72a285a41 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name, t->keys_array.push_back("hexagon"); t->device_type = kDLHexagon; } else { - LOG(ERROR) << "Unknown target name " << target_name; + LOG(ERROR) << "Unknown target name " << target_name << "; falling back to stackvm"; return target::stackvm(); } diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 1ab324e50228..448f53c57f3c 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -26,9 +26,9 @@ from tvm.relay.testing import resnet # # Use the host emulated micro device. -# DEV_CONFIG_A = micro.device.host.generate_config() -# DEV_CONFIG_B = micro.device.host.generate_config() -# TARGET = 'c -device=micro_dev' +DEV_CONFIG_A = micro.device.host.generate_config() +DEV_CONFIG_B = micro.device.host.generate_config() +TARGET = 'c -device=micro_dev' # # TODO why do spike examples have memory that starts at 0x10000000, but you # # should set the base addr as 0x10010000? should somehow help the user to be @@ -41,9 +41,9 @@ # DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667) # TARGET = 'c -device=micro_dev' -DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666) -DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667) -TARGET = 'c -device=micro_dev' +# DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666) +# DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667) +# TARGET = 'c -device=micro_dev' def relay_micro_build(func, dev_config, params=None): """Create a graph runtime module with a micro device context from a Relay function. 
@@ -64,11 +64,10 @@ def relay_micro_build(func, dev_config, params=None): mod : tvm.runtime.Module graph runtime module for the target device """ - disable_vectorize = tvm.build_config(disable_vectorize=True) + disable_vectorize = tvm.target.build_config(disable_vectorize=True) disable_fusion = relay.build_config(disabled_pass={'FuseOps'}) with disable_vectorize, disable_fusion: graph, c_mod, params = relay.build(func, target=TARGET, params=params) - print(c_mod.get_source()) micro_mod = micro.create_micro_mod(c_mod, dev_config) ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_mod, ctx) @@ -83,6 +82,7 @@ def relay_micro_build(func, dev_config, params=None): break UTVMDone """ + def reset_gdbinit(): if 'server_port' not in DEV_CONFIG_A: return @@ -210,7 +210,7 @@ def test_graph_runtime(): def test_conv2d(): - if not tvm.module.enabled("micro_dev"): + if not tvm.runtime.enabled("micro_dev"): return from tvm.relay import create_executor @@ -230,19 +230,27 @@ def test_conv2d(): padding=(1, 1), channels=4) func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr) - mod = relay.Module.from_expr(func) + mod = tvm.IRModule.from_expr(func) mod = transform.InferType()(mod) x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape)) w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape)) out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape)) - with tvm.build_config(disable_vectorize=True): + with tvm.target.build_config(disable_vectorize=True): graph, c_mod, params = relay.build(mod, target="c") with micro.Session(DEV_CONFIG_A): micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A) - micro_func = micro_mod[func_name] + candidate_func_name = func_name + for i in range(100): + try: + micro_func = micro_mod[candidate_func_name] + break + except tvm.TVMError as e: + candidate_func_name = f'{func_name}_{i}' + else: + assert False ctx = tvm.micro_dev(0) x_data = 
tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx) @@ -253,43 +261,11 @@ def test_conv2d(): out_data = np.zeros(out_shape, dtype=dtype) params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() } intrp = create_executor('debug') - expected_result = intrp.evaluate(mod['main'])(x_data, w_data).data + expected_result = intrp.evaluate(mod['main'])(x_data, w_data) tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy()) -def test_multiple_modules(): - """Test loading multiple modules on the device simultaneously.""" - if not tvm.runtime.enabled("micro_dev"): - return - shape = (1024,) - dtype = "float32" - - # Construct Relay add program. - x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) - ret = relay.add(x, relay.const(1.0)) - add_const_func = relay.Function([x], ret) - # Construct Relay subtract program. - x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) - ret = relay.subtract(x, relay.const(1.0)) - sub_const_func = relay.Function([x], ret) - - with micro.Session(DEV_CONFIG_A): - add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A) - sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG_A) - - x_in = np.random.uniform(size=shape[0]).astype(dtype) - add_const_mod.run(x=x_in) - add_result = add_const_mod.get_output(0).asnumpy() - sub_const_mod.run(x=x_in) - sub_result = sub_const_mod.get_output(0).asnumpy() - - tvm.testing.assert_allclose( - add_result, x_in + 1.0) - tvm.testing.assert_allclose( - sub_result, x_in - 1.0) - - def test_interleave_sessions(): """Test closing and reopening sessions.""" if not tvm.runtime.enabled("micro_dev"): diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index eb05dd839e32..e121fbc7ec6d 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -25,3 +25,4 @@ from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * +from . 
import cortex_m7 diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 25b338e06b5f..df63ae3e9e59 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -31,6 +31,7 @@ conv2d_spatial_pack_nhwc, \ schedule_conv2d_spatial_pack_nchw, \ schedule_conv2d_spatial_pack_nhwc +from .cortex_m7.conv2d import direct_simd @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -425,3 +426,15 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + +@autotvm.register_topi_compute("conv2d_direct_simd.arm_cpu") +def conv2d_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with SIMD (v7e-m).""" + return direct_simd.conv2d_direct_simd_compute( + cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv2d_direct_simd.arm_cpu") +def schedule_conv2d_direct_simd(cfg, outs): + """Create schedule for conv2d_direct_simd""" + return direct_simd.conv2d_direct_simd_nhwc_schedule(cfg, outs) diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index 9f9785e834d7..b4f8e7cf836d 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ -74,8 +74,7 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, [n, co, oh, ow, ci, kh, kw, vc, vh, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') - #cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') - cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll') + cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') # fallback support if cfg.is_fallback: @@ -153,7 +152,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, cfg["ann_reduce"].apply(s, conv, [kh, kw], axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)], - 
max_unroll=16, + max_unroll=None, cfg=cfg) cfg["ann_spatial"].apply(s, conv, [vh, vw, vc], axis_lens=[cfg['tile_oh'].size[-1], @@ -178,14 +177,14 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, cfg=cfg) s[conv].compute_at(s[last], ow) - ## mark parallel - #s[last].parallel(co) + # mark parallel + s[last].parallel(co) if data_vec.op.name == 'data_vec_undilated': _, h, _, _, _, _, _, _ = s[data_vec].op.axis else: _, h, _, _, _, _ = s[data_vec].op.axis - #s[data_vec].parallel(h) + s[data_vec].parallel(h) if kernel_vec.op.name == 'kernel_vec': co, _, _, _, _ = s[kernel_vec].op.axis @@ -194,12 +193,10 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, # this part to make tuning records correct s[kernel_vec].pragma(co, 'debug_skip_region') else: - #s[kernel_vec].parallel(co) - pass + s[kernel_vec].parallel(co) elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose - #co, _, _, _, _ = s[kernel_vec].op.axis - #s[kernel_vec].parallel(co) - pass + co, _, _, _, _ = s[kernel_vec].op.axis + s[kernel_vec].parallel(co) return s diff --git a/topi/python/topi/arm_cpu/cortex_m7/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/__init__.py new file mode 100644 index 000000000000..631c5f7ff447 --- /dev/null +++ b/topi/python/topi/arm_cpu/cortex_m7/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Schedules specialized for cortex-m7.""" + + +from . import conv2d diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py new file mode 100644 index 000000000000..cc4faf97b126 --- /dev/null +++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Conv2d implementations for cortex-m7.""" + +from . import direct_simd diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py new file mode 100644 index 000000000000..1fdb596b7163 --- /dev/null +++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Direct implementation of conv2d.""" + +import tvm +from tvm import autotvm +from tvm.autotvm.task import deserialize_args +from topi.nn.conv2d import conv2d_nchw, conv2d_nhwc +from topi.util import get_const_tuple, get_const_int, traverse_inline + +def conv2d_direct(*args, **kwargs): + """Schedule function for directly-scheduled conv2d.""" + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + data, kernel = args[:2] + layout = args[-2] + cfg = autotvm.get_config() + args = [cfg] + args + conv = conv2d_direct_compute(*args) + if layout == 'NHWC': + sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv]) + elif layout == 'NCHW': + sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv]) + else: + raise RuntimeError(f'unsupported data layout "{layout}"') + return sched, [data, kernel, conv] + + +conv2d_direct.template_key = 'direct' +conv2d_direct.default_data_layout = 'NHWC' +conv2d_direct.default_kernel_layout = 'HWIO' + +@autotvm.register_topi_compute('conv2d_direct.micro_dev') +def conv2d_direct_compute(*args): + layout = args[-2] + if layout == 'NHWC': + return _conv2d_direct_nhwc_compute(*args) + if layout == 'NCHW': + return _conv2d_direct_nchw_compute(*args) + + raise 
RuntimeError(f'unsupported data layout "{layout}"') + + +def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): + assert layout == 'NHWC' + conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) + + ########################### + # Config Space Definition # + ########################### + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO) + kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI) + + # TODO should we add a max_factor attr to these splits? + co, vc = cfg.define_split('tile_co', co, num_outputs=2) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) + + cfg.define_reorder('reorder_0', + [n, co, oh, ow, ci, kh, kw, vh, vw, vc], + policy='candidate', candidate=[ + [n, co, oh, ow, ci, kh, kw, vh, vw, vc], + [n, co, oh, ow, ci, kh, kw, vc, vh, vw], + [n, co, oh, ow, ci, vh, vw, vc, kh, kw], + [n, co, oh, ow, ci, vc, vh, vw, kh, kw]]) + + cfg.define_annotate('ann_reduce', [kh, kw], policy='try_unroll') + cfg.define_annotate('ann_spatial', [vh, vw, vc], policy='try_unroll') + + cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32]) + cfg.define_knob('unroll_explicit', [0, 1]) + + return conv + + +def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): + assert layout == 'NCHW' + conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) + + ########################### + # Config Space Definition # + ########################### + cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32]) + cfg.define_knob('unroll_explicit', [0, 1]) + + return conv + + +@autotvm.register_topi_schedule('conv2d_direct_nhwc.micro_dev') +def conv2d_direct_nhwc_schedule(cfg, outs): + """Schedule function for directly-scheduled conv2d on NHWC layout.""" + sched = 
tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv2d_nhwc' not in op.tag: + return + + ### extract tensors ### + output = op.output(0) + conv = op + data_vec = conv.input_tensors[0] + kernel = conv.input_tensors[1] # pylint: disable=unused-variable + last = outs[0] # pylint: disable=unused-variable + + # tile reduction axes + n, oh, ow, co = sched[conv].op.axis + kh, kw, ci = sched[conv].op.reduce_axis + # NOTE we can't inline data padding in the SIMD path, because it + # introduces conditionals in the inner loop. + data_pad = data_vec.op + sched[data_pad].compute_inline() + + co, vc = cfg['tile_co'].apply(sched, conv, co) + oh, vh = cfg['tile_oh'].apply(sched, conv, oh) + ow, vw = cfg['tile_ow'].apply(sched, conv, ow) + cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc]) + cfg['ann_reduce'].apply(sched, conv, [kh, kw], + axis_lens=[get_const_int(kh.dom.extent), + get_const_int(kw.dom.extent)], + max_unroll=8, + cfg=cfg) + cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc], + axis_lens=[cfg['tile_oh'].size[-1], + cfg['tile_ow'].size[-1], + cfg['tile_co'].size[-1]], + max_unroll=8, + cfg=cfg) + + kernel_scope = n # this is the scope to attach global config inside this kernel + + # tune unroll + sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + traverse_inline(sched, outs[-1].op, _callback) + return sched + + +@autotvm.register_topi_schedule('conv2d_direct_nchw.micro_dev') +def conv2d_direct_nchw_schedule(cfg, outs): + """Schedule function for Cortex-M7 direct implementation of conv2d.""" + # use default schedule + sched = tvm.create_schedule([x.op for x in outs]) + + conv = outs[-1].op + output = conv.output(0) + data_vec = conv.input_tensors[0] + data_pad = data_vec.op + sched[data_pad].compute_inline() + + # TODO add more schedule opts (similar to the NHWC template) + + n, _, _, _ = 
sched[conv].op.axis + kernel_scope = n # this is the scope to attach global config inside this kernel + + # tune unroll + sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + return sched diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py new file mode 100644 index 000000000000..fd411251272e --- /dev/null +++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of conv2d.""" + +from tvm import autotvm +from tvm.autotvm.task import deserialize_args +from tvm import te +from topi.util import simplify, traverse_inline +from topi.nn.pad import pad +from topi.nn.util import get_pad_tuple + +from ..micro_kernel.gemm import ( + intrin_gemm_MxKxN, gemm_MxKxN_impl, +) + +def conv2d_direct_simd(*args, **kwargs): + """Defines the Cortex-M7 SIMD implementation of conv2d.""" + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + data, kernel = args[:2] + layout = args[-2] + cfg = autotvm.get_config() + args = [cfg] + args + assert layout == 'NHWC' + conv = conv2d_direct_simd_compute(*args) + sched = conv2d_direct_simd_nhwc_schedule(cfg, [data, kernel, conv]) + return sched, [data, kernel, conv] + + +conv2d_direct_simd.template_key = 'direct_simd' +conv2d_direct_simd.default_data_layout = 'NHWC' +conv2d_direct_simd.default_kernel_layout = 'HWOI' + +def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for Cortex-M7 SIMD implementation of conv2d.""" + assert isinstance(strides, int) or len(strides) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(strides, int): + stride_h = stride_w = strides + else: + stride_h, stride_w = strides + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch_size, in_height, in_width, in_channels = data.shape + kernel_h, kernel_w, out_channels, _ = kernel.shape + + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w)) + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width 
- dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + + pad_before = [0, pad_top, pad_left, 0] + pad_after = [0, pad_down, pad_right, 0] + padded_data = pad(data, pad_before, pad_after, name='padded_data') + + rc = te.reduce_axis((0, in_channels), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + + conv = te.compute( + (batch_size, out_height, out_width, out_channels), + lambda nn, yy, xx, ff: te.sum( + padded_data[nn, yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * + kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]), + name='conv2d', tag='conv2d_nhwc') + + ########################### + # Config Space Definition # + ########################### + n, oh, ow, co = (cfg.axis(batch_size.value), + cfg.axis(out_height.value), + cfg.axis(out_width.value), + cfg.axis(out_channels.value)) + kh, kw, ci = (cfg.reduce_axis(kernel_h.value), + cfg.reduce_axis(kernel_w.value), + cfg.reduce_axis(in_channels.value)) + + assert in_channels.value % 4 == 0 + owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2) + cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2, + filter=lambda x: x.size[-1] % 4 == 0) + coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2) + + cfg.define_reorder('reorder_0_simd', + [n, oh, owo, owi, coo, coi, kh, kw, cio, cii], + policy='candidate', candidate=[ + [n, oh, kh, kw, owo, coo, cio, owi, coi, cii], + [n, oh, kh, kw, coo, owo, cio, owi, coi, cii], + [n, kh, kw, oh, owo, coo, cio, owi, coi, cii], + [n, kh, kw, oh, coo, owo, cio, owi, coi, cii]]) + + cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32]) + cfg.define_knob('unroll_explicit', [0, 1]) + + return conv + + +def conv2d_direct_simd_nhwc_schedule(cfg, outs): + """Schedule function for Cortex-M7 SIMD implementation of conv2d.""" + sched = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if 
'conv2d_nhwc' not in op.tag: + return + + # extract tensors + output = op.output(0) + conv = op + data_vec = conv.input_tensors[0] + kernel = conv.input_tensors[1] # pylint: disable=unused-variable + last = outs[0] # pylint: disable=unused-variable + + # tile reduction axes + n, oh, ow, co = sched[conv].op.axis + kh, kw, ci = sched[conv].op.reduce_axis + + M = cfg['tile_ow'].size[-1] + K = cfg['tile_ci'].size[-1] + N = cfg['tile_co'].size[-1] + + owo, owi = cfg['tile_ow'].apply(sched, conv, ow) + cio, cii = cfg['tile_ci'].apply(sched, conv, ci) + coo, coi = cfg['tile_co'].apply(sched, conv, co) + + cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii]) + + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype) + sched[output].tensorize(owi, gemm) + sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id)) + + # this is the scope to attach global config inside this kernel + kernel_scope = n + + # tune unroll + sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + traverse_inline(sched, outs[-1].op, _callback) + return sched diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py new file mode 100644 index 000000000000..70cd46bb32c2 --- /dev/null +++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -0,0 +1,221 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Defines gemm intrinsics for SIMD matrix multiplication.""" + +import random +import string + +import tvm +from tvm import te + +########################## +# MxKxN MatMul Intrinsic # +########################## + +# NOTE this is transposed matmul (A * B^T) +def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): + """Defines a SIMD-accelerated transposed matmul.""" + # we generate a unique ID for every intrinsic definition, to prevent name + # collisions in the generated source (e.g., if there are multiple operators + # in the same module that use the same intrinsic) + # + # TODO to cut down on memory usage, we should cache each intrinsic + # instantiation and include it only once, eliminating the need for unique + # IDs + UNIQ_ID_LEN = 8 + uniq_id = ''.join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN)) + + if isinstance(M, tvm.tir.IntImm): + M = M.value + if isinstance(K, tvm.tir.IntImm): + K = K.value + if isinstance(N, tvm.tir.IntImm): + N = N.value + assert K % 4 == 0 + # TODO support more dtypes? 
+ assert in_dtype == 'int8' + assert out_dtype == 'int32' + A = te.placeholder((M, K), name='a', dtype=in_dtype) + B = te.placeholder((N, K), name='b', dtype=in_dtype) + k = te.reduce_axis((0, K), name='k') + C = te.compute( + (M, N), + lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k), + name='c') + A_buf = tvm.tir.decl_buffer( + A.shape, A.dtype, + name="A", + offset_factor=1, + strides=[te.var("A_s"), 1]) + B_buf = tvm.tir.decl_buffer( + B.shape, B.dtype, + name="B", + offset_factor=1, + strides=[te.var("B_s"), 1]) + C_buf = tvm.tir.decl_buffer( + C.shape, C.dtype, + name="C", + offset_factor=1, + strides=[te.var("C_s"), 1]) + def intrin_func(ins, outs): + aa, bb = ins + cc = outs[0] + def _reduce_update(): + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update_{uniq_id}", + aa.access_ptr("r"), + bb.access_ptr("r"), + cc.access_ptr("w"), + aa.strides[0], + bb.strides[0], + cc.strides[0])) + return ib.get() + def _reduce_reset(): + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset_{uniq_id}", + cc.access_ptr("w"), + cc.strides[0])) + return ib.get() + def _body(): + ib = tvm.tir.ir_builder.create() + # # NOTE we need the reset in the body for cases where the buffer + # # we're accumulating into is uninitialized (e.g., if it's the + # # result of a workspace allocation, because there are no guarantees + # # on the contents). 
+ # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset", + # cc.access_ptr("w"), + # cc.strides[0])) + # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update", + # aa.access_ptr("r"), + # bb.access_ptr("r"), + # cc.access_ptr("w"), + # aa.strides[0], + # bb.strides[0], + # cc.strides[0])) + ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}", + aa.access_ptr("r"), + bb.access_ptr("r"), + cc.access_ptr("w"), + aa.strides[0], + bb.strides[0], + cc.strides[0])) + return ib.get() + return _body(), _reduce_reset(), _reduce_update() + with tvm.target.build_config(offset_factor=1): + intrin_decl = te.decl_tensor_intrin( + C.op, intrin_func, binds={A: A_buf, B: B_buf, C: C_buf}) + return intrin_decl, uniq_id + + +def gemm_MxKxN_impl(M, K, N, uniq_id): + """Emit C code for gemm impl.""" + # TODO are there any SIMD tricks to zero out arrays quickly? + aa_pad_size = M * K + bb_pad_size = N * K + # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) + cc_code = f""" +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int16_t aa_pad[{aa_pad_size}]; + int16_t bb_pad[{bb_pad_size}]; + + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {K} / 4; j++) {{ + read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + }} + }} + + for (int i = 0; i < {N}; i++) {{ + for (int j = 0; j < {K} / 4; j++) {{ + read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + }} + }} + + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K} / 2; l++) {{ + sum = __SMLAD( + *((int32_t*) &aa_pad[i*{K} + l*2]), + *((int32_t*) &bb_pad[j*{K} + l*2]), + sum); + }} + // NOTE: this is the line where `*_body` differs from `*_update`. 
here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. + cc[i*C_stride + j] = sum; + }} + }} + + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int16_t aa_pad[{aa_pad_size}]; + int16_t bb_pad[{bb_pad_size}]; + + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {K} / 4; j++) {{ + read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + }} + }} + + for (int i = 0; i < {N}; i++) {{ + for (int j = 0; j < {K} / 4; j++) {{ + read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + }} + }} + + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K} / 2; l++) {{ + sum = __SMLAD( + *((int32_t*) &aa_pad[i*{K} + l*2]), + *((int32_t*) &bb_pad[j*{K} + l*2]), + sum); + }} + cc[i*C_stride + j] += sum; + }} + }} + + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + cc[i*C_stride + j] = 0; + }} + }} + return 0; +}} + """ + return cc_code diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 8cf3fd5f11bf..966520088bc7 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ b/topi/python/topi/arm_cpu/injective.py @@ -20,7 +20,6 @@ from tvm import te from ..util import is_empty_shape -@generic.schedule_injective_from_existing.register(["arm_cpu", "micro_dev"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. 
@@ -46,7 +45,6 @@ def schedule_injective_from_existing(sch, out): sch[out].parallel(sch[out].op.axis[0]) return sch -@generic.schedule_injective.register(["arm_cpu", "micro_dev"]) def schedule_injective(outs): """ARM CPU schedule for injective op. @@ -74,7 +72,6 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@generic.schedule_concatenate.register(["arm_cpu", "micro_dev"]) def schedule_concatenate(outs): """Schedule for concatenate op. diff --git a/topi/python/topi/generic/default.py b/topi/python/topi/generic/default.py index d4c642ab8814..59e5a255c6e1 100644 --- a/topi/python/topi/generic/default.py +++ b/topi/python/topi/generic/default.py @@ -24,7 +24,7 @@ def default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.Target.current(allow_none=False) outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - if target.target_name != "llvm": + if target.target_name not in ("llvm", "c"): raise RuntimeError("schedule not registered for '%s'" % target) s = te.create_schedule([x.op for x in outs]) if auto_inline: From 80c647039ce0ecfaf2a0f32817ac46511bdc1158 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 23 Apr 2020 10:51:04 -0700 Subject: [PATCH 03/11] Use /std:c++14 with MSVC. * Per tqchen: project has already moved to C++14 * Presubmit failed for code that built locally on gcc. 
--- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc7c67c83a48..a0ebdf037756 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake) include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/CoreML.cmake) +include(CheckCXXCompilerFlag) if(NOT MSVC) - include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) - message(STATUS "Build with c++14") set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}") set(CMAKE_CUDA_STANDARD 14) +else() + check_cxx_compiler_flag("/std:c++14" SUPPORT_CXX14) + set(CMAKE_CXX_FLAGS "/std:c++14 ${CMAKE_CXX_FLAGS}") + set(CMAKE_CUDA_STANDARD 14) endif() add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) From ce06883a7894cf7f929f3b8e47902d30d5cd8f95 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 23 Apr 2020 23:21:05 +0000 Subject: [PATCH 04/11] fix ASF lint, and fix add_asf_header too --- tests/lint/add_asf_header.py | 4 +++- .../arm_cpu/cortex_m7/micro_kernel/__init__.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/lint/add_asf_header.py b/tests/lint/add_asf_header.py index a44fbd3df1b5..21d25c25e573 100644 --- a/tests/lint/add_asf_header.py +++ b/tests/lint/add_asf_header.py @@ -181,7 +181,9 @@ def add_header(fname, header): skipline = False ext = os.path.splitext(fname)[1][1:] - if lines[0][:2] == "#!": + if not lines: + skipline = False # File is enpty + elif lines[0][:2] == "#!": skipline = True elif lines[0][:2] == " Date: Fri, 24 Apr 2020 10:28:33 -0700 Subject: [PATCH 05/11] Compiles with USE_MICRO=OFF. 
--- src/runtime/micro/micro_common.h | 2 +- src/runtime/micro/micro_session.cc | 35 +++++++++++++++++++++++++++++ src/runtime/rpc/rpc_session.cc | 36 +++--------------------------- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 7ba62d3f30df..a8d16d2b9803 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -74,7 +74,7 @@ class TargetVal { if (width_bits_ == 64) { return 0xffffffff; } else { - return (1 << width_bits_) - 1; + return (1UL << width_bits_) - 1; } } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 309fea45cd31..11ee035820a2 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -644,6 +645,40 @@ PackedFunc MicroSession::GetFunction( } } +TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator") +.set_body([](TVMArgs args, TVMRetValue* rv) { + PackedFunc pf = args[0]; + TVMContext ctx = args[1]; + int number = args[2]; + int repeat = args[3]; + + auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { + TVMRetValue temp; + std::ostringstream os; + + for (int i = 0; i < repeat; ++i) { + // start timing + CHECK(number < MicroSession::kTaskQueueCapacity) + << "`number` must be less than uTVM task queue capacity"; + for (unsigned int j = 0; j < number; ++j) { + pf.CallPacked(args, &temp); + } + ObjectPtr session = MicroSession::Current(); + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + double time_per_batch = session->GetLastBatchTime() / number; + os.write(reinterpret_cast(&time_per_batch), sizeof(time_per_batch)); + } + std::string blob = os.str(); + TVMByteArray arr; + arr.size = blob.length(); + arr.data = blob.data(); + // return the time. 
+ *rv = arr; + }; + *rv = PackedFunc(ftimer); +}); + + // create micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._CreateSession") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 9b5612efc690..ae293abfacdd 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -1247,45 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() { CHECK_EQ(state_, kRecvCode); } -PackedFunc MicroTimeEvaluator( - PackedFunc pf, - TVMContext ctx, - size_t number, - int repeat, - int min_repeat_ms) { - auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { - TVMRetValue temp; - std::ostringstream os; - - for (int i = 0; i < repeat; ++i) { - // start timing - CHECK(number < MicroSession::kTaskQueueCapacity) - << "`number` must be less than uTVM task queue capacity"; - for (unsigned int j = 0; j < number; ++j) { - pf.CallPacked(args, &temp); - } - ObjectPtr session = MicroSession::Current(); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - double time_per_batch = session->GetLastBatchTime() / number; - os.write(reinterpret_cast(&time_per_batch), sizeof(time_per_batch)); - } - std::string blob = os.str(); - TVMByteArray arr; - arr.size = blob.length(); - arr.data = blob.data(); - // return the time. 
- *rv = arr; - }; - return PackedFunc(ftimer); -} - PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, int min_repeat_ms) { if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { - return MicroTimeEvaluator(pf, ctx, number, repeat, min_repeat_ms); + auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator"); + CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; + return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { From 25961d6f581f1c1a43a60b9290e44776fd2f09ce Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 24 Apr 2020 13:26:01 -0700 Subject: [PATCH 06/11] Cleanup TargetPtr and word size representations. --- python/tvm/micro/base.py | 4 +- python/tvm/micro/device/arm/stm32f746xx.py | 6 +- python/tvm/micro/device/base.py | 12 +- python/tvm/micro/device/host.py | 6 +- python/tvm/micro/device/riscv_spike.py | 6 +- src/runtime/micro/host_low_level_device.cc | 11 +- src/runtime/micro/low_level_device.h | 3 +- src/runtime/micro/micro_common.cc | 8 +- src/runtime/micro/micro_common.h | 105 +++++++++++++----- src/runtime/micro/micro_device_api.cc | 25 ++--- src/runtime/micro/micro_section_allocator.h | 16 +-- src/runtime/micro/micro_session.cc | 92 +++++++-------- src/runtime/micro/micro_session.h | 24 ++-- src/runtime/micro/openocd_low_level_device.cc | 6 +- .../micro/target_data_layout_encoder.h | 13 ++- .../cortex_m7/micro_kernel/__init__.py | 1 - 16 files changed, 200 insertions(+), 138 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index d2ec7d398c46..c34dd728841a 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -77,7 +77,7 @@ def __init__(self, config): dev_funcs = tvm.micro.device.get_device_funcs(config['device_id']) self.toolchain_prefix = config['toolchain_prefix'] self.mem_layout = config['mem_layout'] - 
self.word_size = config['word_size'] + self.word_size_bits = config['word_size_bits'] self.thumb_mode = config['thumb_mode'] self.use_device_timer = config['use_device_timer'] self.comms_method = config['comms_method'] @@ -122,7 +122,7 @@ def __init__(self, config): self.mem_layout['workspace']['size'], self.mem_layout['stack'].get('start', 0), self.mem_layout['stack']['size'], - self.word_size, + self.word_size_bits, self.thumb_mode, self.use_device_timer, server_addr, diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index f85a34e4e0a2..9a2b46ac0cd7 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -20,7 +20,7 @@ DEVICE_ID = 'arm.stm32f746xx' TOOLCHAIN_PREFIX = 'arm-none-eabi-' -WORD_SIZE = 4 +WORD_SIZE_BITS = 32 # # [Device Memory Layout] # RAM (rwx) : START = 0x20000000, LENGTH = 320K @@ -112,8 +112,8 @@ def generate_config(server_addr, server_port, section_constraints=None): return { 'device_id': DEVICE_ID, 'toolchain_prefix': TOOLCHAIN_PREFIX, - 'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE, section_constraints), - 'word_size': WORD_SIZE, + 'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints), + 'word_size_bits': WORD_SIZE_BITS, 'thumb_mode': True, 'use_device_timer': True, 'comms_method': 'openocd', diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py index 1621c69d1a77..4d42bff8ebbc 100644 --- a/python/tvm/micro/device/base.py +++ b/python/tvm/micro/device/base.py @@ -178,7 +178,7 @@ class MemConstraint(enum.Enum): WEIGHT = 1 -def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): +def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints): """Template function to generate memory layout for devices. 
Parameters @@ -189,12 +189,14 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): available_mem: Number Available memory at base_addr, given in bytes. - word_size: Number - Number of bytes in one word on this device. + word_size_bits: Number + Number of bits in one word on this device. section_constraints: Optional[Dict[str, [Number, MemConstraint]]] maps section name to the quantity of available memory """ + assert word_size_bits in (32, 64), 'only 32- or 64-bit devices are supported now' + word_size_bytes = word_size_bits // 8 byte_sum = sum(x[0] for x in section_constraints.values() if x[1] == MemConstraint.ABSOLUTE_BYTES) @@ -209,7 +211,7 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): for section in DEVICE_SECTIONS: (val, cons_type) = section_constraints[section] if cons_type == MemConstraint.ABSOLUTE_BYTES: - assert val % word_size == 0, \ + assert val % word_size_bytes == 0, \ f'constraint {val} for {section} section is not word-aligned' size = val res[section] = { @@ -218,7 +220,7 @@ def gen_mem_layout(base_addr, available_mem, word_size, section_constraints): } else: size = int((val / weight_sum) * available_weight_mem) - size = (size // word_size) * word_size + size = (size // word_size_bytes) * word_size_bytes res[section] = { 'start': curr_addr, 'size': size, diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py index f6d0dadc5f43..55e0ac326f2d 100644 --- a/python/tvm/micro/device/host.py +++ b/python/tvm/micro/device/host.py @@ -21,7 +21,7 @@ DEVICE_ID = 'host' TOOLCHAIN_PREFIX = '' -WORD_SIZE = 8 if sys.maxsize > 2**32 else 4 +WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32 # we pretend we only have 320kb in the default case, so we can use `gen_mem_layout` DEFAULT_AVAILABLE_MEM = 3200000 @@ -89,7 +89,7 @@ def generate_config(available_mem=None, section_constraints=None): available_mem = DEFAULT_AVAILABLE_MEM if section_constraints is None: section_constraints = 
DEFAULT_SECTION_CONSTRAINTS - mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE, section_constraints) + mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE_BITS, section_constraints) # TODO the host emulated device is an outlier, since we don't know how what # its base address will be until we've created it in the C++. is there any # way to change the infrastructure around this so it's not so much of an @@ -103,7 +103,7 @@ def generate_config(available_mem=None, section_constraints=None): 'device_id': DEVICE_ID, 'toolchain_prefix': TOOLCHAIN_PREFIX, 'mem_layout': mem_layout, - 'word_size': WORD_SIZE, + 'word_size_bits': WORD_SIZE_BITS, 'thumb_mode': False, 'use_device_timer': False, 'comms_method': 'host', diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py index f26f04604cac..b9f55dff0ce7 100644 --- a/python/tvm/micro/device/riscv_spike.py +++ b/python/tvm/micro/device/riscv_spike.py @@ -20,7 +20,7 @@ DEVICE_ID = 'riscv_spike' TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-' -WORD_SIZE = 8 +WORD_SIZE_BITS = 64 DEFAULT_SECTION_CONSTRAINTS = { 'text': (18000, MemConstraint.ABSOLUTE_BYTES), @@ -92,8 +92,8 @@ def generate_config(base_addr, available_mem, server_addr, server_port, section_ return { 'device_id': DEVICE_ID, 'toolchain_prefix': TOOLCHAIN_PREFIX, - 'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE, section_constraints), - 'word_size': WORD_SIZE, + 'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints), + 'word_size_bits': WORD_SIZE_BITS, 'thumb_mode': False, 'use_device_timer': False, 'comms_method': 'openocd', diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 13f68f295fa6..da4ade41c008 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief constructor to initialize 
on-host memory region to act as device * \param num_bytes size of the emulated on-device memory region */ - explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) { + explicit HostLowLevelDevice(size_t num_bytes, TargetPtr* base_addr) : size_(num_bytes) { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; // TODO(weberlo): Set permissions per section (e.g., read-write perms for // the heap, execute perms for text, etc.). int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0); - *base_addr = base_addr_; + *base_addr = TargetPtr(TargetWordSize(sizeof(size_t) * 8), + reinterpret_cast(base_addr_)); } /*! @@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice { size_t size_; }; -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) { - std::shared_ptr lld = - std::make_shared(num_bytes, base_addr); +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, + TargetPtr* base_addr) { + std::shared_ptr lld = std::make_shared(num_bytes, base_addr); return lld; } diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 666b08199a6b..c5b5f3df634b 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -78,7 +78,8 @@ class LowLevelDevice { * \param num_bytes size of the memory region * \param base_addr pointer to write the host device's resulting base address into */ -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr); +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, + TargetPtr* base_addr); /*! 
* \brief connect to OpenOCD and create an OpenOCD low-level device diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 3e322c639edd..c544fcd32801 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -51,7 +51,7 @@ const char* SectionToString(SectionKind section) { std::string RelocateBinarySections( const std::string& binary_path, - size_t word_size, + TargetWordSize word_size, TargetPtr text_start, TargetPtr rodata_start, TargetPtr data_start, @@ -62,7 +62,7 @@ std::string RelocateBinarySections( CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; std::string relocated_bin = (*f)(binary_path, - word_size, + word_size.bytes(), text_start.cast_to(), rodata_start.cast_to(), data_start.cast_to(), @@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary, size_t GetSectionSize(const std::string& binary_path, SectionKind section, const std::string& toolchain_prefix, - size_t align) { + TargetWordSize word_size) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "GetSectionSize requires section to be one of text, rodata, data, or bss."; @@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path, CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; int size = (*f)(binary_path, SectionToString(section), toolchain_prefix); - return UpperAlignValue(size, align); + return UpperAlignValue(size, word_size.bytes()); } } // namespace runtime diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index a8d16d2b9803..2093f385fca0 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace tvm { namespace runtime { @@ -52,6 +53,27 @@ enum class SectionKind : size_t { kNumKinds, }; +/*! 
\brief data type for word sizes */ +class TargetWordSize { + private: + size_t word_size_bits_; + + public: + explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} { + CHECK(word_size_bits == 32 || word_size_bits == 64) + << "only 32-bit and 64-bit are supported now"; + } + + size_t bytes() const { + return word_size_bits_ / 8; + } + + size_t bits() const { + return word_size_bits_; + } +}; + + /*! \brief class for storing values on varying target word sizes */ class TargetVal { private: @@ -59,42 +81,63 @@ class TargetVal { uint64_t value_; public: + /*! \brief construct a TargetVal matching the size of the given integral argument */ template::value, T>::type> - explicit constexpr TargetVal(T value) : - width_bits_{sizeof(T) * 8}, value_{value} {} + explicit constexpr TargetVal(T value) : TargetVal(sizeof(T) * 8, value) {} + /*! \brief construct an uninitialized value */ + TargetVal() : width_bits_{0}, value_{0} {} + + /*! \brief construct a TargetVal with explicit size and value */ TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} { - CHECK(width_bits != 0 && (width_bits & (width_bits - 1)) == 0) - << "width_bits must be a power of 2, got " << width_bits; - *this = value; + CHECK(width_bits >= 8 && + width_bits <= 64 && + (width_bits & (width_bits - 1)) == 0) + << "width_bits must be a power of 2 in [8, 64], got " << width_bits; + value_ = value & bitmask(); + } + + bool is_initialized() const { return width_bits_ != 0; } + + size_t width_bits() const { + CHECK(is_initialized()) << "TargetVal is not initialized"; + return width_bits_; } - size_t width_bits() const { return width_bits_; } uint64_t bitmask() const { + CHECK(is_initialized()) << "TargetVal is not initialized"; + if (width_bits_ == 64) { - return 0xffffffff; + return ~0UL; } else { return (1UL << width_bits_) - 1; } } uint32_t uint32() const { + CHECK(is_initialized()) << "TargetVal is not initialized"; CHECK(width_bits_ <= 32) << "TargetVal: requested 
32-bit value, actual width is " << width_bits_; return uint32_t(value_ & bitmask()); } uint64_t uint64() const { + CHECK(is_initialized()) << "TargetVal is not initialized"; return value_; } - TargetVal& operator=(const uint64_t& value) { - if (width_bits_ == 64) { - value_ = value; - } else { - CHECK((value & ~bitmask()) == 0) << "bits above " << width_bits_ << " are non-zero"; - value_ = value & bitmask(); + TargetVal& operator=(const TargetVal& other) { + CHECK(other.is_initialized()) << "Cannot assign an uninitialized TargetVal"; + + if (!is_initialized()) { + width_bits_ = other.width_bits_; } + + CHECK(width_bits_ >= other.width_bits_) + << "Cannot assign TargetVal with width " << other.width_bits_ + << "bits to TargetVal with width " << width_bits_ << "bits"; + + value_ = other.value_ & bitmask(); return *this; } }; @@ -103,14 +146,19 @@ class TargetVal { /*! \brief absolute device address */ class TargetPtr { public: - /*! \brief construct a device address with val64 `value` */ - explicit TargetPtr(std::uint64_t value) : value_(TargetVal(64, value)) {} + /*! \brief construct a device address with variable-length value `value` */ + TargetPtr(TargetWordSize word_size, std::uint64_t value) : + value_(TargetVal(word_size.bits(), value)) {} + + /*! \brief construct a null address */ + TargetPtr(TargetWordSize word_size, std::nullptr_t value) : + value_{TargetVal(word_size.bits(), 0)} {} - /*! \brief default constructor (val64 0) */ - TargetPtr() : value_(TargetVal(64, 0)) {} + /*! \brief construct an uninitialized pointer whose word_size can be changed once */ + TargetPtr() = default; - /*! \brief construct a null address (stored in val64) */ - explicit TargetPtr(std::nullptr_t value) : value_{TargetVal(64, 0)} {} + /*! \brief construct a device address using the given TargetVal */ + explicit TargetPtr(const TargetVal& value) : value_{value} {} /*! \brief destructor */ ~TargetPtr() {} @@ -136,23 +184,23 @@ class TargetPtr { /*! 
\brief add an integer to this absolute address to get a larger absolute address */ TargetPtr operator+(size_t n) const { - return TargetPtr(value_.uint64() + n); + return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() + n); } /*! \brief mutably add an integer to this absolute address */ TargetPtr& operator+=(size_t n) { - value_ = value_.uint64() + n; + value_ = TargetVal(value_.width_bits(), value_.uint64() + n); return *this; } /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ TargetPtr operator-(size_t n) const { - return TargetPtr(value_.uint64() - n); + return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() - n); } /*! \brief mutably subtract an integer from this absolute address */ TargetPtr& operator-=(size_t n) { - value_ = value_.uint64() - n; + value_ = TargetVal(value_.width_bits(), value_.uint64() - n); return *this; } @@ -177,7 +225,8 @@ class SymbolMap { * \param toolchain_prefix prefix of compiler toolchain to use */ SymbolMap(const std::string& binary, - const std::string& toolchain_prefix) { + const std::string& toolchain_prefix, + TargetWordSize word_size) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; @@ -192,7 +241,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = TargetPtr(addr); + map_.emplace(std::make_pair(name, TargetPtr(word_size, addr))); stream >> name; stream >> std::hex >> addr; } @@ -285,7 +334,7 @@ const char* SectionToString(SectionKind section); */ std::string RelocateBinarySections( const std::string& binary_path, - size_t word_size, + TargetWordSize word_size, TargetPtr text_start, TargetPtr rodata_start, TargetPtr data_start, @@ -309,13 +358,13 @@ std::string ReadSection(const std::string& binary, * \param binary input binary contents * \param section section type * \param toolchain_prefix 
prefix of compiler toolchain to use - * \param align alignment of the returned size (default: 8) + * \param word_size word size of the target, for alignment * \return size of the section if it exists, 0 otherwise */ size_t GetSectionSize(const std::string& binary_name, SectionKind section, const std::string& toolchain_prefix, - size_t align); + TargetWordSize word_size); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 619c31a8b1ba..77ad86577f7e 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI { size_t alignment, DLDataType type_hint) final { ObjectPtr& session = MicroSession::Current(); - void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + TargetPtr data = session->AllocateInSection(SectionKind::kHeap, nbytes); CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; - MicroDevSpace* dev_space = new MicroDevSpace(); - dev_space->data = data; - dev_space->session = session; - return static_cast(dev_space); + return reinterpret_cast(new MicroDevSpace{data, session}); } void FreeDataSpace(TVMContext ctx, void* ptr) final { MicroDevSpace* dev_space = static_cast(ptr); - dev_space->session->FreeInSection( - SectionKind::kHeap, TargetPtr(reinterpret_cast(dev_space->data))); + dev_space->session->FreeInSection(SectionKind::kHeap, dev_space->data); delete dev_space; } @@ -134,20 +130,17 @@ class MicroDeviceAPI final : public DeviceAPI { CHECK(false) << "the on-device workspace allocator isn't aware of this function"; ObjectPtr& session = MicroSession::Current(); - void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); - CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; - MicroDevSpace* dev_space = new MicroDevSpace(); - dev_space->data = data; - 
dev_space->session = session; - return static_cast(dev_space); + TargetPtr data = session->AllocateInSection(SectionKind::kWorkspace, size); + CHECK(data.value().uint64() != 0) + << "unable to allocate " << size << " bytes on device workspace"; + return static_cast(new MicroDevSpace{data, session}); } void FreeWorkspace(TVMContext ctx, void* data) final { CHECK(false) << "the on-device workspace allocator isn't aware of this function"; MicroDevSpace* dev_space = static_cast(data); ObjectPtr& session = dev_space->session; - session->FreeInSection(SectionKind::kWorkspace, - TargetPtr(reinterpret_cast(dev_space->data))); + session->FreeInSection(SectionKind::kWorkspace, dev_space->data); delete dev_space; } @@ -162,7 +155,7 @@ class MicroDeviceAPI final : public DeviceAPI { private: TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { - return TargetPtr(reinterpret_cast(dev_space->data) + offset); + return dev_space->data + offset; } void* GetHostLoc(const void* ptr, size_t offset) { diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h index 8fff39b61d19..2067794f393f 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -39,16 +39,18 @@ class MicroSectionAllocator { * \brief constructor that specifies section boundaries * \param region location and size of the section on the device */ - explicit MicroSectionAllocator(std::string section_name, DevMemRegion region, size_t word_size) + explicit MicroSectionAllocator(std::string section_name, + DevMemRegion region, + TargetWordSize word_size) : section_name_(section_name), start_addr_(region.start), size_(0), capacity_(region.size), word_size_(word_size) { - CHECK_EQ(start_addr_.value().uint64() % word_size, 0) - << "micro section start not aligned to " << word_size << " bytes"; - CHECK_EQ(capacity_ % word_size, 0) - << "micro section end not aligned to " << word_size << " bytes"; + 
CHECK_EQ(start_addr_.value().uint64() % word_size.bytes(), 0) + << "micro section start not aligned to " << word_size.bytes() << " bytes"; + CHECK_EQ(capacity_ % word_size.bytes(), 0) + << "micro section end not aligned to " << word_size.bytes() << " bytes"; } /*! @@ -62,7 +64,7 @@ class MicroSectionAllocator { * \return pointer to allocated memory region in section, nullptr if out of space */ TargetPtr Allocate(size_t size) { - size_ = UpperAlignValue(size_, word_size_); + size_ = UpperAlignValue(size_, word_size_.bytes()); CHECK(size_ + size < capacity_) << "cannot alloc " << size << " bytes in section \"" << section_name_ << "\" (start_addr=" << start_addr_.cast_to() @@ -122,7 +124,7 @@ class MicroSectionAllocator { /*! \brief total storage capacity of the section */ size_t capacity_; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; /*! \brief allocation map for allocation sizes */ std::unordered_map alloc_map_; }; diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 11ee035820a2..2cfde7e990a1 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -80,7 +80,7 @@ MicroSession::MicroSession( size_t workspace_size, uint64_t stack_start, size_t stack_size, - size_t word_size, + TargetWordSize word_size, bool thumb_mode, bool use_device_timer, const std::string& server_addr, @@ -90,7 +90,6 @@ MicroSession::MicroSession( thumb_mode_(thumb_mode), use_device_timer_(use_device_timer), batch_args_encoder_(args_size, word_size) { - CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_; if (comms_method == "host") { // TODO(weberlo): move checks to python CHECK( @@ -105,11 +104,11 @@ MicroSession::MicroSession( size_t memory_size = text_size + rodata_size + data_size + bss_size + args_size + heap_size + workspace_size + stack_size; - void* base_addr; + TargetPtr base_addr; low_level_device_ = 
HostLowLevelDeviceCreate(memory_size, &base_addr); - CHECK_EQ(reinterpret_cast(base_addr) % word_size_, 0) - << "base address not aligned to " << word_size_ << " bytes"; - TargetPtr curr_addr = TargetPtr(reinterpret_cast(base_addr)); + CHECK_EQ(base_addr.value().uint64() % word_size.bytes(), 0) + << "base address not aligned to " << word_size.bytes() << " bytes"; + TargetPtr curr_addr = base_addr; section_allocators_[0] = std::make_shared( "text", @@ -172,49 +171,49 @@ MicroSession::MicroSession( section_allocators_[0] = std::make_shared( "text", DevMemRegion { - .start = TargetPtr(text_start), + .start = TargetPtr(word_size_, text_start), .size = text_size, }, word_size_); section_allocators_[1] = std::make_shared( "rodata", DevMemRegion { - .start = TargetPtr(rodata_start), + .start = TargetPtr(word_size_, rodata_start), .size = rodata_size, }, word_size_); section_allocators_[2] = std::make_shared( "data", DevMemRegion { - .start = TargetPtr(data_start), + .start = TargetPtr(word_size_, data_start), .size = data_size, }, word_size_); section_allocators_[3] = std::make_shared( "bss", DevMemRegion { - .start = TargetPtr(bss_start), + .start = TargetPtr(word_size_, bss_start), .size = bss_size, }, word_size_); section_allocators_[4] = std::make_shared( "args", DevMemRegion { - .start = TargetPtr(args_start), + .start = TargetPtr(word_size_, args_start), .size = args_size, }, word_size_); section_allocators_[5] = std::make_shared( "heap", DevMemRegion { - .start = TargetPtr(heap_start), + .start = TargetPtr(word_size_, heap_start), .size = heap_size, }, word_size_); section_allocators_[6] = std::make_shared( "workspace", DevMemRegion { - .start = TargetPtr(workspace_start), + .start = TargetPtr(word_size_, workspace_start), .size = workspace_size, }, word_size_); section_allocators_[7] = std::make_shared( "stack", DevMemRegion { - .start = TargetPtr(stack_start), + .start = TargetPtr(word_size_, stack_start), .size = stack_size, }, word_size_); } else { @@ -229,17 
+228,14 @@ MicroSession::MicroSession( // Patch pointers to define the bounds of the workspace section and the word // size (for allocation alignment). std::shared_ptr ws_allocator = GetAllocator(SectionKind::kWorkspace); - TargetVal ws_start(word_size_ * 8, ws_allocator->start_addr().value().uint64()); - TargetVal ws_end(word_size_ * 8, ws_allocator->max_addr().value().uint64()); - TargetVal target_word_size(word_size_ * 8, word_size_); - if (word_size_ == 4) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint32()); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint32()); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint32()); - } else if (word_size_ == 8) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.uint64()); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.uint64()); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.uint64()); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_allocator->start_addr()); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_allocator->max_addr()); + if (word_size.bytes() == 4) { + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint32_t(word_size.bytes())); + } else if (word_size.bytes() == 8) { + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint64_t(word_size.bytes())); + } else { + CHECK(false) << "Unsupported word size unexpectedly here"; } } @@ -258,8 +254,8 @@ void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) { TargetVal func_dev_addr = func_ptr.value(); std::tuple arg_field_addrs = EncoderAppend(&batch_args_encoder_, args); - TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).cast_to()}; - TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).cast_to()}; + TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).value()}; + TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).value()}; 
task_queue_.push_back( DevTask { @@ -279,9 +275,9 @@ void MicroSession::FlushTaskQueue() { // nothing to run return; } - if (word_size_ == 4) { + if (word_size_.bytes() == 4) { FlushTaskQueuePriv(); - } else if (word_size_ == 8) { + } else if (word_size_.bytes() == 8) { FlushTaskQueuePriv(); } } @@ -389,9 +385,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); data_section.start = AllocateInSection(SectionKind::kData, data_section.size); bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); - CHECK(text_section.start != nullptr && rodata_section.start != nullptr && - data_section.start != nullptr && bss_section.start != nullptr) - << "not enough space to load module on device"; std::string relocated_bin = RelocateBinarySections( binary_path, @@ -411,7 +404,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); - SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; + SymbolMap symbol_map {relocated_bin, toolchain_prefix_, word_size_}; if (patch_dylib_pointers) { // Patch device lib pointers. @@ -447,12 +440,13 @@ std::tuple MicroSession::EncoderAppend( // order to prevent premature session destruction. void* old_data = base_arr_handle->data; // Mutate the array to unwrap the `data` field. - base_arr_handle->data = reinterpret_cast(old_data)->data; + MicroDevSpace* dev_arr_ptr = reinterpret_cast(old_data); + base_arr_handle->data = reinterpret_cast(dev_arr_ptr->data.value().uint64()); // Now, encode the unwrapped version. 
void* arr_ptr = nullptr; - if (word_size_ == 4) { + if (word_size_.bytes() == 4) { arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); - } else if (word_size_ == 8) { + } else if (word_size_.bytes() == 8) { arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); } // And restore the original wrapped version. @@ -486,7 +480,7 @@ TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DL // is a device pointer, so we don't need to write it. shape_slot.WriteArray(arr.shape, arr.ndim); TargetPtr shape_dev_addr = shape_slot.start_addr(); - TargetPtr strides_dev_addr = TargetPtr(nullptr); + TargetPtr strides_dev_addr = TargetPtr(word_size_, nullptr); if (arr.strides != nullptr) { auto stride_slot = encoder->Alloc(arr.ndim); stride_slot.WriteArray(arr.strides, arr.ndim); @@ -494,13 +488,13 @@ TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DL } T dev_arr( - TargetVal { word_size_ * 8, reinterpret_cast(arr.data) }, + TargetVal { word_size_.bits(), reinterpret_cast(arr.data) }, arr.ctx, arr.ndim, arr.dtype, shape_dev_addr.value(), strides_dev_addr.value(), - TargetVal { word_size_ * 8, arr.byte_offset }); + TargetVal { word_size_.bits(), arr.byte_offset }); CHECK(dev_arr.ctx.device_type == static_cast(kDLMicroDev)) << "attempt to write DLTensor with non-micro device type"; // Update the device type to CPU, because from the microcontroller's @@ -570,11 +564,7 @@ void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& } std::ostringstream func_name_underscore; func_name_underscore << func_name << "_"; - if (word_size_ == 4) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint32()); - } else if (word_size_ == 8) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().uint64()); - } + DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr); } std::string MicroSession::ReadString(TargetPtr str_addr) { @@ 
-611,6 +601,18 @@ T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& sy return result; } +void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, + const std::string& symbol, + const TargetPtr& ptr) { + if (word_size_.bytes() == 4) { + DevSymbolWrite(symbol_map, symbol, ptr.value().uint32()); + } else if (word_size_.bytes() == 8) { + DevSymbolWrite(symbol_map, symbol, ptr.value().uint64()); + } else { + CHECK(false) << "Unsupported word size unexpectedly here"; + } +} + template void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, const std::string& symbol, @@ -701,7 +703,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") size_t workspace_size = uint64_t(args[16]); uint64_t stack_start = args[17]; size_t stack_size = uint64_t(args[18]); - size_t word_size = uint64_t(args[19]); + TargetWordSize word_size{uint64_t(args[19])}; bool thumb_mode = args[20]; bool use_device_timer = args[21]; const std::string& server_addr = args[22]; diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index f7b05ae211e9..bf0996c3e8c4 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -99,7 +99,7 @@ class MicroSession : public ModuleNode { * \param workspace_size workspace section size * \param stack_start stack section start address * \param stack_size stack section size - * \param word_size number of bytes in a word on the target device + * \param word_size_bytes number of bytes in a word on the target device * \param thumb_mode whether the target device requires a thumb-mode bit on function addresses * \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`) * \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`) @@ -124,7 +124,7 @@ class MicroSession : public ModuleNode { size_t workspace_size, uint64_t stack_start, size_t stack_size, - size_t word_size, + TargetWordSize word_size, bool thumb_mode, bool 
use_device_timer, const std::string& server_addr, @@ -196,6 +196,16 @@ class MicroSession : public ModuleNode { template T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol); + /*! + * \brief write pointer value into device memory corresponding to symbol + * \param symbol_map symbol map to read location of symbol from + * \param symbol name of symbol being written to + * \param ptr pointer value to write into symbol + */ + void DevSymbolWrite(const SymbolMap& symbol_map, + const std::string& symbol, + const TargetPtr& ptr); + /*! * \brief write value into device memory corresponding to symbol * \param symbol_map symbol map to read location of symbol from @@ -235,7 +245,7 @@ class MicroSession : public ModuleNode { std::shared_ptr section_allocators_[static_cast(SectionKind::kNumKinds)]; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; /*! \brief whether the target device requires a thumb-mode bit on function addresses * * ARM and other manufacturers use the lowest bit of a function address to determine @@ -251,11 +261,11 @@ class MicroSession : public ModuleNode { std::vector task_queue_; // TODO(weberlo): we don't even need an allocator mechanism for the args // section. there's only ever one allocation. - /*! \brief TODO fukn hack */ + /*! \brief TODO hack */ TargetDataLayoutEncoder batch_args_encoder_; - /*! \brief TODO fukn hack */ + /*! \brief TODO hack */ double last_batch_time_; - /*! \brief TODO fukn hack */ + /*! \brief TODO hack */ double last_batch_cycles_; /*! @@ -317,7 +327,7 @@ class MicroSession : public ModuleNode { */ struct MicroDevSpace { /*! \brief data being wrapped */ - void* data; + TargetPtr data; /*! 
\brief shared ptr to session where this data is valid */ ObjectPtr session; }; diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index d1f279f6120f..68c036a89301 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { socket_.SendCommand(); } - void Read(TargetPtr addr, void* buf, size_t num_bytes) { + void Read(TargetPtr addr, void* buf, size_t num_bytes) override { if (num_bytes == 0) { return; } @@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Write(TargetPtr addr, const void* buf, size_t num_bytes) { + void Write(TargetPtr addr, const void* buf, size_t num_bytes) override { if (num_bytes == 0) { return; } @@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) { + void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) override { socket_.cmd_builder() << "halt 0"; socket_.SendCommand(); diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index ea682e261348..e30c72b79fb5 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -96,8 +96,10 @@ class TargetDataLayoutEncoder { * \brief constructor * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(size_t capacity, size_t word_size) - : buf_(std::vector()), curr_offset_(0), capacity_(capacity), word_size_(word_size) { + explicit TargetDataLayoutEncoder(size_t capacity, TargetWordSize word_size) + : buf_(std::vector()), curr_offset_(0), + start_addr_(word_size, nullptr), + capacity_(capacity), word_size_(word_size) { } /*! 
@@ -107,7 +109,7 @@ class TargetDataLayoutEncoder { */ template Slot Alloc(size_t num_elems = 1) { - curr_offset_ = UpperAlignValue(curr_offset_, word_size_); + curr_offset_ = UpperAlignValue(curr_offset_, word_size_.bytes()); size_t size = sizeof(T) * num_elems; if (curr_offset_ + size > buf_.size()) { buf_.resize(curr_offset_ + size); @@ -146,7 +148,8 @@ class TargetDataLayoutEncoder { void set_start_addr(TargetPtr start_addr) { CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty"; - start_addr_ = TargetPtr(UpperAlignValue(start_addr.value().uint64(), word_size_)); + start_addr_ = TargetPtr(word_size_, + UpperAlignValue(start_addr.value().uint64(), word_size_.bytes())); } private: @@ -159,7 +162,7 @@ class TargetDataLayoutEncoder { /*! \brief TODO */ size_t capacity_; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; }; template diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py index 245692337bc3..13a83393a912 100644 --- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py +++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py @@ -14,4 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- From 03782a81c146c9f2044557869587adfbeac4c904 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 24 Apr 2020 23:29:48 +0000 Subject: [PATCH 07/11] fix compile warning --- src/runtime/micro/micro_session.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 2cfde7e990a1..672f16dc06f9 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -651,14 +651,14 @@ TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator") .set_body([](TVMArgs args, TVMRetValue* rv) { PackedFunc pf = args[0]; TVMContext ctx = args[1]; - int number = args[2]; - int repeat = args[3]; + uint64_t number = args[2]; + uint64_t repeat = args[3]; auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { TVMRetValue temp; std::ostringstream os; - for (int i = 0; i < repeat; ++i) { + for (unsigned int i = 0; i < repeat; ++i) { // start timing CHECK(number < MicroSession::kTaskQueueCapacity) << "`number` must be less than uTVM task queue capacity"; From d351b9c0097cb8a6d8461adae41bebdb09bab773 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 24 Apr 2020 23:41:22 +0000 Subject: [PATCH 08/11] address logan's comments --- src/runtime/micro/openocd_low_level_device.cc | 4 ++-- topi/python/topi/arm_cpu/conv2d_spatial_pack.py | 2 +- topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 68c036a89301..0f21d6674740 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -207,8 +207,8 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { /*! 
\brief number of bytes in a word on the target device (64-bit) */ static const constexpr ssize_t kWordSize = 8; - // NOTE: OpenOCD will call any request larger than this constant an "absurd - // request". + // NOTE: The OS pipe buffer must be able to handle a line long enough to + // print this transfer request. /*! \brief maximum number of bytes allowed in a single memory transfer */ static const constexpr ssize_t kMemTransferLimit = 8000; /*! \brief number of milliseconds to wait for function execution to halt */ diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index b4f8e7cf836d..a4d7ad83b1c8 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ -173,7 +173,7 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, axis_lens=[cfg['tile_oh'].size[-1], cfg['tile_ow'].size[-1], cfg['tile_co'].size[-1]], - max_unroll=None, + max_unroll=16, cfg=cfg) s[conv].compute_at(s[last], ow) diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py index 70cd46bb32c2..2169176c0711 100644 --- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -34,7 +34,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): # collisions in the generated source (e.g., if there are multiple operators # in the same module that use the same intrinsic) # - # TODO to cut down on memory usage, we should cache each intrinsic + # TODO(areusch): to cut down on memory usage, we should cache each intrinsic # instantiation and include it only once, eliminating the need for unique # IDs UNIQ_ID_LEN = 8 @@ -47,7 +47,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): if isinstance(N, tvm.tir.IntImm): N = N.value assert K % 4 == 0 - # TODO support more dtypes? + # TODO(areusch): support more dtypes? 
assert in_dtype == 'int8' assert out_dtype == 'int32' A = te.placeholder((M, K), name='a', dtype=in_dtype) @@ -124,7 +124,7 @@ def _body(): def gemm_MxKxN_impl(M, K, N, uniq_id): """Emit C code for gemm impl.""" - # TODO are there any SIMD tricks to zero out arrays quickly? + # TODO(areusch): are there any SIMD tricks to zero out arrays quickly? aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) From 4d426ab3a33b22d155579b49377675afac94a51c Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 27 Apr 2020 10:26:20 -0700 Subject: [PATCH 09/11] address logan and liangfu comments --- Makefile | 2 - python/tvm/autotvm/tuner/callback.py | 4 +- python/tvm/contrib/binutil.py | 80 +++++----- python/tvm/contrib/debugger/debug_runtime.py | 1 - python/tvm/exec/rpc_server.py | 2 +- python/tvm/micro/base.py | 138 +++++++++--------- python/tvm/micro/device/arm/stm32f746xx.py | 74 +++++----- python/tvm/micro/device/base.py | 66 ++++----- python/tvm/micro/device/host.py | 46 +++--- python/tvm/micro/device/riscv_spike.py | 42 +++--- python/tvm/relay/_parser.py | 5 +- python/tvm/relay/op/strategy/arm_cpu.py | 4 +- python/tvm/target/arm_isa.py | 2 +- src/ir/error.cc | 17 +-- .../host_driven/utvm_device_dylib_redirect.c | 15 +- src/runtime/micro/host_driven/utvm_runtime.c | 10 +- src/runtime/micro/host_driven/utvm_runtime.h | 18 ++- src/runtime/micro/micro_common.h | 30 ++-- .../micro/target_data_layout_encoder.h | 8 +- src/target/source/codegen_c_host.cc | 4 +- .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 4 +- .../arm_cpu/cortex_m7/micro_kernel/gemm.py | 6 +- 22 files changed, 288 insertions(+), 290 deletions(-) diff --git a/Makefile b/Makefile index 64a127346e53..e54b9a93b230 100644 --- a/Makefile +++ b/Makefile @@ -73,9 +73,7 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc cpplint: python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src python3 3rdparty/dmlc-core/scripts/lint.py topi cpp 
topi/include; - # Note: exclude src/runtime/micro/host_driven becuase it contains C99 files. python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \ - --exclude_path=src/runtime/micro/host_driven \ include src \ examples/extension/src examples/graph_executor/src diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index f2e608088510..1ae64e2ddfef 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -144,10 +144,10 @@ def __del__(self): def _callback(tuner, inputs, results): ctx.ct += len(inputs) - flops = 0 + flops = float("inf") for inp, res in zip(inputs, results): if res.error_no == 0: - flops = inp.task.flop / np.mean(res.costs) + flops = min(inp.task.flop / np.mean(res.costs), flops) if logger.level > logging.DEBUG: # only print progress bar in non-debug mode ctx.cur_flops = flops diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 2b18d645dbc0..21e06df9f7f0 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -120,18 +120,18 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): size of the section in bytes """ if not os.path.isfile(binary_path): - raise RuntimeError('no such file \"{}\"'.format(binary_path)) + raise RuntimeError('no such file "{}"'.format(binary_path)) # We use the "-A" flag here to get the ".rodata" section's size, which is # not included by default. - size_output = run_cmd(['{}size'.format(toolchain_prefix), '-A', binary_path]) + size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) # TODO(weberlo): Refactor this method and `*relocate_binary` so they are # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". 
section_mapping = { - '.text': ['.text'], - '.rodata': ['.rodata'], - '.data': ['.data', '.sdata'], - '.bss': ['.bss', '.sbss'], + ".text": [".text"], + ".rodata": [".rodata"], + ".data": [".data", ".sdata"], + ".bss": [".bss", ".sbss"], } sections_to_sum = section_mapping["." + section_name] section_size = 0 @@ -150,7 +150,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # NOTE: For some reason, the size of the BSS section on the RISC-V # GCC is sometimes reported to be smaller than it is, so we need to adjust # for this. - if 'riscv' in toolchain_prefix and section_name == 'bss': + if "riscv" in toolchain_prefix and section_name == "bss": # TODO(weberlo): Figure out why 32 is the minimum constant that works. # # The current hypothesis is that the last symbols in the ".bss" and @@ -214,10 +214,10 @@ def tvm_callback_relocate_binary( """ assert text_start < rodata_start < data_start < bss_start < stack_end stack_pointer_init = stack_end - word_size - ld_script_contents = '' + ld_script_contents = "" # TODO(weberlo): There should be a better way to configure this for different archs. # TODO is this line even necessary? 
- if 'riscv' in toolchain_prefix: + if "riscv" in toolchain_prefix: ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( word_size=word_size, @@ -228,31 +228,31 @@ def tvm_callback_relocate_binary( stack_pointer_init=stack_pointer_init) tmp_dir = util.tempdir() - rel_obj_path = tmp_dir.relpath('relocated.obj') - rel_ld_script_path = tmp_dir.relpath('relocate.lds') - with open(rel_ld_script_path, 'w') as f: + rel_obj_path = tmp_dir.relpath("relocated.obj") + rel_ld_script_path = tmp_dir.relpath("relocate.lds") + with open(rel_ld_script_path, "w") as f: f.write(ld_script_contents) run_cmd([ - '{}ld'.format(toolchain_prefix), + "{}ld".format(toolchain_prefix), binary_path, - '-T', rel_ld_script_path, - '-o', rel_obj_path]) + "-T", rel_ld_script_path, + "-o", rel_obj_path]) - with open(rel_obj_path, 'rb') as f: + with open(rel_obj_path, "rb") as f: rel_bin = bytearray(f.read()) - gdb_init_dir = os.environ.get('MICRO_GDB_INIT_DIR') + gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR") if gdb_init_dir is not None: - gdb_init_path = f'{gdb_init_dir}/.gdbinit' - with open(gdb_init_path, 'r') as f: - gdbinit_contents = f.read().split('\n') + gdb_init_path = f"{gdb_init_dir}/.gdbinit" + with open(gdb_init_path, "r") as f: + gdbinit_contents = f.read().split("\n") new_contents = [] for line in gdbinit_contents: new_contents.append(line) - if line.startswith('target'): - new_contents.append(f'add-symbol-file {rel_obj_path}') - with open(gdb_init_path, 'w') as f: - f.write('\n'.join(new_contents)) + if line.startswith("target"): + new_contents.append(f"add-symbol-file {rel_obj_path}") + with open(gdb_init_path, "w") as f: + f.write("\n".join(new_contents)) return rel_bin @@ -278,22 +278,22 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix): contents of the read section """ tmp_dir = util.tempdir() - tmp_bin = tmp_dir.relpath('temp.bin') - tmp_section = tmp_dir.relpath('tmp_section.bin') - with 
open(tmp_bin, 'wb') as out_file: + tmp_bin = tmp_dir.relpath("temp.bin") + tmp_section = tmp_dir.relpath("tmp_section.bin") + with open(tmp_bin, "wb") as out_file: out_file.write(bytes(binary)) run_cmd([ - '{}objcopy'.format(toolchain_prefix), - '--dump-section', - '.{}={}'.format(section, tmp_section), + "{}objcopy".format(toolchain_prefix), + "--dump-section", + ".{}={}".format(section, tmp_section), tmp_bin]) if os.path.isfile(tmp_section): # Get section content if it exists. - with open(tmp_section, 'rb') as f: + with open(tmp_section, "rb") as f: section_bin = bytearray(f.read()) else: # Return empty bytearray if the section does not exist. - section_bin = bytearray('', 'utf-8') + section_bin = bytearray("", "utf-8") return section_bin @@ -316,18 +316,18 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix): alternating newline-separated keys and values """ tmp_dir = util.tempdir() - tmp_obj = tmp_dir.relpath('tmp_obj.bin') - with open(tmp_obj, 'wb') as out_file: + tmp_obj = tmp_dir.relpath("tmp_obj.bin") + with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) nm_output = run_cmd([ - '{}nm'.format(toolchain_prefix), - '-C', - '--defined-only', + "{}nm".format(toolchain_prefix), + "-C", + "--defined-only", tmp_obj]) nm_output = nm_output.splitlines() - map_str = '' + map_str = "" for line in nm_output: line = line.split() - map_str += line[2] + '\n' - map_str += line[0] + '\n' + map_str += line[2] + "\n" + map_str += line[0] + "\n" return map_str diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index fd838c407617..848d7f57d1de 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -181,7 +181,6 @@ def _run_debug(self): """ self.debug_datum._time_list = [ [float(t) * 1e-6] for t in self.run_individual(10, 1, 1) - #[float(t) * 1e-6] for t in self.run_individual(1, 1, 1) ] for i, node in 
enumerate(self.debug_datum.get_graph_nodes()): num_outputs = self.debug_datum.get_graph_node_output_num(node) diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index dd275a3bad4d..805c3a8afb1a 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -42,7 +42,7 @@ def main(args): tracker_addr = (url, port) if not args.key: raise RuntimeError( - "Need key to present type of resource when tracker is available") + 'Need key to present type of resource when tracker is available') else: tracker_addr = None diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index c34dd728841a..bf4fd0ac9b76 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -32,14 +32,14 @@ # all sections that comprise a device's memory layout, in order from lowest # starting address to highest DEVICE_SECTIONS = [ - 'text', - 'rodata', - 'data', - 'bss', - 'args', - 'heap', - 'workspace', - 'stack', + "text", + "rodata", + "data", + "bss", + "args", + "heap", + "workspace", + "stack", ] class LibType(Enum): @@ -74,75 +74,75 @@ def __init__(self, config): # TODO(weberlo): add config validation # grab a binutil instance from the ID in the config - dev_funcs = tvm.micro.device.get_device_funcs(config['device_id']) - self.toolchain_prefix = config['toolchain_prefix'] - self.mem_layout = config['mem_layout'] - self.word_size_bits = config['word_size_bits'] - self.thumb_mode = config['thumb_mode'] - self.use_device_timer = config['use_device_timer'] - self.comms_method = config['comms_method'] + dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"]) + self.toolchain_prefix = config["toolchain_prefix"] + self.mem_layout = config["mem_layout"] + self.word_size_bits = config["word_size_bits"] + self.thumb_mode = config["thumb_mode"] + self.use_device_timer = config["use_device_timer"] + self.comms_method = config["comms_method"] # First, find and compile runtime library. 
- runtime_src_path = os.path.join(get_micro_host_driven_dir(), 'utvm_runtime.c') + runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c") tmp_dir = _util.tempdir() - runtime_obj_path = tmp_dir.relpath('utvm_runtime.obj') - options = ['-I{}'.format(get_micro_host_driven_dir())] - dev_funcs['create_micro_lib']( + runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") + options = ["-I{}".format(get_micro_host_driven_dir())] + dev_funcs["create_micro_lib"]( runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options) - comms_method = config['comms_method'] - if comms_method == 'openocd': - server_addr = config['server_addr'] - server_port = config['server_port'] - elif comms_method == 'host': - server_addr = '' + comms_method = config["comms_method"] + if comms_method == "openocd": + server_addr = config["server_addr"] + server_port = config["server_port"] + elif comms_method == "host": + server_addr = "" server_port = 0 else: - raise RuntimeError(f'unknown communication method: f{self.comms_method}') + raise RuntimeError(f"unknown communication method: f{self.comms_method}") assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \ - 'not all sections have an assigned memory layout' + "not all sections have an assigned memory layout" self.module = _CreateSession( comms_method, runtime_obj_path, self.toolchain_prefix, - self.mem_layout['text'].get('start', 0), - self.mem_layout['text']['size'], - self.mem_layout['rodata'].get('start', 0), - self.mem_layout['rodata']['size'], - self.mem_layout['data'].get('start', 0), - self.mem_layout['data']['size'], - self.mem_layout['bss'].get('start', 0), - self.mem_layout['bss']['size'], - self.mem_layout['args'].get('start', 0), - self.mem_layout['args']['size'], - self.mem_layout['heap'].get('start', 0), - self.mem_layout['heap']['size'], - self.mem_layout['workspace'].get('start', 0), - self.mem_layout['workspace']['size'], - self.mem_layout['stack'].get('start', 0), - 
self.mem_layout['stack']['size'], + self.mem_layout["text"].get("start", 0), + self.mem_layout["text"]["size"], + self.mem_layout["rodata"].get("start", 0), + self.mem_layout["rodata"]["size"], + self.mem_layout["data"].get("start", 0), + self.mem_layout["data"]["size"], + self.mem_layout["bss"].get("start", 0), + self.mem_layout["bss"]["size"], + self.mem_layout["args"].get("start", 0), + self.mem_layout["args"]["size"], + self.mem_layout["heap"].get("start", 0), + self.mem_layout["heap"]["size"], + self.mem_layout["workspace"].get("start", 0), + self.mem_layout["workspace"]["size"], + self.mem_layout["stack"].get("start", 0), + self.mem_layout["stack"]["size"], self.word_size_bits, self.thumb_mode, self.use_device_timer, server_addr, server_port) - self._enter = self.module['enter'] - self._exit = self.module['exit'] - self.get_last_batch_time = self.module['get_last_batch_time'] - self.get_last_batch_cycles = self.module['get_last_batch_cycles'] + self._enter = self.module["enter"] + self._exit = self.module["exit"] + self.get_last_batch_time = self.module["get_last_batch_time"] + self.get_last_batch_cycles = self.module["get_last_batch_cycles"] def _check_system(self): """Check if the user's system is supported by MicroTVM. Raises error if not supported. """ - if not sys.platform.startswith('linux'): - raise RuntimeError('MicroTVM is currently only supported on Linux') + if not sys.platform.startswith("linux"): + raise RuntimeError("MicroTVM is currently only supported on Linux") # TODO(weberlo): Add 32-bit support. # It's primarily the compilation pipeline that isn't compatible. if sys.maxsize <= 2**32: - raise RuntimeError('MicroTVM is currently only supported on 64-bit host platforms') + raise RuntimeError("MicroTVM is currently only supported on 64-bit host platforms") def __enter__(self): self._enter() @@ -159,8 +159,8 @@ def _calc_max_workspace_usage(src): free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? 
(.+)\) != 0\) {.*') max_usage = 0 alloc_map = {} - for line in src.split('\n'): - if line.strip().startswith('//'): + for line in src.split("\n"): + if line.strip().startswith("//"): continue match = alloc_re.match(line) if match is not None: @@ -198,7 +198,7 @@ def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, micro module for the target device """ temp_dir = _util.tempdir() - lib_obj_path = temp_dir.relpath('dev_lib.obj') + lib_obj_path = temp_dir.relpath("dev_lib.obj") # TODO use dev config to dispatch on the type of C codegen to run through # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV) c_mod.export_library( @@ -252,7 +252,7 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, c_mod.export_library('dev_lib.obj', fcompile=fcompile) """ assert (lib_headers is None) == (lib_include_paths is None), \ - 'must specify both `lib_headers` and `lib_include_paths` or neither' + "must specify both `lib_headers` and `lib_include_paths` or neither" if lib_src_paths is None: lib_src_paths = [] @@ -260,39 +260,39 @@ def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, lib_include_paths = [] include_options = [] for include_path in lib_include_paths: - include_options.append('-I') + include_options.append("-I") include_options.append(include_path) create_micro_lib = tvm.micro.device.get_device_funcs( - dev_config['device_id'])['create_micro_lib'] - mem_layout = dev_config['mem_layout'] + dev_config["device_id"])["create_micro_lib"] + mem_layout = dev_config["mem_layout"] def compile_func(obj_path, src_path, **kwargs): if isinstance(obj_path, list): obj_path = obj_path[0] if isinstance(src_path, list): src_path = src_path[0] - options = kwargs.get('options', []) + options = kwargs.get("options", []) options += include_options # check that workspace allocations don't exceed available workspace memory with open(src_path) as f: src_contents = f.read() max_ws_usage = 
_calc_max_workspace_usage(src_contents) - available_mem = mem_layout['workspace']['size'] + available_mem = mem_layout["workspace"]["size"] if max_ws_usage > available_mem: - raise RuntimeError(f'workspace allocations in library ({max_ws_usage}) ' - f'exceed available memory ({available_mem})') + raise RuntimeError(f"workspace allocations in library ({max_ws_usage}) " + f"exceed available memory ({available_mem})") # inject headers into new source path, if requested if lib_headers: - headers_to_inject = '\n'.join(map(lambda s: f'#include <{s}>', lib_headers)) + '\n' + headers_to_inject = "\n".join(map(lambda s: f"#include <{s}>", lib_headers)) + "\n" new_src_contents = headers_to_inject + src_contents tmp_dir = _util.tempdir() src_path = tmp_dir.relpath(os.path.basename(src_path)) - with open(src_path, 'w') as f: + with open(src_path, "w") as f: f.write(new_src_contents) create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths) - return _cc.cross_compiler(compile_func, output_format='obj') + return _cc.cross_compiler(compile_func, output_format="obj") def get_micro_host_driven_dir(): @@ -304,8 +304,8 @@ def get_micro_host_driven_dir(): directory path """ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_host_driven_dir = os.path.join(micro_dir, '..', '..', '..', - 'src', 'runtime', 'micro', 'host_driven') + micro_host_driven_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "host_driven") return micro_host_driven_dir @@ -318,9 +318,9 @@ def get_micro_device_dir(): directory path """ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, '..', '..', '..', - 'src', 'runtime', 'micro', 'device') + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") return micro_device_dir -tvm._ffi._init_api('tvm.micro', 'tvm.micro.base') +tvm._ffi._init_api("tvm.micro", 
"tvm.micro.base") diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index 9a2b46ac0cd7..6d4b8f74aacb 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -18,8 +18,8 @@ import os from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = 'arm.stm32f746xx' -TOOLCHAIN_PREFIX = 'arm-none-eabi-' +DEVICE_ID = "arm.stm32f746xx" +TOOLCHAIN_PREFIX = "arm-none-eabi-" WORD_SIZE_BITS = 32 # # [Device Memory Layout] @@ -29,14 +29,14 @@ BASE_ADDR = 0x20000000 AVAILABLE_MEM = 320000 DEFAULT_SECTION_CONSTRAINTS = { - 'text': (18000, MemConstraint.ABSOLUTE_BYTES), - 'rodata': (100, MemConstraint.ABSOLUTE_BYTES), - 'data': (100, MemConstraint.ABSOLUTE_BYTES), - 'bss': (600, MemConstraint.ABSOLUTE_BYTES), - 'args': (4096, MemConstraint.ABSOLUTE_BYTES), - 'heap': (100.0, MemConstraint.WEIGHT), - 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), - 'stack': (32, MemConstraint.ABSOLUTE_BYTES), + "text": (18000, MemConstraint.ABSOLUTE_BYTES), + "rodata": (100, MemConstraint.ABSOLUTE_BYTES), + "data": (100, MemConstraint.ABSOLUTE_BYTES), + "bss": (600, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (100.0, MemConstraint.WEIGHT), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (32, MemConstraint.ABSOLUTE_BYTES), } def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): @@ -66,22 +66,22 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N options += [ # TODO(weberlo): make a debug flag - '-O2', - '-march=armv7e-m', - '-mcpu=cortex-m7', - '-mlittle-endian', - '-mfloat-abi=hard', - '-mfpu=fpv5-sp-d16', - '-mthumb', - '-ffast-math', - '-gdwarf-5', - '-DARM_MATH_CM7', - '-D__FPU_PRESENT=1U', - '-DARM_MATH_DSP', - '-Wno-unused-variable', - '-Wno-unused-parameter', - '-I{}'.format(os.environ['CMSIS_ST_PATH']), - 
'-I{}/Core/Include'.format(os.environ['CMSIS_ST_PATH']) + "-O2", + "-march=armv7e-m", + "-mcpu=cortex-m7", + "-mlittle-endian", + "-mfloat-abi=hard", + "-mfpu=fpv5-sp-d16", + "-mthumb", + "-ffast-math", + "-gdwarf-5", + "-DARM_MATH_CM7", + "-D__FPU_PRESENT=1U", + "-DARM_MATH_DSP", + "-Wno-unused-variable", + "-Wno-unused-parameter", + "-I{}".format(os.environ["CMSIS_ST_PATH"]), + "-I{}/Core/Include".format(os.environ["CMSIS_ST_PATH"]) ] create_micro_lib_base( obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, @@ -110,19 +110,19 @@ def generate_config(server_addr, server_port, section_constraints=None): if section_constraints is None: section_constraints = DEFAULT_SECTION_CONSTRAINTS return { - 'device_id': DEVICE_ID, - 'toolchain_prefix': TOOLCHAIN_PREFIX, - 'mem_layout': gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints), - 'word_size_bits': WORD_SIZE_BITS, - 'thumb_mode': True, - 'use_device_timer': True, - 'comms_method': 'openocd', - 'server_addr': server_addr, - 'server_port': server_port, + "device_id": DEVICE_ID, + "toolchain_prefix": TOOLCHAIN_PREFIX, + "mem_layout": gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints), + "word_size_bits": WORD_SIZE_BITS, + "thumb_mode": True, + "use_device_timer": True, + "comms_method": "openocd", + "server_addr": server_addr, + "server_port": server_port, } register_device(DEVICE_ID, { - 'create_micro_lib': create_micro_lib, - 'generate_config': generate_config, + "create_micro_lib": create_micro_lib, + "generate_config": generate_config, }) diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py index 4d42bff8ebbc..767284c9c254 100644 --- a/python/tvm/micro/device/base.py +++ b/python/tvm/micro/device/base.py @@ -102,18 +102,18 @@ def create_micro_lib_base( # look at these (specifically `strip`): # https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size base_compile_cmd = [ - 
f'{toolchain_prefix}gcc', - '-std=c11', - '-Wall', - '-Wextra', - '--pedantic', - '-c', - '-g', - '-nostartfiles', - '-nodefaultlibs', - '-nostdlib', - '-fdata-sections', - '-ffunction-sections', + f"{toolchain_prefix}gcc", + "-std=c11", + "-Wall", + "-Wextra", + "--pedantic", + "-c", + "-g", + "-nostartfiles", + "-nodefaultlibs", + "-nostdlib", + "-fdata-sections", + "-ffunction-sections", ] if options is not None: base_compile_cmd += options @@ -126,24 +126,24 @@ def create_micro_lib_base( if lib_type == LibType.RUNTIME: dev_dir = _get_device_source_dir(device_id) - dev_src_paths = glob.glob(f'{dev_dir}/*.[csS]') + dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]") # there needs to at least be a utvm_timer.c file assert dev_src_paths - assert 'utvm_timer.c' in map(os.path.basename, dev_src_paths) + assert "utvm_timer.c" in map(os.path.basename, dev_src_paths) src_paths += dev_src_paths elif lib_type == LibType.OPERATOR: # create a temporary copy of the operator source, so we can inject the dev lib # header without modifying the original. 
- temp_src_path = tmp_dir.relpath('temp.c') - with open(in_src_path, 'r') as f: + temp_src_path = tmp_dir.relpath("temp.c") + with open(in_src_path, "r") as f: src_lines = f.read().splitlines() src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"') - with open(temp_src_path, 'w') as f: - f.write('\n'.join(src_lines)) + with open(temp_src_path, "w") as f: + f.write("\n".join(src_lines)) new_in_src_path = temp_src_path else: - raise RuntimeError('unknown lib type') + raise RuntimeError("unknown lib type") src_paths += [new_in_src_path] @@ -151,23 +151,23 @@ def create_micro_lib_base( if lib_src_paths is not None: src_paths += lib_src_paths - # print(f'include paths: {include_paths}') + # print(f"include paths: {include_paths}") for path in include_paths: - base_compile_cmd += ['-I', path] + base_compile_cmd += ["-I", path] prereq_obj_paths = [] # print(src_paths) for src_path in src_paths: - curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix('.o').name) + curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix(".o").name) assert curr_obj_path not in prereq_obj_paths prereq_obj_paths.append(curr_obj_path) - curr_compile_cmd = base_compile_cmd + [src_path, '-o', curr_obj_path] + curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path] # TODO(weberlo): make compilation fail if there are any warnings run_cmd(curr_compile_cmd) - ld_cmd = [f'{toolchain_prefix}ld', '-relocatable'] + ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"] ld_cmd += prereq_obj_paths - ld_cmd += ['-o', out_obj_path] + ld_cmd += ["-o", out_obj_path] run_cmd(ld_cmd) @@ -195,7 +195,7 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints section_constraints: Optional[Dict[str, [Number, MemConstraint]]] maps section name to the quantity of available memory """ - assert word_size_bits in (32, 64), 'only 32- or 64-bit devices are supported now' + assert word_size_bits in (32, 64), "only 32- or 64-bit devices are supported now" 
word_size_bytes = word_size_bits // 8 byte_sum = sum(x[0] for x in section_constraints.values() @@ -212,18 +212,18 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints (val, cons_type) = section_constraints[section] if cons_type == MemConstraint.ABSOLUTE_BYTES: assert val % word_size_bytes == 0, \ - f'constraint {val} for {section} section is not word-aligned' + f"constraint {val} for {section} section is not word-aligned" size = val res[section] = { - 'start': curr_addr, - 'size': size, + "start": curr_addr, + "size": size, } else: size = int((val / weight_sum) * available_weight_mem) size = (size // word_size_bytes) * word_size_bytes res[section] = { - 'start': curr_addr, - 'size': size, + "start": curr_addr, + "size": size, } curr_addr += size @@ -232,5 +232,5 @@ def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints def _get_device_source_dir(device_id): """Grabs the source directory for device-specific uTVM files""" - dev_subdir = '/'.join(device_id.split('.')) - return get_micro_device_dir() + '/' + dev_subdir + dev_subdir = "/".join(device_id.split(".")) + return get_micro_device_dir() + "/" + dev_subdir diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py index 55e0ac326f2d..0cf29874ab57 100644 --- a/python/tvm/micro/device/host.py +++ b/python/tvm/micro/device/host.py @@ -19,21 +19,21 @@ from . 
import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = 'host' -TOOLCHAIN_PREFIX = '' +DEVICE_ID = "host" +TOOLCHAIN_PREFIX = "" WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32 # we pretend we only have 320kb in the default case, so we can use `gen_mem_layout` DEFAULT_AVAILABLE_MEM = 3200000 DEFAULT_SECTION_CONSTRAINTS = { - 'text': (20480, MemConstraint.ABSOLUTE_BYTES), - 'rodata': (20480, MemConstraint.ABSOLUTE_BYTES), - 'data': (768, MemConstraint.ABSOLUTE_BYTES), - 'bss': (4096, MemConstraint.ABSOLUTE_BYTES), - 'args': (4096, MemConstraint.ABSOLUTE_BYTES), - 'heap': (262144, MemConstraint.ABSOLUTE_BYTES), - 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), - 'stack': (80, MemConstraint.ABSOLUTE_BYTES), + "text": (20480, MemConstraint.ABSOLUTE_BYTES), + "rodata": (20480, MemConstraint.ABSOLUTE_BYTES), + "data": (768, MemConstraint.ABSOLUTE_BYTES), + "bss": (4096, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (262144, MemConstraint.ABSOLUTE_BYTES), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (80, MemConstraint.ABSOLUTE_BYTES), } def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): @@ -61,9 +61,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N else: options = list(options) # Cannot increase optimization level on host due to code loading method. 
- options.append('-O0') - if sys.maxsize > 2**32 and sys.platform.startswith('linux'): - options += ['-mcmodel=large'] + options.append("-O0") + if sys.maxsize > 2**32 and sys.platform.startswith("linux"): + options += ["-mcmodel=large"] create_micro_lib_base( obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, lib_src_paths=lib_src_paths) @@ -98,19 +98,19 @@ def generate_config(available_mem=None, section_constraints=None): # need to zero out all start addresses, because they don't make sense for a # host device (the memory region is allocated in the backend) for section in mem_layout: - mem_layout[section]['start'] = 0 + mem_layout[section]["start"] = 0 return { - 'device_id': DEVICE_ID, - 'toolchain_prefix': TOOLCHAIN_PREFIX, - 'mem_layout': mem_layout, - 'word_size_bits': WORD_SIZE_BITS, - 'thumb_mode': False, - 'use_device_timer': False, - 'comms_method': 'host', + "device_id": DEVICE_ID, + "toolchain_prefix": TOOLCHAIN_PREFIX, + "mem_layout": mem_layout, + "word_size_bits": WORD_SIZE_BITS, + "thumb_mode": False, + "use_device_timer": False, + "comms_method": "host", } register_device(DEVICE_ID, { - 'create_micro_lib': create_micro_lib, - 'generate_config': generate_config, + "create_micro_lib": create_micro_lib, + "generate_config": generate_config, }) diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py index b9f55dff0ce7..32881cab6ba9 100644 --- a/python/tvm/micro/device/riscv_spike.py +++ b/python/tvm/micro/device/riscv_spike.py @@ -18,19 +18,19 @@ from . 
import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint -DEVICE_ID = 'riscv_spike' -TOOLCHAIN_PREFIX = 'riscv64-unknown-elf-' +DEVICE_ID = "riscv_spike" +TOOLCHAIN_PREFIX = "riscv64-unknown-elf-" WORD_SIZE_BITS = 64 DEFAULT_SECTION_CONSTRAINTS = { - 'text': (18000, MemConstraint.ABSOLUTE_BYTES), - 'rodata': (128, MemConstraint.ABSOLUTE_BYTES), - 'data': (128, MemConstraint.ABSOLUTE_BYTES), - 'bss': (2048, MemConstraint.ABSOLUTE_BYTES), - 'args': (4096, MemConstraint.ABSOLUTE_BYTES), - 'heap': (100.0, MemConstraint.WEIGHT), - 'workspace': (64000, MemConstraint.ABSOLUTE_BYTES), - 'stack': (32, MemConstraint.ABSOLUTE_BYTES), + "text": (18000, MemConstraint.ABSOLUTE_BYTES), + "rodata": (128, MemConstraint.ABSOLUTE_BYTES), + "data": (128, MemConstraint.ABSOLUTE_BYTES), + "bss": (2048, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (100.0, MemConstraint.WEIGHT), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (32, MemConstraint.ABSOLUTE_BYTES), } def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): @@ -90,19 +90,19 @@ def generate_config(base_addr, available_mem, server_addr, server_port, section_ if section_constraints is None: section_constraints = DEFAULT_SECTION_CONSTRAINTS return { - 'device_id': DEVICE_ID, - 'toolchain_prefix': TOOLCHAIN_PREFIX, - 'mem_layout': gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints), - 'word_size_bits': WORD_SIZE_BITS, - 'thumb_mode': False, - 'use_device_timer': False, - 'comms_method': 'openocd', - 'server_addr': server_addr, - 'server_port': server_port, + "device_id": DEVICE_ID, + "toolchain_prefix": TOOLCHAIN_PREFIX, + "mem_layout": gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints), + "word_size_bits": WORD_SIZE_BITS, + "thumb_mode": False, + "use_device_timer": False, + "comms_method": "openocd", + "server_addr": server_addr, + "server_port": server_port, } 
register_device(DEVICE_ID, { - 'create_micro_lib': create_micro_lib, - 'generate_config': generate_config, + "create_micro_lib": create_micro_lib, + "generate_config": generate_config, }) diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 6a1d928352f4..7731efe04dfd 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -342,10 +342,7 @@ def visitLocalVar(self, ctx): return local_var def visitGraphVar(self, ctx): - graph_var_idx = int(ctx.NAT().getText()) - if graph_var_idx >= len(self.graph_expr): - raise ParseError(f"graph var `%{graph_var_idx}` is unbound") - return self.graph_expr[graph_var_idx] + return self.graph_expr[int(ctx.NAT().getText())] def visit_list(self, ctx_list) -> List[Any]: """"Visit a list of contexts.""" diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 2ad75533e807..6bdec67617e1 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -262,7 +262,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_tranpose_nchw.arm_cpu") return strategy -@bitserial_conv2d_strategy.register(["arm_cpu", "micro_dev"]) +@bitserial_conv2d_strategy.register("arm_cpu") def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """bitserial_conv2d x86 strategy""" strategy = _op.OpStrategy() @@ -281,7 +281,7 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): raise ValueError("Data layout {} not supported.".format(layout)) return strategy -@bitserial_dense_strategy.register(["arm_cpu", "micro_dev"]) +@bitserial_dense_strategy.register("arm_cpu") def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): """bitserial_dense arm cpu strategy""" strategy = _op.OpStrategy() diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py index 6b2eda8f050e..c40296e50713 100644 --- a/python/tvm/target/arm_isa.py +++ 
b/python/tvm/target/arm_isa.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Defines functions to analyze availble opcodes in the ARM ISA.""" +"""Defines functions to analyze available opcodes in the ARM ISA.""" ARM_ISA_MAP = { diff --git a/src/ir/error.cc b/src/ir/error.cc index 67694342db45..9d498288d2ba 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -62,21 +62,16 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { CHECK(has_errs != this->node_to_error_.end()); - const auto& error_indices = has_errs->second; + const auto& error_indicies = has_errs->second; std::stringstream err_msg; - if (error_indices.size() != 0) { - err_msg << rang::fg::red; - err_msg << " "; - // the errors are in reverse order, so print them with a reversed iteration - err_msg << this->errors_[error_indices[error_indices.size()-1]].what(); - for (int i = error_indices.size() - 2; i >= 0; i--) { - size_t err_idx = error_indices[i]; - err_msg << "; " << this->errors_[err_idx].what(); - } - err_msg << rang::fg::reset; + err_msg << rang::fg::red; + err_msg << " "; + for (auto index : error_indicies) { + err_msg << this->errors_[index].what() << "; "; } + err_msg << rang::fg::reset; // Setup error map. auto it = error_maps.find(global); diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c index 3a840e7a7861..9fabce6bdc1e 100644 --- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c +++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c @@ -32,7 +32,8 @@ extern "C" { #include #include -// TODO(areusch): compiler errors say volatile qualifier is discarded. should we just get rid of em? +// TODO(weberlo, areusch): compiler errors say volatile qualifier is discarded. +// should we just get rid of em? 
void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL; int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL; void (* volatile TVMAPISetLastError_)(const char*) = NULL; @@ -52,9 +53,9 @@ void TVMAPISetLastError(const char* msg) { } void *memset(void *s, int c, size_t n) { - char *p = (char*) s; + char *p = (char*) s; // NOLINT(readability/casting): linter is configured for c++ while (n > 0) { - *p = (char) c; + *p = (char) c; // NOLINT(readability/casting): linter is configured for c++ p++; n--; } @@ -62,21 +63,23 @@ void *memset(void *s, int c, size_t n) { } void *memmove(void *to, const void *from, size_t n) { - // TODO will need to factor memmove calls into workspace size calculation + // TODO(weberlo, areusch): will need to factor memmove calls into workspace size calculation + // NOLINTNEXTLINE(readability/casting): linter is configured for c++ char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8); if (temp == NULL) { return NULL; } - const char *from_pp = (char*) from; + const char *from_pp = (char*) from; // NOLINT(readability/casting): linter is configured for c++ for (size_t i = 0; i < n; i++) { temp[i] = from_pp[i]; } - char *to_pp = (char*) to; + char *to_pp = (char*) to; // NOLINT(readability/casting): linter is configured for c++ for (size_t i = 0; i < n; i++) { to_pp[i] = temp[i]; } + // NOLINTNEXTLINE(readability/casting): linter is configured for c++ if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) { return NULL; } diff --git a/src/runtime/micro/host_driven/utvm_runtime.c b/src/runtime/micro/host_driven/utvm_runtime.c index f05fdb5fe2c1..2f2f0c1e0dea 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.c +++ b/src/runtime/micro/host_driven/utvm_runtime.c @@ -34,7 +34,8 @@ extern "C" { #include "utvm_runtime.h" -// TODO(areusch): move defines into header +// TODO(weberlo, areusch): move defines into header +// TODO(weberlo, areusch): unify TASK_QUEUE_SIZE and 
MicroSession::kTaskQueueCapacity. #define TASK_QUEUE_SIZE 20 volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { }; volatile uint32_t utvm_num_tasks = 0; @@ -139,7 +140,7 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - // TODO(areusch): add dev type check + // TODO(weberlo, areusch): add dev type check if (utvm_num_active_allocs == 0) { TVMAPISetLastError("free called with no active workspace allocations"); // Reset allocations and workspace (for future task executions). @@ -153,7 +154,6 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { // it's the first allocation utvm_alloc_ends[0] = NULL; } else { - // TODO(areusch): reverse loop iteration since usually it's the last alloc being freed for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) { if (utvm_alloc_ends[i] == ptr) { utvm_alloc_ends[i + 1] = NULL; @@ -167,8 +167,8 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { if (utvm_alloc_idx == 0) { utvm_workspace_curr = utvm_workspace_start; } else { - // TODO(areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in this - // branch? + // TODO(weberlo, areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in + // this branch? utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1]; } return 0; diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h index 42ca025fc2ab..fc11b70ed8e2 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.h +++ b/src/runtime/micro/host_driven/utvm_runtime.h @@ -63,27 +63,35 @@ typedef struct { } UTVMTask; /*! - * \brief TODO + * \brief microTVM processor startup. + * Expected to reset the stack pointer, configure any hardware required to support the CRT + * (i.e. FPU), and then jump to UTVMMain. */ extern void UTVMInit(); /*! - * \brief TODO + * \brief Start the on-device timer. 
+ * \return UTVMReturnCode indicating the outcome of the operation. */ extern int32_t UTVMTimerStart(); /*! - * \brief TODO + * \brief Stop the on-device timer. + * TODO(areusch): Use an SI specification of timer units here. + * \param err Receives a UTVMReturnCode indicating the outcome of the operation. + * \return elapsed time since UTVMTimerStart returned, in device timer ticks. */ extern uint32_t UTVMTimerStop(int32_t* err); /*! - * \brief TODO + * \brief Main entry point for UTVM runtime. + * Waits for "go" signal, then executes tasks and reports result. Should never return. */ void UTVMMain(); /*! - * \brief TODO + * \brief Function entered when UTVMMain is complete. + * Should never return. The host sets a breakpoint here to detect end of computation. */ void UTVMDone(); diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 2093f385fca0..2d74bc349f42 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -55,9 +55,6 @@ enum class SectionKind : size_t { /*! 
\brief data type for word sizes */ class TargetWordSize { - private: - size_t word_size_bits_; - public: explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} { CHECK(word_size_bits == 32 || word_size_bits == 64) @@ -71,6 +68,9 @@ class TargetWordSize { size_t bits() const { return word_size_bits_; } + + private: + size_t word_size_bits_; }; @@ -94,18 +94,18 @@ class TargetVal { width_bits <= 64 && (width_bits & (width_bits - 1)) == 0) << "width_bits must be a power of 2 in [8, 64], got " << width_bits; - value_ = value & bitmask(); + value_ = value & Bitmask(); } - bool is_initialized() const { return width_bits_ != 0; } + bool IsInitialized() const { return width_bits_ != 0; } size_t width_bits() const { - CHECK(is_initialized()) << "TargetVal is not initialized"; + CHECK(IsInitialized()) << "TargetVal is not initialized"; return width_bits_; } - uint64_t bitmask() const { - CHECK(is_initialized()) << "TargetVal is not initialized"; + uint64_t Bitmask() const { + CHECK(IsInitialized()) << "TargetVal is not initialized"; if (width_bits_ == 64) { return ~0UL; @@ -115,21 +115,21 @@ class TargetVal { } uint32_t uint32() const { - CHECK(is_initialized()) << "TargetVal is not initialized"; + CHECK(IsInitialized()) << "TargetVal is not initialized"; CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is " << width_bits_; - return uint32_t(value_ & bitmask()); + return uint32_t(value_ & Bitmask()); } uint64_t uint64() const { - CHECK(is_initialized()) << "TargetVal is not initialized"; + CHECK(IsInitialized()) << "TargetVal is not initialized"; return value_; } TargetVal& operator=(const TargetVal& other) { - CHECK(other.is_initialized()) << "Cannot assign an uninitialized TargetVal"; + CHECK(other.IsInitialized()) << "Cannot assign an uninitialized TargetVal"; - if (!is_initialized()) { + if (!IsInitialized()) { width_bits_ = other.width_bits_; } @@ -137,12 +137,12 @@ class TargetVal { << "Cannot assign TargetVal with 
width " << other.width_bits_ << "bits to TargetVal with width " << width_bits_ << "bits"; - value_ = other.value_ & bitmask(); + value_ = other.value_ & Bitmask(); return *this; } }; -// TODO(areusch): just get rid of `TargetPtr`. +// TODO(weberlo, areusch): just get rid of `TargetPtr`. /*! \brief absolute device address */ class TargetPtr { public: diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index e30c72b79fb5..c99d79652edb 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -30,7 +30,7 @@ namespace tvm { namespace runtime { -// TODO(weberlo): Handle endianness. +// TODO(weberlo, areusch): Handle endianness. /*! * \brief data encoder for uTVM that builds a host-side buffer @@ -159,7 +159,7 @@ class TargetDataLayoutEncoder { size_t curr_offset_; /*! \brief start address of the encoder in device memory */ TargetPtr start_addr_; - /*! \brief TODO */ + /*! \brief number of bytes available in device memory */ size_t capacity_; /*! \brief number of bytes in a word on the target device */ TargetWordSize word_size_; @@ -178,8 +178,8 @@ TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, template TargetDataLayoutEncoder::Slot::~Slot() { - // TODO(areusch): this can mask the exception thrown by slot allocation... even though that - // doesn't make sense. + // TODO(weberlo, areusch): this can mask the exception thrown by slot allocation... even though + // that doesn't make sense. CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset=" << curr_offset_ << ", size=" << size_; } diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 4cca1efea2ca..5e5db82eb931 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -20,11 +20,11 @@ /*! 
* \file codegen_c_host.cc */ -#include "codegen_c_host.h" +#include #include #include -#include "tvm/target/codegen.h" #include "../build_common.h" +#include "codegen_c_host.h" namespace tvm { namespace codegen { diff --git a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py index 1fdb596b7163..7d3e945fef14 100644 --- a/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py +++ b/topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py @@ -60,9 +60,7 @@ def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, l assert layout == 'NHWC' conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - ########################### - # Config Space Definition # - ########################### + # Config Space Definition N, H, W, CI = get_const_tuple(data.shape) KH, KW, _, CO = get_const_tuple(kernel.shape) n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO) diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py index 2169176c0711..01acbb72caca 100644 --- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -34,7 +34,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): # collisions in the generated source (e.g., if there are multiple operators # in the same module that use the same intrinsic) # - # TODO(areusch): to cut down on memory usage, we should cache each intrinsic + # TODO(weberlo, areusch): to cut down on memory usage, we should cache each intrinsic # instantiation and include it only once, eliminating the need for unique # IDs UNIQ_ID_LEN = 8 @@ -47,7 +47,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): if isinstance(N, tvm.tir.IntImm): N = N.value assert K % 4 == 0 - # TODO(areusch): support more dtypes? + # TODO(weberlo, areusch): support more dtypes? 
assert in_dtype == 'int8' assert out_dtype == 'int32' A = te.placeholder((M, K), name='a', dtype=in_dtype) @@ -124,7 +124,7 @@ def _body(): def gemm_MxKxN_impl(M, K, N, uniq_id): """Emit C code for gemm impl.""" - # TODO(areusch): are there any SIMD tricks to zero out arrays quickly? + # TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly? aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) From 99e4718fc6818dbc5746a6319af693591a4be847 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 27 Apr 2020 13:59:23 -0700 Subject: [PATCH 10/11] address thierry's comments --- python/tvm/autotvm/tuner/callback.py | 6 +++--- python/tvm/autotvm/tuner/tuner.py | 10 +--------- src/runtime/micro/micro_session.cc | 2 -- tests/python/unittest/test_runtime_micro.py | 15 --------------- .../topi/arm_cpu/cortex_m7/micro_kernel/gemm.py | 14 -------------- 5 files changed, 4 insertions(+), 43 deletions(-) diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index 1ae64e2ddfef..cfc1b2c38f85 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -144,12 +144,12 @@ def __del__(self): def _callback(tuner, inputs, results): ctx.ct += len(inputs) - flops = float("inf") + flops = 0 for inp, res in zip(inputs, results): if res.error_no == 0: - flops = min(inp.task.flop / np.mean(res.costs), flops) + flops = inp.task.flop / np.mean(res.costs) - if logger.level > logging.DEBUG: # only print progress bar in non-debug mode + if not logger.isEnabledFor(logging.DEBUG): # only print progress bar in non-debug mode ctx.cur_flops = flops ctx.best_flops = tuner.best_flops diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 0d66d34ac316..2441a4ae642f 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -150,15 +150,7 @@ def tune(self, n_trial, measure_option, 
early_stopping=None, callbacks=(), si_pr i + k + 1, si_prefix, format_si_prefix(flops, si_prefix), format_si_prefix(self.best_flops, si_prefix), res, config) - num_successes = 0 - for result in results: - if isinstance(result.costs[0], float): - num_successes += 1 - if num_successes != len(results): - logger.debug('not counting %d failures towards trial count', - len(results) - num_successes) - i += num_successes - + i += len(results) self.ttl = min(early_stopping + self.best_iter, n_trial) - i self.update(inputs, results) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 672f16dc06f9..0e8e1693448e 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -284,7 +284,6 @@ void MicroSession::FlushTaskQueue() { template void MicroSession::FlushTaskQueuePriv() { - // std::cout << "[MicroSession::FlushTaskQueue]" << std::endl; std::vector prepped_tasks; for (const auto& task : task_queue_) { prepped_tasks.push_back(T(task)); @@ -297,7 +296,6 @@ void MicroSession::FlushTaskQueuePriv() { batch_args_encoder_.buf_size()); // Flush `tasks` to device memory. -// runtime_symbol_map_.Dump(std::cout); TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"]; low_level_device()->Write( dev_tasks_addr, diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 448f53c57f3c..bec74fb6644c 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -30,21 +30,6 @@ DEV_CONFIG_B = micro.device.host.generate_config() TARGET = 'c -device=micro_dev' -# # TODO why do spike examples have memory that starts at 0x10000000, but you -# # should set the base addr as 0x10010000? should somehow help the user to be -# # aware of that. -# # are there always 0x10000 bytes reserved at the beginning of the address space? 
-# BASE_ADDR = 0x10010000 - -# AVAILABLE_MEM = 0x200000 -# DEV_CONFIG_A = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6666) -# DEV_CONFIG_B = micro.device.riscv_spike.generate_config(BASE_ADDR, AVAILABLE_MEM, '127.0.0.1', 6667) -# TARGET = 'c -device=micro_dev' - -# DEV_CONFIG_A = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6666) -# DEV_CONFIG_B = micro.device.arm.stm32f746xx.generate_config('127.0.0.1', 6667) -# TARGET = 'c -device=micro_dev' - def relay_micro_build(func, dev_config, params=None): """Create a graph runtime module with a micro device context from a Relay function. diff --git a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py index 01acbb72caca..9af7bef95b7c 100644 --- a/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -93,20 +93,6 @@ def _reduce_reset(): return ib.get() def _body(): ib = tvm.tir.ir_builder.create() - # # NOTE we need the reset in the body for cases where the buffer - # # we're accumulating into is uninitialized (e.g., if it's the - # # result of a workspace allocation, because there are no guarantees - # # on the contents). 
- # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset", - # cc.access_ptr("w"), - # cc.strides[0])) - # ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update", - # aa.access_ptr("r"), - # bb.access_ptr("r"), - # cc.access_ptr("w"), - # aa.strides[0], - # bb.strides[0], - # cc.strides[0])) ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}", aa.access_ptr("r"), bb.access_ptr("r"), From bef97f1324a514f2c914d50ec874b0c2c9855b7d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 28 Apr 2020 17:09:52 -0700 Subject: [PATCH 11/11] address u99127, liangfu, tmoreau89 comments --- python/tvm/autotvm/measure/local_executor.py | 3 +-- python/tvm/exec/rpc_server.py | 18 +++++++++--------- python/tvm/micro/device/arm/stm32f746xx.py | 1 - 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py index d838a92daa01..a0a826abccf6 100644 --- a/python/tvm/autotvm/measure/local_executor.py +++ b/python/tvm/autotvm/measure/local_executor.py @@ -145,8 +145,7 @@ def submit(self, func, *args, **kwargs): if not self.do_fork: return LocalFutureNoFork(func(*args, **kwargs)) - # TODO why they choose a queue size of 2? add a comment - queue = Queue(2) + queue = Queue(2) # Size of 2 to avoid a race condition with size 1. process = Process(target=call_with_timeout, args=(queue, self.timeout, func, args, kwargs)) process.start() diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index 805c3a8afb1a..e281e58e3879 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -101,20 +101,20 @@ def server_shutdown(): parser.add_argument('--port-end', type=int, default=9199, help='The end search port of the RPC') parser.add_argument('--tracker', type=str, - help=('The address of RPC tracker in host:port format. ' - 'e.g. (10.77.1.234:9190)')) + help=("The address of RPC tracker in host:port format. " + "e.g. 
(10.77.1.234:9190)")) parser.add_argument('--key', type=str, default="", - help='The key used to identify the device type in tracker.') + help="The key used to identify the device type in tracker.") parser.add_argument('--silent', action='store_true', - help='Whether run in silent mode.') + help="Whether run in silent mode.") parser.add_argument('--load-library', type=str, - help='Additional library to load') + help="Additional library to load") parser.add_argument('--no-fork', dest='fork', action='store_false', - help=('Use spawn mode to avoid fork. This option ' - 'is able to avoid potential fork problems with Metal, OpenCL ' - 'and ROCM compilers.')) + help="Use spawn mode to avoid fork. This option \ + is able to avoid potential fork problems with Metal, OpenCL \ + and ROCM compilers.") parser.add_argument('--custom-addr', type=str, - help='Custom IP Address to Report to RPC Tracker') + help="Custom IP Address to Report to RPC Tracker") parser.add_argument('--utvm-dev-config', type=str, help=('JSON config file for the target device (if using MicroTVM). ' 'This file should contain serialized output similar to that returned ' diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index 6d4b8f74aacb..746958504eda 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -67,7 +67,6 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=N options += [ # TODO(weberlo): make a debug flag "-O2", - "-march=armv7e-m", "-mcpu=cortex-m7", "-mlittle-endian", "-mfloat-abi=hard",