From 8d7249688771bb6806595931586d95648036f383 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 30 Apr 2020 10:59:33 -0700 Subject: [PATCH] [RUNTIME][uTVM] AutoTVM + uTVM for Cortex-M7 (#5417) * Prototype for micro TVM. * Cleanup and sync micro tvm prototype. * Use /std:c++14 with MSVC. * Per tqchen: project has already moved to C++14 * Presubmit failed for code that built locally on gcc. * fix ASF lint, and fix add_asf_header too * Compiles with USE_MICRO=OFF. * Cleanup TargetPtr and word size representations. * fix compile warning * address logan's comments * address logan and liangfu comments * address thierry's comments * address u99127, liangfu, tmoreau89 comments Co-authored-by: Logan Weber --- 3rdparty/dmlc-core | 2 +- CMakeLists.txt | 7 +- Makefile | 3 +- include/tvm/tir/stmt.h | 2 + python/tvm/autotvm/measure/local_executor.py | 2 +- python/tvm/autotvm/measure/measure_methods.py | 7 +- python/tvm/autotvm/task/relay_integration.py | 1 + python/tvm/autotvm/tuner/callback.py | 2 +- python/tvm/autotvm/tuner/ga_tuner.py | 8 +- python/tvm/contrib/binutil.py | 29 +- python/tvm/exec/rpc_server.py | 32 +- python/tvm/micro/__init__.py | 5 +- python/tvm/micro/base.py | 143 ++++- python/tvm/micro/device/__init__.py | 3 +- python/tvm/micro/device/arm/stm32f746xx.py | 98 ++-- python/tvm/micro/device/base.py | 96 +++- python/tvm/micro/device/host.py | 86 +-- python/tvm/micro/device/riscv_spike.py | 77 ++- python/tvm/relay/_parser.py | 4 +- python/tvm/relay/op/strategy/arm_cpu.py | 32 +- python/tvm/rpc/server.py | 9 +- python/tvm/runtime/module.py | 1 - python/tvm/runtime/ndarray.py | 2 +- python/tvm/target/arm_isa.py | 34 ++ src/driver/driver_api.cc | 2 +- .../micro/device/arm/stm32f746xx/utvm_init.s | 5 - .../micro/device/arm/stm32f746xx/utvm_timer.c | 113 ++-- src/runtime/micro/device/host/utvm_timer.c | 20 +- .../micro/device/riscv_spike/utvm_init.s | 23 + .../micro/device/riscv_spike/utvm_timer.c | 42 ++ .../host_driven/utvm_device_dylib_redirect.c | 44 +- src/runtime/micro/host_driven/utvm_runtime.c | 149 +++-- src/runtime/micro/host_driven/utvm_runtime.h | 52 +- src/runtime/micro/host_low_level_device.cc | 19 +- src/runtime/micro/low_level_device.h | 9 +- src/runtime/micro/micro_common.cc | 18 +- src/runtime/micro/micro_common.h | 171 ++++-- src/runtime/micro/micro_device_api.cc | 50 +- src/runtime/micro/micro_module.cc | 10 +- src/runtime/micro/micro_section_allocator.h | 49 +- src/runtime/micro/micro_session.cc | 525 ++++++++++++------ src/runtime/micro/micro_session.h | 130 ++++- src/runtime/micro/openocd_low_level_device.cc | 16 +- .../micro/target_data_layout_encoder.h | 53 +- src/runtime/micro/tcl_socket.cc | 1 + src/runtime/rpc/rpc_session.cc | 35 +- src/target/source/codegen_c.cc | 4 + src/target/source/codegen_c_host.cc | 22 +- src/target/source/codegen_c_host.h | 7 +- src/target/target.cc | 2 +- tests/lint/add_asf_header.py | 4 +- tests/python/unittest/test_runtime_micro.py | 195 +++++-- topi/python/topi/arm_cpu/__init__.py | 1 + topi/python/topi/arm_cpu/conv2d.py | 13 + .../topi/arm_cpu/conv2d_spatial_pack.py | 4 +- .../python/topi/arm_cpu/cortex_m7/__init__.py | 20 + .../topi/arm_cpu/cortex_m7/conv2d/__init__.py | 19 + .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 175 ++++++ .../arm_cpu/cortex_m7/conv2d/direct_simd.py | 163 ++++++ .../cortex_m7/micro_kernel/__init__.py | 16 + .../arm_cpu/cortex_m7/micro_kernel/gemm.py | 207 +++++++ topi/python/topi/generic/default.py | 2 +- .../python/topi/testing/conv2d_nhwc_python.py | 6 +- 63 files changed, 2306 insertions(+), 775 deletions(-) create mode 100644 python/tvm/target/arm_isa.py create mode 100644 src/runtime/micro/device/riscv_spike/utvm_init.s create mode 100644 src/runtime/micro/device/riscv_spike/utvm_timer.c create mode 100644 topi/python/topi/arm_cpu/cortex_m7/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py create mode 100644 topi/python/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 981b1c32f916..808f485387f9 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 981b1c32f91668e669ee376856f92f36cfd2a351 +Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28 diff --git a/CMakeLists.txt b/CMakeLists.txt index fc7c67c83a48..a0ebdf037756 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake) include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/CoreML.cmake) +include(CheckCXXCompilerFlag) if(NOT MSVC) - include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) - message(STATUS "Build with c++14") set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}") set(CMAKE_CUDA_STANDARD 14) +else() + check_cxx_compiler_flag("/std:c++14" SUPPORT_CXX14) + set(CMAKE_CXX_FLAGS "/std:c++14 ${CMAKE_CXX_FLAGS}") + set(CMAKE_CUDA_STANDARD 14) endif() add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) diff --git a/Makefile b/Makefile index 757b3300f7d5..e54b9a93b230 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,8 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc cpplint: python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include; - python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \ + python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \ + include src \ examples/extension/src examples/graph_executor/src pylint: diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index aed8b5c77ae5..0d3cf42d8190 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope"; constexpr const char* reduce_scope = "reduce_scope"; /*! \brief Mark region is guarded by the pragma extension */ constexpr const char* pragma_scope_prefix = "pragma_"; +/*! \brief Import C source or file into the final code gen module */ +constexpr const char* pragma_import_c = "pragma_import_c"; /*! \brief Import llvm source or file into the final code gen module */ constexpr const char* pragma_import_llvm = "pragma_import_llvm"; /*! \brief Try to modify the AST to support Tensor Core */ diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py index cf81e2b50e50..a0a826abccf6 100644 --- a/python/tvm/autotvm/measure/local_executor.py +++ b/python/tvm/autotvm/measure/local_executor.py @@ -145,7 +145,7 @@ def submit(self, func, *args, **kwargs): if not self.do_fork: return LocalFutureNoFork(func(*args, **kwargs)) - queue = Queue(2) + queue = Queue(2) # Size of 2 to avoid a race condition with size 1. process = Process(target=call_with_timeout, args=(queue, self.timeout, func, args, kwargs)) process.start() diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 6533e75eef93..185ed7d05019 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -245,6 +245,8 @@ def get_build_kwargs(self): if 'cuda' in self.task.target.keys: kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.')) + if self.task.target.device_name == 'micro_dev': + kwargs.setdefault('build_option', {})['disable_vectorize'] = True return kwargs @@ -273,8 +275,9 @@ def run(self, measure_inputs, build_results): if isinstance(res, Exception): # executor error or timeout results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, self.timeout, time.time())) - else: - results.append(res) + raise Exception(f'encountered exception during measurement: {results}') + + results.append(res) return results diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index de183db41e2c..f3edfb01dc07 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -48,6 +48,7 @@ def _lower(mod, grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(mod["main"]) return + # default case # Try graph codegen first to extract autotvm tasks. # If failed to compile, then fallback to use VM compiler. diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index eede450eaeaf..cfc1b2c38f85 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -149,7 +149,7 @@ def _callback(tuner, inputs, results): if res.error_no == 0: flops = inp.task.flop / np.mean(res.costs) - if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + if not logger.isEnabledFor(logging.DEBUG): # only print progress bar in non-debug mode ctx.cur_flops = flops ctx.best_flops = tuner.best_flops diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index a4c36bcd385e..da10f73d5a53 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -50,7 +50,11 @@ def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1): # space info self.space = task.config_space - self.dims = [len(x) for x in self.space.space_map.values()] + self.dim_keys = [] + self.dims = [] + for k, v in self.space.space_map.items(): + self.dim_keys.append(k) + self.dims.append(len(v)) self.visited = set([]) @@ -123,7 +127,7 @@ def update(self, inputs, results): if len(self.visited) < len(self.space): while knob2point(tmp_gene, self.dims) in self.visited: j = np.random.randint(len(self.dims)) - tmp_gene[j] = np.random.randint(self.dims[j]) + tmp_gene[j] = np.random.randint(self.dims[j]) # pylint: disable=invalid-sequence-index next_genes.append(tmp_gene) self.visited.add(knob2point(tmp_gene, self.dims)) else: diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 521e0885548c..21e06df9f7f0 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -21,7 +21,9 @@ import tvm._ffi from . import util +# TODO does this file still belong in `contrib`. is it too µTVM-specific? +# TODO shouldn't need so many `ALIGN` directives RELOCATION_LD_SCRIPT_TEMPLATE = """ /* linker symbol for use in UTVMInit */ _utvm_stack_pointer_init = 0x{stack_pointer_init:x}; @@ -118,7 +120,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): size of the section in bytes """ if not os.path.isfile(binary_path): - raise RuntimeError("no such file \"{}\"".format(binary_path)) + raise RuntimeError('no such file "{}"'.format(binary_path)) # We use the "-A" flag here to get the ".rodata" section's size, which is # not included by default. size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) @@ -160,6 +162,10 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # padding for most cases, but symbols can be arbitrarily large, so this # isn't bulletproof. return section_size + 32 + + # NOTE: in the past, section_size has been wrong on x86. it may be + # inconsistent. TODO: maybe stop relying on `*size` to give us the size and + # instead read the section with `*objcopy` and count the bytes. return section_size @@ -206,11 +212,13 @@ def tvm_callback_relocate_binary( rel_bin : bytearray the relocated binary """ + assert text_start < rodata_start < data_start < bss_start < stack_end stack_pointer_init = stack_end - word_size ld_script_contents = "" # TODO(weberlo): There should be a better way to configure this for different archs. + # TODO is this line even necessary? if "riscv" in toolchain_prefix: - ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n" + ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( word_size=word_size, text_start=text_start, @@ -221,7 +229,7 @@ def tvm_callback_relocate_binary( tmp_dir = util.tempdir() rel_obj_path = tmp_dir.relpath("relocated.obj") - rel_ld_script_path = tmp_dir.relpath("relocated.lds") + rel_ld_script_path = tmp_dir.relpath("relocate.lds") with open(rel_ld_script_path, "w") as f: f.write(ld_script_contents) run_cmd([ @@ -229,8 +237,23 @@ def tvm_callback_relocate_binary( binary_path, "-T", rel_ld_script_path, "-o", rel_obj_path]) + with open(rel_obj_path, "rb") as f: rel_bin = bytearray(f.read()) + + gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR") + if gdb_init_dir is not None: + gdb_init_path = f"{gdb_init_dir}/.gdbinit" + with open(gdb_init_path, "r") as f: + gdbinit_contents = f.read().split("\n") + new_contents = [] + for line in gdbinit_contents: + new_contents.append(line) + if line.startswith("target"): + new_contents.append(f"add-symbol-file {rel_obj_path}") + with open(gdb_init_path, "w") as f: + f.write("\n".join(new_contents)) + return rel_bin diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index dbb690267e2a..e281e58e3879 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -20,6 +20,7 @@ import argparse import ast +import json import multiprocessing import sys import logging @@ -41,7 +42,7 @@ def main(args): tracker_addr = (url, port) if not args.key: raise RuntimeError( - "Need key to present type of resource when tracker is available") + 'Need key to present type of resource when tracker is available') else: tracker_addr = None @@ -75,8 +76,8 @@ def init_utvm(args): dev_config = json.load(dev_conf_file) else: dev_config_args = ast.literal_eval(args.utvm_dev_config_args) - default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config'] - dev_config = default_config_func(*dev_config_args) + generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['generate_config'] + dev_config = generate_config_func(*dev_config_args) if args.utvm_dev_config or args.utvm_dev_id: # add MicroTVM overrides @@ -100,8 +101,8 @@ def server_shutdown(): parser.add_argument('--port-end', type=int, default=9199, help='The end search port of the RPC') parser.add_argument('--tracker', type=str, - help="The address of RPC tracker in host:port format. " - "e.g. (10.77.1.234:9190)") + help=("The address of RPC tracker in host:port format. " + "e.g. (10.77.1.234:9190)")) parser.add_argument('--key', type=str, default="", help="The key used to identify the device type in tracker.") parser.add_argument('--silent', action='store_true', @@ -110,17 +111,24 @@ def server_shutdown(): help="Additional library to load") parser.add_argument('--no-fork', dest='fork', action='store_false', help="Use spawn mode to avoid fork. This option \ - is able to avoid potential fork problems with Metal, OpenCL \ - and ROCM compilers.") + is able to avoid potential fork problems with Metal, OpenCL \ + and ROCM compilers.") parser.add_argument('--custom-addr', type=str, help="Custom IP Address to Report to RPC Tracker") parser.add_argument('--utvm-dev-config', type=str, - help='JSON config file for the target device (if using MicroTVM)') - parser.add_argument('--utvm-dev-id', type=str, - help='Unique ID for the target device (if using MicroTVM)') + help=('JSON config file for the target device (if using MicroTVM). ' + 'This file should contain serialized output similar to that returned ' + "from the device module's generate_config. Can't be specified when " + '--utvm-dev-config-args is specified.')) parser.add_argument('--utvm-dev-config-args', type=str, - help=('Python list of literals required to generate a default' - ' MicroTVM config (if --utvm-dev-id is specified)')) + help=("Arguments to the device module's generate_config function. " + 'Must be a python literal parseable by literal_eval. If specified, ' + "the device configuration is generated using the device module's " + "generate_config. Can't be specified when --utvm-dev-config is " + "specified.")) + parser.add_argument('--utvm-dev-id', type=str, + help=('Unique ID for the target device (if using MicroTVM). Should ' + 'match the name of a module underneath tvm.micro.device).')) parser.set_defaults(fork=True) args = parser.parse_args() diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 9e984c08fe2c..7c1389cc4eef 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -17,6 +17,7 @@ """MicroTVM module for bare-metal backends""" from ..contrib import binutil -from .base import Session, create_micro_mod, cross_compiler -from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir +from .base import DEVICE_SECTIONS +from .base import Session, create_micro_mod, cross_compiler, LibType +from .base import get_micro_host_driven_dir, get_micro_device_dir from . import device diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 9f50f9855303..bf4fd0ac9b76 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -19,6 +19,7 @@ from __future__ import absolute_import import os +import re import sys from enum import Enum @@ -28,6 +29,18 @@ from tvm.contrib import util as _util from tvm.contrib import cc as _cc +# all sections that comprise a device's memory layout, in order from lowest +# starting address to highest +DEVICE_SECTIONS = [ + "text", + "rodata", + "data", + "bss", + "args", + "heap", + "workspace", + "stack", +] class LibType(Enum): """Enumeration of library types that can be compiled and loaded onto a device""" @@ -51,9 +64,9 @@ class Session: .. code-block:: python c_mod = ... # some module generated with "c" as the target - dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666) + dev_config = micro.device.arm.stm32f746xx.default_config('127.0.0.1', 6666) with tvm.micro.Session(dev_config) as sess: - micro_mod = create_micro_mod(c_mod, dev_config) + micro_mod = sess.create_micro_mod(c_mod) """ def __init__(self, config): @@ -62,19 +75,20 @@ def __init__(self, config): # grab a binutil instance from the ID in the config dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"]) - self.create_micro_lib = dev_funcs["create_micro_lib"] self.toolchain_prefix = config["toolchain_prefix"] self.mem_layout = config["mem_layout"] - self.word_size = config["word_size"] + self.word_size_bits = config["word_size_bits"] self.thumb_mode = config["thumb_mode"] + self.use_device_timer = config["use_device_timer"] self.comms_method = config["comms_method"] # First, find and compile runtime library. runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c") tmp_dir = _util.tempdir() runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") - self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME) - #input(f"check {runtime_obj_path}: ") + options = ["-I{}".format(get_micro_host_driven_dir())] + dev_funcs["create_micro_lib"]( + runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options) comms_method = config["comms_method"] if comms_method == "openocd": @@ -86,6 +100,8 @@ def __init__(self, config): else: raise RuntimeError(f"unknown communication method: f{self.comms_method}") + assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \ + "not all sections have an assigned memory layout" self.module = _CreateSession( comms_method, runtime_obj_path, @@ -106,12 +122,15 @@ def __init__(self, config): self.mem_layout["workspace"]["size"], self.mem_layout["stack"].get("start", 0), self.mem_layout["stack"]["size"], - self.word_size, + self.word_size_bits, self.thumb_mode, + self.use_device_timer, server_addr, server_port) self._enter = self.module["enter"] self._exit = self.module["exit"] + self.get_last_batch_time = self.module["get_last_batch_time"] + self.get_last_batch_cycles = self.module["get_last_batch_cycles"] def _check_system(self): """Check if the user's system is supported by MicroTVM. @@ -119,7 +138,7 @@ def _check_system(self): Raises error if not supported. """ if not sys.platform.startswith("linux"): - raise RuntimeError("MicroTVM is currently only supported on Linux hosts") + raise RuntimeError("MicroTVM is currently only supported on Linux") # TODO(weberlo): Add 32-bit support. # It's primarily the compilation pipeline that isn't compatible. if sys.maxsize <= 2**32: @@ -133,44 +152,91 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self._exit() -def create_micro_mod(c_mod, dev_config): +def _calc_max_workspace_usage(src): + # TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size) + alloc_re = re.compile( + r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*') + free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*') + max_usage = 0 + alloc_map = {} + for line in src.split("\n"): + if line.strip().startswith("//"): + continue + match = alloc_re.match(line) + if match is not None: + alloc_map[match.group(1)] = int(match.group(3)) + max_usage = max(max_usage, sum(alloc_map.values())) + else: + match = free_re.match(line) + if match is not None: + print(alloc_map) + del alloc_map[match.group(2)] + return max_usage + + +def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None, + lib_include_paths=None): """Produces a micro module from a given module. Parameters ---------- - c_mod : tvm.runtime.Module + c_mod : tvm.module.Module module with "c" as its target backend - dev_config : Dict[str, Any] - MicroTVM config dict for the target device + lib_src_paths: TODO + TODO + + lib_headers: TODO + TODO + + lib_include_paths: TODO + TODO Return ------ - micro_mod : tvm.runtim.Module + micro_mod : tvm.module.Module micro module for the target device """ temp_dir = _util.tempdir() lib_obj_path = temp_dir.relpath("dev_lib.obj") + # TODO use dev config to dispatch on the type of C codegen to run through + # (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV) c_mod.export_library( lib_obj_path, - fcompile=cross_compiler(dev_config, LibType.OPERATOR)) + fcompile=cross_compiler( + dev_config, + LibType.OPERATOR, + lib_src_paths=lib_src_paths, + lib_headers=lib_headers, + lib_include_paths=lib_include_paths)) micro_mod = tvm.runtime.load_module(lib_obj_path) return micro_mod -def cross_compiler(dev_config, lib_type): - """Create a cross-compile function that wraps `create_lib` for a `Binutil` instance. +def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None, + lib_include_paths=None): + """Create a cross compile function that wraps `create_lib` for a `Binutil` instance. For use in `tvm.runtime.Module.export_library`. Parameters ---------- - dev_config : Dict[str, Any] - MicroTVM config dict for the target device + create_micro_lib : func + function for creating MicroTVM libraries for a specific device (e.g., + `tvm.micro.device.get_device_funcs('arm.stm32f746xx')['create_micro_lib']`) lib_type : micro.LibType whether to compile a MicroTVM runtime or operator library + lib_src_paths: TODO + TODO + + lib_headers: TODO + e.g., `['cmsis_gcc.h', 'arm_math.h']` + + lib_include_paths: TODO + TODO + Return ------ func : Callable[[str, str, Optional[str]], None] @@ -183,16 +249,49 @@ def cross_compiler(dev_config, lib_type): c_mod = ... # some module generated with "c" as the target fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR) - c_mod.export_library("dev_lib.obj", fcompile=fcompile) + c_mod.export_library('dev_lib.obj', fcompile=fcompile) """ - dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id']) - create_micro_lib = dev_funcs['create_micro_lib'] + assert (lib_headers is None) == (lib_include_paths is None), \ + "must specify both `lib_headers` and `lib_include_paths` or neither" + + if lib_src_paths is None: + lib_src_paths = [] + if lib_include_paths is None: + lib_include_paths = [] + include_options = [] + for include_path in lib_include_paths: + include_options.append("-I") + include_options.append(include_path) + create_micro_lib = tvm.micro.device.get_device_funcs( + dev_config["device_id"])["create_micro_lib"] + mem_layout = dev_config["mem_layout"] + def compile_func(obj_path, src_path, **kwargs): if isinstance(obj_path, list): obj_path = obj_path[0] if isinstance(src_path, list): src_path = src_path[0] - create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None)) + options = kwargs.get("options", []) + options += include_options + + # check that workspace allocations don't exceed available workspace memory + with open(src_path) as f: + src_contents = f.read() + max_ws_usage = _calc_max_workspace_usage(src_contents) + available_mem = mem_layout["workspace"]["size"] + if max_ws_usage > available_mem: + raise RuntimeError(f"workspace allocations in library ({max_ws_usage}) " + f"exceed available memory ({available_mem})") + # inject headers into new source path, if requested + if lib_headers: + headers_to_inject = "\n".join(map(lambda s: f"#include <{s}>", lib_headers)) + "\n" + new_src_contents = headers_to_inject + src_contents + tmp_dir = _util.tempdir() + src_path = tmp_dir.relpath(os.path.basename(src_path)) + with open(src_path, "w") as f: + f.write(new_src_contents) + + create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths) return _cc.cross_compiler(compile_func, output_format="obj") diff --git a/python/tvm/micro/device/__init__.py b/python/tvm/micro/device/__init__.py index 1ccd6847edd8..89731b9aa797 100644 --- a/python/tvm/micro/device/__init__.py +++ b/python/tvm/micro/device/__init__.py @@ -16,7 +16,8 @@ # under the License. """Device-specific configuration for MicroTVM""" -from .base import register_device, get_device_funcs, create_micro_lib_base +from .base import create_micro_lib_base, gen_mem_layout +from .base import MemConstraint, register_device, get_device_funcs from . import host from . import arm from . import riscv_spike diff --git a/python/tvm/micro/device/arm/stm32f746xx.py b/python/tvm/micro/device/arm/stm32f746xx.py index 31b44cf9d36b..746958504eda 100644 --- a/python/tvm/micro/device/arm/stm32f746xx.py +++ b/python/tvm/micro/device/arm/stm32f746xx.py @@ -14,13 +14,32 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Compilation and config definitions for ARM STM32F746XX devices""" -from .. import create_micro_lib_base, register_device +"""Compilation and config definitions for Arm STM32F746XX devices""" +import os +from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint DEVICE_ID = "arm.stm32f746xx" TOOLCHAIN_PREFIX = "arm-none-eabi-" +WORD_SIZE_BITS = 32 +# +# [Device Memory Layout] +# RAM (rwx) : START = 0x20000000, LENGTH = 320K +# Flash (rx) : START = 0x8000000, LENGTH = 1024K +# +BASE_ADDR = 0x20000000 +AVAILABLE_MEM = 320000 +DEFAULT_SECTION_CONSTRAINTS = { + "text": (18000, MemConstraint.ABSOLUTE_BYTES), + "rodata": (100, MemConstraint.ABSOLUTE_BYTES), + "data": (100, MemConstraint.ABSOLUTE_BYTES), + "bss": (600, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (100.0, MemConstraint.WEIGHT), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (32, MemConstraint.ABSOLUTE_BYTES), +} -def create_micro_lib(obj_path, src_path, lib_type, options=None): +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -36,23 +55,40 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ if options is None: options = [] + else: + options = list(options) + options += [ + # TODO(weberlo): make a debug flag + "-O2", "-mcpu=cortex-m7", "-mlittle-endian", "-mfloat-abi=hard", "-mfpu=fpv5-sp-d16", "-mthumb", + "-ffast-math", "-gdwarf-5", + "-DARM_MATH_CM7", + "-D__FPU_PRESENT=1U", + "-DARM_MATH_DSP", + "-Wno-unused-variable", + "-Wno-unused-parameter", + "-I{}".format(os.environ["CMSIS_ST_PATH"]), + "-I{}/Core/Include".format(os.environ["CMSIS_ST_PATH"]) ] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, + lib_src_paths=lib_src_paths) -def default_config(server_addr, server_port): - """Generates a default configuration for ARM STM32F746XX devices +def generate_config(server_addr, server_port, section_constraints=None): + """Generates a configuration for Arm STM32F746XX devices Parameters ---------- @@ -62,55 +98,23 @@ def default_config(server_addr, server_port): server_port : int port of OpenOCD server to connect to + section_constraints: Optional[Dict[str, [Number, MemConstraint]]] + maps section name to the quantity of available memory + Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS return { "device_id": DEVICE_ID, "toolchain_prefix": TOOLCHAIN_PREFIX, - # - # [Device Memory Layout] - # RAM (rwx) : START = 0x20000000, LENGTH = 320K - # FLASH (rx) : START = 0x8000000, LENGTH = 1024K - # - "mem_layout": { - "text": { - "start": 0x20000180, - "size": 20480, - }, - "rodata": { - "start": 0x20005180, - "size": 20480, - }, - "data": { - "start": 0x2000a180, - "size": 768, - }, - "bss": { - "start": 0x2000a480, - "size": 768, - }, - "args": { - "start": 0x2000a780, - "size": 1280, - }, - "heap": { - "start": 0x2000ac80, - "size": 262144, - }, - "workspace": { - "start": 0x2004ac80, - "size": 20480, - }, - "stack": { - "start": 0x2004fc80, - "size": 80, - }, - }, - "word_size": 4, + "mem_layout": gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints), + "word_size_bits": WORD_SIZE_BITS, "thumb_mode": True, + "use_device_timer": True, "comms_method": "openocd", "server_addr": server_addr, "server_port": server_port, @@ -119,5 +123,5 @@ def default_config(server_addr, server_port): register_device(DEVICE_ID, { "create_micro_lib": create_micro_lib, - "default_config": default_config, + "generate_config": generate_config, }) diff --git a/python/tvm/micro/device/base.py b/python/tvm/micro/device/base.py index ae53b9cc539f..767284c9c254 100644 --- a/python/tvm/micro/device/base.py +++ b/python/tvm/micro/device/base.py @@ -17,12 +17,13 @@ """Base definitions for MicroTVM config""" import glob import os -from pathlib import Path +import enum +import pathlib from tvm.contrib import util as _util from tvm.contrib.binutil import run_cmd from tvm._ffi.libinfo import find_include_path -from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir +from tvm.micro import DEVICE_SECTIONS, LibType, get_micro_host_driven_dir, get_micro_device_dir _DEVICE_REGISTRY = {} @@ -38,7 +39,7 @@ def register_device(device_id, device_funcs): dictionary with compilation and config generation functions as values """ if device_id in _DEVICE_REGISTRY: - raise RuntimeError(f"\"{device_id}\" already exists in the device registry") + raise RuntimeError(f'"{device_id}" already exists in the device registry') _DEVICE_REGISTRY[device_id] = device_funcs @@ -56,7 +57,7 @@ def get_device_funcs(device_id): dictionary with compilation and config generation functions as values """ if device_id not in _DEVICE_REGISTRY: - raise RuntimeError(f"\"{device_id}\" does not exist in the binutil registry") + raise RuntimeError(f'"{device_id}" does not exist in the binutil registry') device_funcs = _DEVICE_REGISTRY[device_id] return device_funcs @@ -67,7 +68,9 @@ def create_micro_lib_base( toolchain_prefix, device_id, lib_type, - options=None): + options=None, + lib_src_paths=None, + ): """Compiles code into a binary for the target micro device. Parameters @@ -92,7 +95,12 @@ def create_micro_lib_base( options : List[str] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + paths to additional source files to be compiled into the library """ + # look at these (specifically `strip`): + # https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size base_compile_cmd = [ f"{toolchain_prefix}gcc", "-std=c11", @@ -100,7 +108,6 @@ def create_micro_lib_base( "-Wextra", "--pedantic", "-c", - "-O0", "-g", "-nostartfiles", "-nodefaultlibs", @@ -114,40 +121,48 @@ def create_micro_lib_base( src_paths = [] include_paths = find_include_path() + [get_micro_host_driven_dir()] tmp_dir = _util.tempdir() - # we might transform the src path in one of the branches below + # we need to create a new src file in the operator branch new_in_src_path = in_src_path if lib_type == LibType.RUNTIME: dev_dir = _get_device_source_dir(device_id) + dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]") # there needs to at least be a utvm_timer.c file assert dev_src_paths assert "utvm_timer.c" in map(os.path.basename, dev_src_paths) + src_paths += dev_src_paths elif lib_type == LibType.OPERATOR: - # create a temporary copy of the source, so we can inject the dev lib + # create a temporary copy of the operator source, so we can inject the dev lib # header without modifying the original. temp_src_path = tmp_dir.relpath("temp.c") with open(in_src_path, "r") as f: src_lines = f.read().splitlines() - src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") + src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"') with open(temp_src_path, "w") as f: f.write("\n".join(src_lines)) new_in_src_path = temp_src_path - base_compile_cmd += ["-c"] else: raise RuntimeError("unknown lib type") src_paths += [new_in_src_path] + # add any src paths required by the operator + if lib_src_paths is not None: + src_paths += lib_src_paths + + # print(f"include paths: {include_paths}") for path in include_paths: base_compile_cmd += ["-I", path] prereq_obj_paths = [] + # print(src_paths) for src_path in src_paths: - curr_obj_path = Path(src_path).with_suffix(".o").name + curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix(".o").name) assert curr_obj_path not in prereq_obj_paths prereq_obj_paths.append(curr_obj_path) curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path] + # TODO(weberlo): make compilation fail if there are any warnings run_cmd(curr_compile_cmd) ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"] @@ -156,6 +171,65 @@ def create_micro_lib_base( run_cmd(ld_cmd) +# TODO we shouldn't need an enum for this. too much bureaucracy. +class MemConstraint(enum.Enum): + """Represents a constraint on the device's memory layout""" + ABSOLUTE_BYTES = 0 + WEIGHT = 1 + + +def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints): + """Template function to generate memory layout for devices. + + Parameters + ---------- + base_addr: Number + The address where usable memory begins on this device. + + available_mem: Number + Available memory at base_addr, given in bytes. + + word_size_bits: Number + Number of bits in one word on this device. + + section_constraints: Optional[Dict[str, [Number, MemConstraint]]] + maps section name to the quantity of available memory + """ + assert word_size_bits in (32, 64), "only 32- or 64-bit devices are supported now" + word_size_bytes = word_size_bits // 8 + byte_sum = sum(x[0] + for x in section_constraints.values() + if x[1] == MemConstraint.ABSOLUTE_BYTES) + weight_sum = sum(x[0] + for x in section_constraints.values() + if x[1] == MemConstraint.WEIGHT) + assert byte_sum <= available_mem + available_weight_mem = available_mem - byte_sum + + res = {} + curr_addr = base_addr + for section in DEVICE_SECTIONS: + (val, cons_type) = section_constraints[section] + if cons_type == MemConstraint.ABSOLUTE_BYTES: + assert val % word_size_bytes == 0, \ + f"constraint {val} for {section} section is not word-aligned" + size = val + res[section] = { + "start": curr_addr, + "size": size, + } + else: + size = int((val / weight_sum) * available_weight_mem) + size = (size // word_size_bytes) * word_size_bytes + res[section] = { + "start": curr_addr, + "size": size, + } + curr_addr += size + + return res + + def _get_device_source_dir(device_id): """Grabs the source directory for device-specific uTVM files""" dev_subdir = "/".join(device_id.split(".")) diff --git a/python/tvm/micro/device/host.py b/python/tvm/micro/device/host.py index a5495b60cf99..0cf29874ab57 100644 --- a/python/tvm/micro/device/host.py +++ b/python/tvm/micro/device/host.py @@ -17,12 +17,26 @@ """Compilation and config definitions for the host emulated device""" import sys -from . import create_micro_lib_base, register_device +from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint DEVICE_ID = "host" TOOLCHAIN_PREFIX = "" +WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32 -def create_micro_lib(obj_path, src_path, lib_type, options=None): +# we pretend we only have 320kb in the default case, so we can use `gen_mem_layout` +DEFAULT_AVAILABLE_MEM = 3200000 +DEFAULT_SECTION_CONSTRAINTS = { + "text": (20480, MemConstraint.ABSOLUTE_BYTES), + "rodata": (20480, MemConstraint.ABSOLUTE_BYTES), + "data": (768, MemConstraint.ABSOLUTE_BYTES), + "bss": (4096, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (262144, MemConstraint.ABSOLUTE_BYTES), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (80, MemConstraint.ABSOLUTE_BYTES), +} + +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -38,59 +52,65 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + paths to additional source files to be compiled into the library """ if options is None: options = [] + else: + options = list(options) + # Cannot increase optimization level on host due to code loading method. + options.append("-O0") if sys.maxsize > 2**32 and sys.platform.startswith("linux"): options += ["-mcmodel=large"] create_micro_lib_base( - obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) + obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options, + lib_src_paths=lib_src_paths) -def default_config(): - """Generates a default configuration for the host emulated device +def generate_config(available_mem=None, section_constraints=None): + """Generates a configuration for the host emulated device + + Parameters + ---------- + available_mem: int + number of RW bytes available for use on device + + section_constraints: Optional[Dict[str, Dict[Number, MemConstraint]]] + maps section name to the quantity of available memory Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ + if available_mem is None: + available_mem = DEFAULT_AVAILABLE_MEM + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS + mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE_BITS, section_constraints) + # TODO the host emulated device is an outlier, since we don't know how what + # its base address will be until we've created it in the C++. is there any + # way to change the infrastructure around this so it's not so much of an + # outlier? + + # need to zero out all start addresses, because they don't make sense for a + # host device (the memory region is allocated in the backend) + for section in mem_layout: + mem_layout[section]["start"] = 0 return { "device_id": DEVICE_ID, "toolchain_prefix": TOOLCHAIN_PREFIX, - "mem_layout": { - "text": { - "size": 20480, - }, - "rodata": { - "size": 20480, - }, - "data": { - "size": 768, - }, - "bss": { - "size": 768, - }, - "args": { - "size": 1280, - }, - "heap": { - "size": 262144, - }, - "workspace": { - "size": 20480, - }, - "stack": { - "size": 80, - }, - }, - "word_size": 8 if sys.maxsize > 2**32 else 4, + "mem_layout": mem_layout, + "word_size_bits": WORD_SIZE_BITS, "thumb_mode": False, + "use_device_timer": False, "comms_method": "host", } register_device(DEVICE_ID, { "create_micro_lib": create_micro_lib, - "default_config": default_config, + "generate_config": generate_config, }) diff --git a/python/tvm/micro/device/riscv_spike.py b/python/tvm/micro/device/riscv_spike.py index 923e5dfb23a2..32881cab6ba9 100644 --- a/python/tvm/micro/device/riscv_spike.py +++ b/python/tvm/micro/device/riscv_spike.py @@ -15,14 +15,25 @@ # specific language governing permissions and limitations # under the License. """Compilation and config definitions for Spike, a RISC-V functional ISA simulator""" -from collections import OrderedDict -from . import create_micro_lib_base, register_device +from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint DEVICE_ID = "riscv_spike" TOOLCHAIN_PREFIX = "riscv64-unknown-elf-" +WORD_SIZE_BITS = 64 -def create_micro_lib(obj_path, src_path, lib_type, options=None): +DEFAULT_SECTION_CONSTRAINTS = { + "text": (18000, MemConstraint.ABSOLUTE_BYTES), + "rodata": (128, MemConstraint.ABSOLUTE_BYTES), + "data": (128, MemConstraint.ABSOLUTE_BYTES), + "bss": (2048, MemConstraint.ABSOLUTE_BYTES), + "args": (4096, MemConstraint.ABSOLUTE_BYTES), + "heap": (100.0, MemConstraint.WEIGHT), + "workspace": (64000, MemConstraint.ABSOLUTE_BYTES), + "stack": (32, MemConstraint.ABSOLUTE_BYTES), +} + +def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None): """Wrapper over `create_micro_lib_base` to add device-specific options Parameters @@ -38,6 +49,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): options : Optional[List[str]] additional options to pass to GCC + + lib_src_paths : Optional[List[str]] + TODO """ create_micro_lib_base( obj_path, @@ -45,11 +59,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, - options=options) + options=options, + lib_src_paths=lib_src_paths + ) -def default_config(base_addr, server_addr, server_port): - """Generates a default configuration for Spike +def generate_config(base_addr, available_mem, server_addr, server_port, section_constraints=None): + """Generates a configuration for Spike Parameters ---------- @@ -62,56 +78,31 @@ def default_config(base_addr, server_addr, server_port): server_port : int port of OpenOCD server to connect to + TODO correct type annotation? + section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]] + TODO + Return ------ config : Dict[str, Any] MicroTVM config dict for this device """ - res = { + if section_constraints is None: + section_constraints = DEFAULT_SECTION_CONSTRAINTS + return { "device_id": DEVICE_ID, "toolchain_prefix": TOOLCHAIN_PREFIX, - "mem_layout": OrderedDict([ - ("text", { - "size": 20480, - }), - ("rodata", { - "size": 20480, - }), - ("data", { - "size": 768, - }), - ("bss", { - "size": 768, - }), - ("args", { - "size": 1280, - }), - ("heap", { - "size": 262144, - }), - ("workspace", { - "size": 20480, - }), - ("stack", { - "size": 80, - }), - ]), - "word_size": 4, - "thumb_mode": True, + "mem_layout": gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints), + "word_size_bits": WORD_SIZE_BITS, + "thumb_mode": False, + "use_device_timer": False, "comms_method": "openocd", "server_addr": server_addr, "server_port": server_port, } - # generate section start addresses from the given `base_addr` - curr_offset = 0 - mem_layout = res["mem_layout"] - for region_dict in mem_layout.values(): - region_dict["start"] = base_addr + curr_offset - curr_offset += region_dict["size"] - return res register_device(DEVICE_ID, { "create_micro_lib": create_micro_lib, - "default_config": default_config, + "generate_config": generate_config, }) diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 4a73e572f924..7731efe04dfd 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -151,7 +151,9 @@ def __call__(self, args, attrs, type_args): "nn.dropout": op.nn.dropout_raw, "zeros": op.zeros, "split": op.split, - "cast": op.cast + "cast": op.cast, + "clip": op.clip, + "right_shift": op.right_shift, } TYPE_PREFIXES = [ diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 942d4c7f86af..6bdec67617e1 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -20,24 +20,25 @@ import logging import topi +from ....target import arm_isa from .generic import * from .. import op as _op logger = logging.getLogger('strategy') -@schedule_injective.register("arm_cpu") +@schedule_injective.register(["arm_cpu", "micro_dev"]) def schedule_injective_arm_cpu(_, outs, target): """schedule injective ops for arm cpu""" with target: return topi.arm_cpu.schedule_injective(outs) -@schedule_concatenate.register("arm_cpu") +@schedule_concatenate.register(["arm_cpu", "micro_dev"]) def schedule_concatenate_arm_cpu(_, outs, target): """schedule concatenate for arm cpu""" with target: return topi.arm_cpu.schedule_concatenate(outs) -@conv2d_strategy.register("arm_cpu") +@conv2d_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d arm cpu strategy""" strategy = _op.OpStrategy() @@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") + isa = arm_isa.IsaAnalyzer(target) + if groups == 1: if layout == "NCHW": if kernel_layout == "OIHW": @@ -102,11 +105,22 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic") elif layout == "NHWC": - assert kernel_layout == "HWIO" - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + channels = data.shape[3] + if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), + name='conv2d_direct_simd.micro_dev') + elif kernel_layout == "HWIO": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") + else: + raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC". + format(kernel_layout)) + + else: raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out format(layout)) return strategy -@conv2d_transpose_strategy.register("arm_cpu") +@conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d_transpose arm cpu strategy""" layout = attrs.data_layout diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index 627d67a0a835..03749c1c17e4 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -325,7 +325,10 @@ def __init__(self, key="", load_library=None, custom_addr=None, - silent=False): + silent=False, + utvm_dev_id=None, + utvm_dev_config_args=None, + ): try: if base._ServerLoop is None: raise RuntimeError("Please compile with USE_RPC=1") @@ -355,6 +358,10 @@ def __init__(self, cmd += ["--custom-addr", custom_addr] if silent: cmd += ["--silent"] + if utvm_dev_id is not None: + assert utvm_dev_config_args is not None + cmd += [f"--utvm-dev-id={utvm_dev_id}"] + cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"] # prexec_fn is not thread safe and may result in deadlock. # python 3.2 introduced the start_new_session parameter as diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 7845a26bfca2..716f87f33fc1 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -109,7 +109,6 @@ def __call__(self, *args): # pylint: disable=not-callable return self.entry_func(*args) - def __repr__(self): return "Module(%s, %x)" % (self.type_key, self.handle.value) diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 10bbb6ef54c2..9b7e7c52f351 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -219,7 +219,7 @@ def context(dev_type, dev_id=0): """ if isinstance(dev_type, string_types): if '-device=micro_dev' in dev_type: - dev_type = 'micro_dev' + dev_type = TVMContext.STR2MASK['micro_dev'] else: dev_type = dev_type.split()[0] if dev_type not in TVMContext.STR2MASK: diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py new file mode 100644 index 000000000000..c40296e50713 --- /dev/null +++ b/python/tvm/target/arm_isa.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Defines functions to analyze available opcodes in the ARM ISA.""" + + +ARM_ISA_MAP = { + 'armv7e-m': ['SMLAD'], +} + + +class IsaAnalyzer(object): + + def __init__(self, target): + self.target = target + # TODO: actually parse -mcpu + arch = 'armv7e-m' + self._isa_map = ARM_ISA_MAP[arch] + + def __contains__(self, instruction): + return instruction in self._isa_map diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 849c74028d63..8231c1b79d38 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -272,7 +272,7 @@ runtime::Module build(const Map& inputs, Target target_host_val = target_host; if (!target_host.defined()) { for (const auto& it : inputs) { - if (it.first->device_type == kDLCPU) { + if (it.first->device_type == kDLCPU || it.first->device_type == kDLMicroDev) { target_host_val = it.first; break; } diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s index 300deb8079a0..f5720f4d7b28 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s @@ -17,11 +17,6 @@ * under the License. */ -/*! - * \file utvm_init.s - * \brief uTVM init definition for STM32F746XX-series boards - */ - .syntax unified .cpu cortex-m7 .fpu softvfp diff --git a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c index 1b8376150fce..0f13a7dede88 100644 --- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c +++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c @@ -29,100 +29,51 @@ extern "C" { #include #include "utvm_runtime.h" +// NOTE: This expects ST CMSIS to be in your include path. +// Download STM32CubeF7 here: +// https://www.st.com/content/st_com/en/products/embedded-software/mcu-mpu-embedded-software/stm32-embedded-software/stm32cube-mcu-mpu-packages/stm32cubef7.html +// and add Drivers/CMSIS to your C include path. +#include "Device/ST/STM32F7xx/Include/stm32f746xx.h" -// There are two implementations of cycle counters on the STM32F7X: SysTick and -// CYCCNT. SysTick is preferred, as it gives better error handling, but the -// counter is only 24 bits wide. If a larger timer is needed, use the CYCCNT -// implementation, which has a 32-bit counter. -#define USE_SYSTICK -#ifdef USE_SYSTICK - -#define SYST_CSR (*((volatile uint32_t *) 0xE000E010)) -#define SYST_RVR (*((volatile uint32_t *) 0xE000E014)) -#define SYST_CVR (*((volatile uint32_t *) 0xE000E018)) -#define SYST_CALIB (*((volatile uint32_t *) 0xE000E01C)) - -#define SYST_CSR_ENABLE 0 -#define SYST_CSR_TICKINT 1 -#define SYST_CSR_CLKSOURCE 2 -#define SYST_COUNTFLAG 16 - -#define SYST_CALIB_NOREF 31 -#define SYST_CALIB_SKEW 30 - -uint32_t start_time = 0; -uint32_t stop_time = 0; +#define utvm_SystemCoreClock 216000000UL int32_t UTVMTimerStart() { - SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE); - // wait until timer starts - while (SYST_CVR == 0) {} - start_time = SYST_CVR; - return 0; -} - -void UTVMTimerStop() { - SYST_CSR = 0; - stop_time = SYST_CVR; + UTVMTimerReset(); + TIM2->CR1 = + TIM_CR1_CEN; // Start counter + return UTVM_ERR_OK; } -void UTVMTimerReset() { - SYST_CSR = 0; - // maximum reload value (24-bit) - SYST_RVR = (~((uint32_t) 0)) >> 8; - SYST_CVR = 0; -} - -uint32_t UTVMTimerRead() { - if (SYST_CSR & SYST_COUNTFLAG) { - TVMAPISetLastError("timer overflowed"); - return -1; - } else { - return start_time - stop_time; +uint32_t UTVMTimerStop(int32_t* err) { + TIM2->CR1 &= TIM_CR1_CEN; + if (TIM2->SR & TIM_SR_UIF_Msk) { + *err = UTVM_ERR_TIMER_OVERFLOW; + return 0; } + *err = UTVM_ERR_OK; + uint32_t tim_cnt = TIM2->CNT; + uint32_t millis = tim_cnt / (utvm_SystemCoreClock / 1000); + uint32_t micros = + (tim_cnt - (millis * (utvm_SystemCoreClock / 1000))) / + (utvm_SystemCoreClock / 1000000); + return millis * 1000 + micros; } -#else // !USE_SYSTICK - -#define DWT_CTRL (*((volatile uint32_t *) 0xE0001000)) -#define DWT_CYCCNT (*((volatile uint32_t *) 0xE0001004)) - -#define DWT_CTRL_NOCYCCNT 25 -#define DWT_CTRL_CYCCNTENA 0 - -uint32_t start_time = 0; -uint32_t stop_time = 0; - void UTVMTimerReset() { - DWT_CYCCNT = 0; -} - -int32_t UTVMTimerStart() { - if (DWT_CTRL & DWT_CTRL_NOCYCCNT) { - TVMAPISetLastError("cycle counter not implemented on device"); - return -1; + RCC->APB1RSTR |= RCC_APB1RSTR_TIM2RST; // Hold TIM2 in reset + RCC->DCKCFGR1 = (RCC->DCKCFGR1 & ~RCC_DCKCFGR1_TIMPRE_Msk); // disable 2x clock boost to TIM2 + RCC->CFGR = (RCC->CFGR & ~RCC_CFGR_PPRE1_Msk); // No AHB clock division to APB1 (1:1). + RCC->APB1ENR |= RCC_APB1ENR_TIM2EN; // Enable TIM2 clock. + RCC->APB1RSTR &= ~RCC_APB1RSTR_TIM2RST; // Exit TIM2 reset. + + DBGMCU->APB1FZ |= DBGMCU_APB1_FZ_DBG_TIM2_STOP; // stop TIM2 clock during debug halt. + TIM2->ARR = 0xffffffff; + if (TIM2->SR & TIM_SR_UIF_Msk) { + for (;;) ; } - start_time = DWT_CYCCNT; - DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA); } -void UTVMTimerStop() { - stop_time = DWT_CYCCNT; - DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA); -} - -int32_t UTVMTimerRead() { - if (stop_time > stop_time) { - return stop_time - start_time; - } else { - uint32_t largest = ~0; - return (largest - start_time) + stop_time; - } -} - -#endif // USE_SYSTICK - #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/device/host/utvm_timer.c b/src/runtime/micro/device/host/utvm_timer.c index 56a36ebae86d..6ab585a88f24 100644 --- a/src/runtime/micro/device/host/utvm_timer.c +++ b/src/runtime/micro/device/host/utvm_timer.c @@ -22,26 +22,16 @@ * \brief uTVM timer API stubs for the host emulated device */ -#ifdef __cplusplus -extern "C" { -#endif - +#include #include "utvm_runtime.h" // TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c int32_t UTVMTimerStart() { - return 0; + return UTVM_ERR_OK; } -void UTVMTimerStop() { } - -void UTVMTimerReset() { } - -uint32_t UTVMTimerRead() { - return 1; +uint32_t UTVMTimerStop(int32_t* err) { + *err = UTVM_ERR_OK; + return 0; } - -#ifdef __cplusplus -} // TVM_EXTERN_C -#endif diff --git a/src/runtime/micro/device/riscv_spike/utvm_init.s b/src/runtime/micro/device/riscv_spike/utvm_init.s new file mode 100644 index 000000000000..68662cce97e7 --- /dev/null +++ b/src/runtime/micro/device/riscv_spike/utvm_init.s @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +UTVMInit: + /* set stack pointer */ + la sp, _utvm_stack_pointer_init + call UTVMMain diff --git a/src/runtime/micro/device/riscv_spike/utvm_timer.c b/src/runtime/micro/device/riscv_spike/utvm_timer.c new file mode 100644 index 000000000000..5cf38559feab --- /dev/null +++ b/src/runtime/micro/device/riscv_spike/utvm_timer.c @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file utvm_timer.c + * \brief uTVM timer API stubs for Spike + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "utvm_runtime.h" + +int32_t UTVMTimerStart() { + return UTVM_ERR_OK; +} + +uint32_t UTVMTimerStop(int32_t* err) { + *err = UTVM_ERR_OK; + return 0; +} + +#ifdef __cplusplus +} // TVM_EXTERN_C +#endif diff --git a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c index a8c600ed347b..9fabce6bdc1e 100644 --- a/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c +++ b/src/runtime/micro/host_driven/utvm_device_dylib_redirect.c @@ -32,10 +32,11 @@ extern "C" { #include #include -void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = - (void *(*)(int, int, uint64_t, int, int)) NULL; -int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; -void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; +// TODO(weberlo, areusch): compiler errors say volatile qualifier is discarded. +// should we just get rid of em? +void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL; +int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL; +void (* volatile TVMAPISetLastError_)(const char*) = NULL; void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { @@ -51,6 +52,41 @@ void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } +void *memset(void *s, int c, size_t n) { + char *p = (char*) s; // NOLINT(readability/casting): linter is configured for c++ + while (n > 0) { + *p = (char) c; // NOLINT(readability/casting): linter is configured for c++ + p++; + n--; + } + return s; +} + +void *memmove(void *to, const void *from, size_t n) { + // TODO(weberlo, areusch): will need to factor memmove calls into workspace size calculation + // NOLINTNEXTLINE(readability/casting): linter is configured for c++ + char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8); + if (temp == NULL) { + return NULL; + } + + const char *from_pp = (char*) from; // NOLINT(readability/casting): linter is configured for c++ + for (size_t i = 0; i < n; i++) { + temp[i] = from_pp[i]; + } + char *to_pp = (char*) to; // NOLINT(readability/casting): linter is configured for c++ + for (size_t i = 0; i < n; i++) { + to_pp[i] = temp[i]; + } + + // NOLINTNEXTLINE(readability/casting): linter is configured for c++ + if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) { + return NULL; + } + + return to; +} + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/host_driven/utvm_runtime.c b/src/runtime/micro/host_driven/utvm_runtime.c index a4de495a185c..2f2f0c1e0dea 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.c +++ b/src/runtime/micro/host_driven/utvm_runtime.c @@ -34,89 +34,148 @@ extern "C" { #include "utvm_runtime.h" -// Task pointers must be patched before calling a function. -UTVMTask utvm_task = { - .func = NULL, - .arg_values = NULL, - .arg_type_codes = NULL, - .num_args = 0, -}; - -size_t utvm_word_size = 0; // NOLINT(*) +// TODO(weberlo, areusch): move defines into header +// TODO(weberlo, areusch): unify TASK_QUEUE_SIZE and MicroSession::kTaskQueueCapacity. +#define TASK_QUEUE_SIZE 20 +volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { }; +volatile uint32_t utvm_num_tasks = 0; +volatile uint32_t utvm_task_times[TASK_QUEUE_SIZE] = { }; // These pointers are patched at load time to point to the workspace section. -char* utvm_workspace_start = NULL; // NOLINT(*) -char* utvm_workspace_end = NULL; // NOLINT(*) -char* utvm_workspace_curr = NULL; // NOLINT(*) +volatile char* utvm_workspace_start = NULL; // NOLINT(*) +volatile char* utvm_workspace_end = NULL; // NOLINT(*) +volatile char* utvm_workspace_curr = NULL; // NOLINT(*) +#define MAX_WS_ALLOCS 10 +volatile char* utvm_alloc_ends[MAX_WS_ALLOCS] = {}; // NOLINT(*) +volatile uint32_t utvm_alloc_idx = 0; // Keep track of how many active allocations there are on the workspace. -size_t utvm_num_active_allocs = 0; +volatile uint32_t utvm_num_active_allocs = 0; + +volatile uint32_t utvm_word_size = 0; -const char* utvm_last_error = NULL; // NOLINT(*) -int32_t utvm_return_code = 0; // NOLINT(*) +volatile int32_t utvm_last_error = 0; // NOLINT(*) -uint32_t utvm_task_time = 0; +volatile uint32_t utvm_done = 0; // Gets called by UTVMInit, after device-specific initialization is finished. void UTVMMain() { + utvm_done = 0; + // loss of precision should be fine here, since we only care about the lower bits + if (((uint32_t) utvm_workspace_start) % utvm_word_size) { + utvm_last_error = UTVM_ERR_WS_UNALIGNED_START; + UTVMDone(); + return; + } utvm_workspace_curr = utvm_workspace_start; utvm_num_active_allocs = 0; - utvm_last_error = NULL; // NOLINT(*) - utvm_return_code = 0; - utvm_task_time = 0; - UTVMTimerReset(); - int32_t err = UTVMTimerStart(); - if (err < 0) { - utvm_return_code = err; - UTVMDone(); + utvm_alloc_idx = 0; + utvm_last_error = UTVM_ERR_NOT_FINISHED; + for (uint32_t i = 0; i < utvm_num_tasks; i++) { + int32_t err = UTVM_ERR_OK; + utvm_task_times[i] = 0; + err = UTVMTimerStart(); + if (err < 0) { + utvm_last_error = err; + UTVMDone(); + return; + } + err = utvm_tasks[i].func( + (void*) utvm_tasks[i].arg_values, // NOLINT(*) + (void*) utvm_tasks[i].arg_type_codes, // NOLINT(*) + utvm_tasks[i].num_args); + if (err < 0) { + UTVMDone(); + return; + } + utvm_task_times[i] = UTVMTimerStop(&err); + if (err < 0) { + utvm_last_error = err; + UTVMDone(); + return; + } + } + if (utvm_last_error == UTVM_ERR_NOT_FINISHED) { + utvm_last_error = UTVM_ERR_OK; } - utvm_return_code = utvm_task.func( - (void*) utvm_task.arg_values, // NOLINT(*) - (void*) utvm_task.arg_type_codes, // NOLINT(*) - utvm_task.num_args); - UTVMTimerStop(); - utvm_task_time = UTVMTimerRead(); UTVMDone(); } // We use a dummy function to signal execution is finished for device // backends which require breakpoints. -void UTVMDone() { } +void __attribute__((noinline)) UTVMDone() { + utvm_done = 1; +} + +#define ALIGNED_UP(x, word_size) \ + ((((word_size) - (((uintptr_t) (x)) % (word_size))) % (word_size)) + (x)) void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { - // Align up to 8 bytes. - utvm_workspace_curr += - (utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size; // NOLINT(*) - if (utvm_workspace_curr + size > utvm_workspace_end) { + if (size == 0) { + utvm_last_error = UTVM_ERR_WS_ZERO_SIZE_ALLOC; + return NULL; + } + size_t alloc_requested_bytes = size; + size_t alloc_size_words = (alloc_requested_bytes + utvm_word_size - 1) / utvm_word_size; + size_t alloc_size_bytes = alloc_size_words * utvm_word_size; + + // Align up to the target word size. + if (utvm_workspace_curr + alloc_size_bytes > utvm_workspace_end) { // Out of space in workspace. + utvm_last_error = UTVM_ERR_WS_OUT_OF_SPACE; + return NULL; + } + if (utvm_alloc_idx == MAX_WS_ALLOCS - 1) { + // Exceeded number of allocs we can keep track of. + utvm_last_error = UTVM_ERR_WS_TOO_MANY_ALLOCS; return NULL; } void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) - utvm_workspace_curr += size; + utvm_workspace_curr = utvm_workspace_curr + alloc_size_bytes; + // store the *end* of the alloc, so we can restore the WS pointer when freeing + utvm_alloc_ends[utvm_alloc_idx] = utvm_workspace_curr; + utvm_alloc_idx++; utvm_num_active_allocs++; return ret_ptr; } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - utvm_num_active_allocs--; - if (utvm_num_active_allocs < 0) { + // TODO(weberlo, areusch): add dev type check + if (utvm_num_active_allocs == 0) { TVMAPISetLastError("free called with no active workspace allocations"); // Reset allocations and workspace (for future task executions). utvm_num_active_allocs = 0; utvm_workspace_curr = utvm_workspace_start; + utvm_last_error = UTVM_ERR_WS_DOUBLE_FREE; return -1; - } else if (utvm_num_active_allocs == 0) { - // No more allocations. Reset workspace. - utvm_workspace_curr = utvm_workspace_start; - return 0; } else { + utvm_num_active_allocs--; + if (ptr == utvm_workspace_start) { + // it's the first allocation + utvm_alloc_ends[0] = NULL; + } else { + for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) { + if (utvm_alloc_ends[i] == ptr) { + utvm_alloc_ends[i + 1] = NULL; + break; + } + } + } + while (utvm_alloc_idx > 0 && utvm_alloc_ends[utvm_alloc_idx - 1] == NULL) { + utvm_alloc_idx--; + } + if (utvm_alloc_idx == 0) { + utvm_workspace_curr = utvm_workspace_start; + } else { + // TODO(weberlo, areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in + // this branch? + utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1]; + } return 0; } } -void TVMAPISetLastError(const char* msg) { - utvm_last_error = msg; -} +void TVMAPISetLastError(const char* msg) { } #ifdef __cplusplus } // TVM_EXTERN_C diff --git a/src/runtime/micro/host_driven/utvm_runtime.h b/src/runtime/micro/host_driven/utvm_runtime.h index c364ecf40792..fc11b70ed8e2 100644 --- a/src/runtime/micro/host_driven/utvm_runtime.h +++ b/src/runtime/micro/host_driven/utvm_runtime.h @@ -32,6 +32,22 @@ extern "C" { #include #include +/*! + * \brief TODO + */ +enum UTVMReturnCode { + UTVM_ERR_OK = 0, + UTVM_ERR_NOT_FINISHED = -1, + UTVM_ERR_TIMER_NOT_IMPLEMENTED = -2, + UTVM_ERR_TIMER_OVERFLOW = -3, + UTVM_ERR_WS_DOUBLE_FREE = -4, + UTVM_ERR_WS_OUT_OF_SPACE = -5, + UTVM_ERR_WS_TOO_MANY_ALLOCS = -6, + UTVM_ERR_WS_ZERO_SIZE_ALLOC = -7, + UTVM_ERR_WS_UNALIGNED_START = -8, + UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE = -9, +}; + /*! * \brief Task structure for uTVM */ @@ -46,20 +62,46 @@ typedef struct { int32_t num_args; } UTVMTask; +/*! + * \brief microTVM processor startup. + * Expected to reset the stack pointer, configure any hardware required to support the CRT + * (i.e. FPU), and then jump to UTVMMain. + */ extern void UTVMInit(); -extern void UTVMTimerReset(); - +/*! + * \brief Start the on-device timer. + * \return UTVMReturnCode indicating the outcome of the operation. + */ extern int32_t UTVMTimerStart(); -extern void UTVMTimerStop(); - -extern uint32_t UTVMTimerRead(); +/*! + * \brief Stop the on-device timer. + * TODO(areusch): Use an SI specification of timer units here. + * \param err Receives a UTVMReturnCode indicating the outcome of the operation. + * \return elapsed time since UTVMTimerStart returned, in device timer ticks. + */ +extern uint32_t UTVMTimerStop(int32_t* err); +/*! + * \brief Main entry point for UTVM runtime. + * Waits for "go" signal, then executes tasks and reports result. Should never return. + */ void UTVMMain(); +/*! + * \brief Function entered when UTVMMain is complete. + * Should never return. The host sets a breakpoint here to detect end of computation. + */ void UTVMDone(); +// GCC -O3 begins to inject memset and memmove calls, so we provide impls in +// the runtime for this case and for general usage. + +void *memset(void *s, int c, size_t n); + +void *memmove(void *to, const void *from, size_t n); + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index a24994a2a0e5..da4ade41c008 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief constructor to initialize on-host memory region to act as device * \param num_bytes size of the emulated on-device memory region */ - explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) { + explicit HostLowLevelDevice(size_t num_bytes, TargetPtr* base_addr) : size_(num_bytes) { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; // TODO(weberlo): Set permissions per section (e.g., read-write perms for // the heap, execute perms for text, etc.). int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0); - *base_addr = base_addr_; + *base_addr = TargetPtr(TargetWordSize(sizeof(size_t) * 8), + reinterpret_cast(base_addr_)); } /*! @@ -60,16 +61,16 @@ class HostLowLevelDevice final : public LowLevelDevice { munmap(base_addr_, size_); } - void Read(DevPtr addr, void* buf, size_t num_bytes) { + void Read(TargetPtr addr, void* buf, size_t num_bytes) { std::memcpy(buf, addr.cast_to(), num_bytes); } - void Write(DevPtr addr, const void* buf, size_t num_bytes) { + void Write(TargetPtr addr, const void* buf, size_t num_bytes) { std::memcpy(addr.cast_to(), buf, num_bytes); } - void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { - reinterpret_cast(func_addr.value().val64)(); + void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) { + reinterpret_cast(func_addr.value().uint64())(); } const char* device_type() const final { @@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice { size_t size_; }; -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) { - std::shared_ptr lld = - std::make_shared(num_bytes, base_addr); +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, + TargetPtr* base_addr) { + std::shared_ptr lld = std::make_shared(num_bytes, base_addr); return lld; } diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 3158e2fe20de..c5b5f3df634b 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -45,7 +45,7 @@ class LowLevelDevice { * \param buffer on-host buffer to be read into * \param num_bytes number of bytes to read */ - virtual void Read(DevPtr addr, + virtual void Read(TargetPtr addr, void* buffer, size_t num_bytes) = 0; @@ -55,7 +55,7 @@ class LowLevelDevice { * \param buffer host buffer to write from * \param num_bytes number of bytes to write */ - virtual void Write(DevPtr addr, + virtual void Write(TargetPtr addr, const void* buffer, size_t num_bytes) = 0; @@ -64,7 +64,7 @@ class LowLevelDevice { * \param func_addr offset of the init stub function * \param breakpoint_addr address at which to stop function execution */ - virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0; + virtual void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) = 0; /*! * \brief getter function for low-level device type @@ -78,7 +78,8 @@ class LowLevelDevice { * \param num_bytes size of the memory region * \param base_addr pointer to write the host device's resulting base address into */ -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr); +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes, + TargetPtr* base_addr); /*! * \brief connect to OpenOCD and create an OpenOCD low-level device diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 632b6048b182..c544fcd32801 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -51,18 +51,18 @@ const char* SectionToString(SectionKind section) { std::string RelocateBinarySections( const std::string& binary_path, - size_t word_size, - DevPtr text_start, - DevPtr rodata_start, - DevPtr data_start, - DevPtr bss_start, - DevPtr stack_end, + TargetWordSize word_size, + TargetPtr text_start, + TargetPtr rodata_start, + TargetPtr data_start, + TargetPtr bss_start, + TargetPtr stack_end, const std::string& toolchain_prefix) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; std::string relocated_bin = (*f)(binary_path, - word_size, + word_size.bytes(), text_start.cast_to(), rodata_start.cast_to(), data_start.cast_to(), @@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary, size_t GetSectionSize(const std::string& binary_path, SectionKind section, const std::string& toolchain_prefix, - size_t align) { + TargetWordSize word_size) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "GetSectionSize requires section to be one of text, rodata, data, or bss."; @@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path, CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; int size = (*f)(binary_path, SectionToString(section), toolchain_prefix); - return UpperAlignValue(size, align); + return UpperAlignValue(size, word_size.bytes()); } } // namespace runtime diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 4a0189b3e89e..2d74bc349f42 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace tvm { namespace runtime { @@ -52,28 +53,115 @@ enum class SectionKind : size_t { kNumKinds, }; -/*! \brief union for storing values on varying target word sizes */ -union TargetVal { - /*! \brief 32-bit pointer */ - uint32_t val32; - /*! \brief 64-bit pointer */ - uint64_t val64; +/*! \brief data type for word sizes */ +class TargetWordSize { + public: + explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} { + CHECK(word_size_bits == 32 || word_size_bits == 64) + << "only 32-bit and 64-bit are supported now"; + } + + size_t bytes() const { + return word_size_bits_ / 8; + } + + size_t bits() const { + return word_size_bits_; + } + + private: + size_t word_size_bits_; }; -/*! \brief absolute device address */ -class DevPtr { + +/*! \brief class for storing values on varying target word sizes */ +class TargetVal { + private: + size_t width_bits_; + uint64_t value_; + public: - /*! \brief construct a device address with value `value` */ - explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {} + /*! \brief construct a TargetVal matching the size of the given integral argument */ + template::value, T>::type> + explicit constexpr TargetVal(T value) : TargetVal(sizeof(T) * 8, value) {} + + /*! \brief construct an uninitialized value */ + TargetVal() : width_bits_{0}, value_{0} {} + + /*! \brief construct a TargetVal with explicit size and value */ + TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} { + CHECK(width_bits >= 8 && + width_bits <= 64 && + (width_bits & (width_bits - 1)) == 0) + << "width_bits must be a power of 2 in [8, 64], got " << width_bits; + value_ = value & Bitmask(); + } + + bool IsInitialized() const { return width_bits_ != 0; } + + size_t width_bits() const { + CHECK(IsInitialized()) << "TargetVal is not initialized"; + return width_bits_; + } - /*! \brief default constructor */ - DevPtr() : value_(TargetVal { .val64 = 0 }) {} + uint64_t Bitmask() const { + CHECK(IsInitialized()) << "TargetVal is not initialized"; + + if (width_bits_ == 64) { + return ~0UL; + } else { + return (1UL << width_bits_) - 1; + } + } + + uint32_t uint32() const { + CHECK(IsInitialized()) << "TargetVal is not initialized"; + CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is " + << width_bits_; + return uint32_t(value_ & Bitmask()); + } + + uint64_t uint64() const { + CHECK(IsInitialized()) << "TargetVal is not initialized"; + return value_; + } + + TargetVal& operator=(const TargetVal& other) { + CHECK(other.IsInitialized()) << "Cannot assign an uninitialized TargetVal"; + + if (!IsInitialized()) { + width_bits_ = other.width_bits_; + } + + CHECK(width_bits_ >= other.width_bits_) + << "Cannot assign TargetVal with width " << other.width_bits_ + << "bits to TargetVal with width " << width_bits_ << "bits"; + + value_ = other.value_ & Bitmask(); + return *this; + } +}; + +// TODO(weberlo, areusch): just get rid of `TargetPtr`. +/*! \brief absolute device address */ +class TargetPtr { + public: + /*! \brief construct a device address with variable-length value `value` */ + TargetPtr(TargetWordSize word_size, std::uint64_t value) : + value_(TargetVal(word_size.bits(), value)) {} /*! \brief construct a null address */ - explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {} + TargetPtr(TargetWordSize word_size, std::nullptr_t value) : + value_{TargetVal(word_size.bits(), 0)} {} + + /*! \brief construct an uninitialized pointer whose word_size can be changed once */ + TargetPtr() = default; + + /*! \brief construct a device address using the given TargetVal */ + explicit TargetPtr(const TargetVal& value) : value_{value} {} /*! \brief destructor */ - ~DevPtr() {} + ~TargetPtr() {} /*! * \brief get value of pointer @@ -86,33 +174,33 @@ class DevPtr { * \return casted result */ template - T cast_to() const { return reinterpret_cast(value_.val64); } + T cast_to() const { return reinterpret_cast(value_.uint64()); } /*! \brief check if location is null */ - bool operator==(std::nullptr_t) const { return value_.val64 == 0; } + bool operator==(std::nullptr_t) const { return value_.uint64() == 0; } /*! \brief check if location is not null */ - bool operator!=(std::nullptr_t) const { return value_.val64 != 0; } + bool operator!=(std::nullptr_t) const { return value_.uint64() != 0; } /*! \brief add an integer to this absolute address to get a larger absolute address */ - DevPtr operator+(size_t n) const { - return DevPtr(value_.val64 + n); + TargetPtr operator+(size_t n) const { + return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() + n); } /*! \brief mutably add an integer to this absolute address */ - DevPtr& operator+=(size_t n) { - value_.val64 += n; + TargetPtr& operator+=(size_t n) { + value_ = TargetVal(value_.width_bits(), value_.uint64() + n); return *this; } /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ - DevPtr operator-(size_t n) const { - return DevPtr(value_.val64 - n); + TargetPtr operator-(size_t n) const { + return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() - n); } /*! \brief mutably subtract an integer from this absolute address */ - DevPtr& operator-=(size_t n) { - value_.val64 -= n; + TargetPtr& operator-=(size_t n) { + value_ = TargetVal(value_.width_bits(), value_.uint64() - n); return *this; } @@ -137,7 +225,8 @@ class SymbolMap { * \param toolchain_prefix prefix of compiler toolchain to use */ SymbolMap(const std::string& binary, - const std::string& toolchain_prefix) { + const std::string& toolchain_prefix, + TargetWordSize word_size) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; @@ -152,7 +241,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = DevPtr(addr); + map_.emplace(std::make_pair(name, TargetPtr(word_size, addr))); stream >> name; stream >> std::hex >> addr; } @@ -163,7 +252,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - DevPtr operator[](const std::string& name) const { + TargetPtr operator[](const std::string& name) const { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; @@ -173,15 +262,21 @@ class SymbolMap { return map_.find(name) != map_.end(); } + void Dump(std::ostream& stream) const { + for (auto e : map_) { + stream << "Entry:" << e.first << std::endl; + } + } + private: /*! \brief backing map */ - std::unordered_map map_; + std::unordered_map map_; }; /*! \brief struct containing start and size of a device memory region */ struct DevMemRegion { /*! \brief section start offset */ - DevPtr start; + TargetPtr start; /*! \brief size of section */ size_t size; }; @@ -239,12 +334,12 @@ const char* SectionToString(SectionKind section); */ std::string RelocateBinarySections( const std::string& binary_path, - size_t word_size, - DevPtr text_start, - DevPtr rodata_start, - DevPtr data_start, - DevPtr bss_start, - DevPtr stack_end, + TargetWordSize word_size, + TargetPtr text_start, + TargetPtr rodata_start, + TargetPtr data_start, + TargetPtr bss_start, + TargetPtr stack_end, const std::string& toolchain_prefix); /*! @@ -263,13 +358,13 @@ std::string ReadSection(const std::string& binary, * \param binary input binary contents * \param section section type * \param toolchain_prefix prefix of compiler toolchain to use - * \param align alignment of the returned size (default: 8) + * \param word_size word size of the target, for alignment * \return size of the section if it exists, 0 otherwise */ size_t GetSectionSize(const std::string& binary_name, SectionKind section, const std::string& toolchain_prefix, - size_t align); + TargetWordSize word_size); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 3d0a6889c4f7..77ad86577f7e 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI { size_t alignment, DLDataType type_hint) final { ObjectPtr& session = MicroSession::Current(); - void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + TargetPtr data = session->AllocateInSection(SectionKind::kHeap, nbytes); CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; - MicroDevSpace* dev_space = new MicroDevSpace(); - dev_space->data = data; - dev_space->session = session; - return static_cast(dev_space); + return reinterpret_cast(new MicroDevSpace{data, session}); } void FreeDataSpace(TVMContext ctx, void* ptr) final { MicroDevSpace* dev_space = static_cast(ptr); - dev_space->session->FreeInSection( - SectionKind::kHeap, DevPtr(reinterpret_cast(dev_space->data))); + dev_space->session->FreeInSection(SectionKind::kHeap, dev_space->data); delete dev_space; } @@ -77,7 +73,6 @@ class MicroDeviceAPI final : public DeviceAPI { std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { // Copying from the device to the device. - MicroDevSpace* from_space = static_cast(const_cast(from)); MicroDevSpace* to_space = static_cast(const_cast(to)); CHECK(from_space->session == to_space->session) @@ -87,58 +82,65 @@ class MicroDeviceAPI final : public DeviceAPI { CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; ObjectPtr& session = from_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); - DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); - DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); + TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset); + TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset); std::vector buffer(size); lld->Read(from_dev_addr, static_cast(buffer.data()), size); lld->Write(to_dev_addr, static_cast(buffer.data()), size); + } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { // Reading from the device. - MicroDevSpace* from_space = static_cast(const_cast(from)); ObjectPtr& session = from_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); - DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); + TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset); void* to_host_ptr = GetHostLoc(to, to_offset); lld->Read(from_dev_addr, to_host_ptr, size); + } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { // Writing to the device. - MicroDevSpace* to_space = static_cast(const_cast(to)); ObjectPtr& session = to_space->session; + // flush all pending tasks to ensure data is consistent + session->FlushTaskQueue(); const std::shared_ptr& lld = session->low_level_device(); void* from_host_ptr = GetHostLoc(from, from_offset); - DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); + TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset); lld->Write(to_dev_addr, from_host_ptr, size); + } else { LOG(FATAL) << "Expect copy from/to micro device or between micro device\n"; } } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { + MicroSession::Current()->FlushTaskQueue(); } void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { + CHECK(false) << "the on-device workspace allocator isn't aware of this function"; ObjectPtr& session = MicroSession::Current(); - void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); - CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; - MicroDevSpace* dev_space = new MicroDevSpace(); - dev_space->data = data; - dev_space->session = session; - return static_cast(dev_space); + TargetPtr data = session->AllocateInSection(SectionKind::kWorkspace, size); + CHECK(data.value().uint64() != 0) + << "unable to allocate " << size << " bytes on device workspace"; + return static_cast(new MicroDevSpace{data, session}); } void FreeWorkspace(TVMContext ctx, void* data) final { + CHECK(false) << "the on-device workspace allocator isn't aware of this function"; MicroDevSpace* dev_space = static_cast(data); ObjectPtr& session = dev_space->session; - session->FreeInSection(SectionKind::kWorkspace, - DevPtr(reinterpret_cast(dev_space->data))); + session->FreeInSection(SectionKind::kWorkspace, dev_space->data); delete dev_space; } @@ -152,8 +154,8 @@ class MicroDeviceAPI final : public DeviceAPI { } private: - DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { - return DevPtr(reinterpret_cast(dev_space->data) + offset); + TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { + return dev_space->data + offset; } void* GetHostLoc(const void* ptr, size_t offset) { diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 50cee34be4a6..01056deeeb18 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -54,6 +54,8 @@ class MicroModuleNode final : public ModuleNode { * \param binary_path path of the binary to be loaded */ void InitMicroModule(const std::string& binary_path) { + // std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl; + // std::cout << " start" << std::endl; session_ = MicroSession::Current(); symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map; } @@ -67,26 +69,26 @@ class MicroModuleNode final : public ModuleNode { class MicroWrappedFunc { public: MicroWrappedFunc(ObjectPtr session, - DevPtr func_ptr) { + TargetPtr func_ptr) { session_ = session; func_ptr_ = func_ptr; } void operator()(TVMArgs args, TVMRetValue* rv) const { - *rv = session_->PushToExecQueue(func_ptr_, args); + session_->PushToTaskQueue(func_ptr_, args); } private: /*! \brief reference to the session for this function (to keep the session alive) */ ObjectPtr session_; /*! \brief offset of the function to be called */ - DevPtr func_ptr_; + TargetPtr func_ptr_; }; PackedFunc MicroModuleNode::GetFunction( const std::string& name, const ObjectPtr& sptr_to_self) { - DevPtr func_ptr; + TargetPtr func_ptr; if (name == tvm::runtime::symbol::tvm_module_main) { if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) { func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main]; diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h index 5c75f92737ab..2067794f393f 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -23,6 +23,7 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ #define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ +#include #include #include "micro_common.h" @@ -38,15 +39,18 @@ class MicroSectionAllocator { * \brief constructor that specifies section boundaries * \param region location and size of the section on the device */ - explicit MicroSectionAllocator(DevMemRegion region, size_t word_size) - : start_addr_(region.start), + explicit MicroSectionAllocator(std::string section_name, + DevMemRegion region, + TargetWordSize word_size) + : section_name_(section_name), + start_addr_(region.start), size_(0), capacity_(region.size), word_size_(word_size) { - CHECK_EQ(start_addr_.value().val64 % word_size, 0) - << "micro section start not aligned to " << word_size << " bytes"; - CHECK_EQ(capacity_ % word_size, 0) - << "micro section end not aligned to " << word_size << " bytes"; + CHECK_EQ(start_addr_.value().uint64() % word_size.bytes(), 0) + << "micro section start not aligned to " << word_size.bytes() << " bytes"; + CHECK_EQ(capacity_ % word_size.bytes(), 0) + << "micro section end not aligned to " << word_size.bytes() << " bytes"; } /*! @@ -56,17 +60,18 @@ class MicroSectionAllocator { /*! * \brief memory allocator - * \param size size of allocated memory in bytes + * \param alloc_size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - DevPtr Allocate(size_t size) { - size_ = UpperAlignValue(size_, word_size_); + TargetPtr Allocate(size_t size) { + size_ = UpperAlignValue(size_, word_size_.bytes()); CHECK(size_ + size < capacity_) - << "cannot alloc " << size << " bytes in section with start_addr " << - start_addr_.cast_to(); - DevPtr alloc_addr = start_addr_ + size_; + << "cannot alloc " << size << " bytes in section \"" + << section_name_ << "\" (start_addr=" << start_addr_.cast_to() + << ", used=" << size_ << ", capacity=" << capacity_ << ")"; + TargetPtr alloc_addr = start_addr_ + size_; size_ += size; - alloc_map_[alloc_addr.value().val64] = size; + alloc_map_[alloc_addr.value().uint64()] = size; return alloc_addr; } @@ -75,10 +80,10 @@ class MicroSectionAllocator { * \param offs offset to allocated memory * \note simple allocator scheme, more complex versions will be implemented later */ - void Free(DevPtr addr) { - CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end()) + void Free(TargetPtr addr) { + CHECK(alloc_map_.find(addr.value().uint64()) != alloc_map_.end()) << "freed pointer was never allocated"; - alloc_map_.erase(addr.value().val64); + alloc_map_.erase(addr.value().uint64()); if (alloc_map_.empty()) { size_ = 0; } @@ -87,17 +92,17 @@ class MicroSectionAllocator { /*! * \brief start offset of the memory region managed by this allocator */ - DevPtr start_addr() const { return start_addr_; } + TargetPtr start_addr() const { return start_addr_; } /*! * \brief current end addr of the space being used in this memory region */ - DevPtr curr_end_addr() const { return start_addr_ + size_; } + TargetPtr curr_end_addr() const { return start_addr_ + size_; } /*! * \brief end addr of the memory region managed by this allocator */ - DevPtr max_addr() const { return start_addr_ + capacity_; } + TargetPtr max_addr() const { return start_addr_ + capacity_; } /*! * \brief size of the section @@ -110,14 +115,16 @@ class MicroSectionAllocator { size_t capacity() const { return capacity_; } private: + /*! \brief name of the section (for debugging) */ + std::string section_name_; /*! \brief start address of the section */ - DevPtr start_addr_; + TargetPtr start_addr_; /*! \brief current size of the section */ size_t size_; /*! \brief total storage capacity of the section */ size_t capacity_; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; /*! \brief allocation map for allocation sizes */ std::unordered_map alloc_map_; }; diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 4bdc8ed69797..0e8e1693448e 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -23,7 +23,10 @@ #include #include +#include +#include #include +#include #include #include #include @@ -77,14 +80,16 @@ MicroSession::MicroSession( size_t workspace_size, uint64_t stack_start, size_t stack_size, - size_t word_size, + TargetWordSize word_size, bool thumb_mode, + bool use_device_timer, const std::string& server_addr, int port) - : toolchain_prefix_(toolchain_prefix) - , word_size_(word_size) - , thumb_mode_(thumb_mode) { - CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_; + : toolchain_prefix_(toolchain_prefix), + word_size_(word_size), + thumb_mode_(thumb_mode), + use_device_timer_(use_device_timer), + batch_args_encoder_(args_size, word_size) { if (comms_method == "host") { // TODO(weberlo): move checks to python CHECK( @@ -99,106 +104,138 @@ MicroSession::MicroSession( size_t memory_size = text_size + rodata_size + data_size + bss_size + args_size + heap_size + workspace_size + stack_size; - void* base_addr; + TargetPtr base_addr; low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr); - CHECK_EQ(reinterpret_cast(base_addr) % word_size_, 0) - << "base address not aligned to " << word_size_ << " bytes"; - DevPtr curr_addr = DevPtr(reinterpret_cast(base_addr)); + CHECK_EQ(base_addr.value().uint64() % word_size.bytes(), 0) + << "base address not aligned to " << word_size.bytes() << " bytes"; + TargetPtr curr_addr = base_addr; - section_allocators_[0] = std::make_shared(DevMemRegion { + section_allocators_[0] = std::make_shared( + "text", + DevMemRegion { .start = curr_addr, .size = text_size, - }, word_size_); + }, word_size_); curr_addr += text_size; - section_allocators_[1] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = rodata_size, - }, word_size_); + section_allocators_[1] = std::make_shared( + "rodata", + DevMemRegion { + .start = curr_addr, + .size = rodata_size, + }, word_size_); curr_addr += rodata_size; - section_allocators_[2] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = data_size, - }, word_size_); + section_allocators_[2] = std::make_shared( + "data", + DevMemRegion { + .start = curr_addr, + .size = data_size, + }, word_size_); curr_addr += data_size; - section_allocators_[3] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = bss_size, - }, word_size_); + section_allocators_[3] = std::make_shared( + "bss", + DevMemRegion { + .start = curr_addr, + .size = bss_size, + }, word_size_); curr_addr += bss_size; - section_allocators_[4] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = args_size, - }, word_size_); + section_allocators_[4] = std::make_shared( + "args", + DevMemRegion { + .start = curr_addr, + .size = args_size, + }, word_size_); curr_addr += args_size; - section_allocators_[5] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = heap_size, - }, word_size_); + section_allocators_[5] = std::make_shared( + "heap", + DevMemRegion { + .start = curr_addr, + .size = heap_size, + }, word_size_); curr_addr += heap_size; - section_allocators_[6] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = workspace_size, - }, word_size_); + section_allocators_[6] = std::make_shared( + "workspace", + DevMemRegion { + .start = curr_addr, + .size = workspace_size, + }, word_size_); curr_addr += workspace_size; - section_allocators_[7] = std::make_shared(DevMemRegion { - .start = curr_addr, - .size = stack_size, - }, word_size_); + section_allocators_[7] = std::make_shared( + "stack", + DevMemRegion { + .start = curr_addr, + .size = stack_size, + }, word_size_); curr_addr += stack_size; } else if (comms_method == "openocd") { low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port); - section_allocators_[0] = std::make_shared(DevMemRegion { - .start = DevPtr(text_start), - .size = text_size, - }, word_size_); - section_allocators_[1] = std::make_shared(DevMemRegion { - .start = DevPtr(rodata_start), - .size = rodata_size, - }, word_size_); - section_allocators_[2] = std::make_shared(DevMemRegion { - .start = DevPtr(data_start), - .size = data_size, - }, word_size_); - section_allocators_[3] = std::make_shared(DevMemRegion { - .start = DevPtr(bss_start), - .size = bss_size, - }, word_size_); - section_allocators_[4] = std::make_shared(DevMemRegion { - .start = DevPtr(args_start), - .size = args_size, - }, word_size_); - section_allocators_[5] = std::make_shared(DevMemRegion { - .start = DevPtr(heap_start), - .size = heap_size, - }, word_size_); - section_allocators_[6] = std::make_shared(DevMemRegion { - .start = DevPtr(workspace_start), - .size = workspace_size, - }, word_size_); - section_allocators_[7] = std::make_shared(DevMemRegion { - .start = DevPtr(stack_start), - .size = stack_size, - }, word_size_); + section_allocators_[0] = std::make_shared( + "text", + DevMemRegion { + .start = TargetPtr(word_size_, text_start), + .size = text_size, + }, word_size_); + section_allocators_[1] = std::make_shared( + "rodata", + DevMemRegion { + .start = TargetPtr(word_size_, rodata_start), + .size = rodata_size, + }, word_size_); + section_allocators_[2] = std::make_shared( + "data", + DevMemRegion { + .start = TargetPtr(word_size_, data_start), + .size = data_size, + }, word_size_); + section_allocators_[3] = std::make_shared( + "bss", + DevMemRegion { + .start = TargetPtr(word_size_, bss_start), + .size = bss_size, + }, word_size_); + section_allocators_[4] = std::make_shared( + "args", + DevMemRegion { + .start = TargetPtr(word_size_, args_start), + .size = args_size, + }, word_size_); + section_allocators_[5] = std::make_shared( + "heap", + DevMemRegion { + .start = TargetPtr(word_size_, heap_start), + .size = heap_size, + }, word_size_); + section_allocators_[6] = std::make_shared( + "workspace", + DevMemRegion { + .start = TargetPtr(word_size_, workspace_start), + .size = workspace_size, + }, word_size_); + section_allocators_[7] = std::make_shared( + "stack", + DevMemRegion { + .start = TargetPtr(word_size_, stack_start), + .size = stack_size, + }, word_size_); } else { LOG(FATAL) << "unsupported micro low-level device"; } + TargetPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr(); + batch_args_encoder_.set_start_addr(args_start_addr); + runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map; // Patch pointers to define the bounds of the workspace section and the word // size (for allocation alignment). std::shared_ptr ws_allocator = GetAllocator(SectionKind::kWorkspace); - TargetVal ws_start = ws_allocator->start_addr().value(); - TargetVal ws_end = ws_allocator->max_addr().value(); - TargetVal target_word_size { .val64 = word_size_ }; - if (word_size_ == 4) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32); - } else if (word_size_ == 8) { - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64); - DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64); - DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_allocator->start_addr()); + DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_allocator->max_addr()); + if (word_size.bytes() == 4) { + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint32_t(word_size.bytes())); + } else if (word_size.bytes() == 8) { + DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint64_t(word_size.bytes())); + } else { + CHECK(false) << "Unsupported word size unexpectedly here"; } } @@ -209,59 +246,122 @@ MicroSession::~MicroSession() { low_level_device_ = nullptr; } -double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) { +void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) { if (thumb_mode_) { + // TODO(areusch): should be |= func_ptr += 1; } + TargetVal func_dev_addr = func_ptr.value(); + + std::tuple arg_field_addrs = EncoderAppend(&batch_args_encoder_, args); + TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).value()}; + TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).value()}; + + task_queue_.push_back( + DevTask { + .func = func_dev_addr, + .arg_values = arg_values_dev_addr, + .arg_type_codes = arg_type_codes_dev_addr, + .num_args = args.num_args + }); + + if (task_queue_.size() == MicroSession::kTaskQueueCapacity) { + FlushTaskQueue(); + } +} - // Create an allocator stream for the memory region after the most recent - // allocation in the args section. - DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr(); - TargetDataLayoutEncoder encoder(args_addr, word_size_); - - std::tuple arg_field_addrs = EncoderAppend(&encoder, args); - - // Flush `stream` to device memory. - DevPtr stream_dev_addr = - GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); - low_level_device()->Write(stream_dev_addr, - reinterpret_cast(encoder.data()), - encoder.buf_size()); - - TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value(); - TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value(); - if (word_size_ == 4) { - UTVMTask32 task = { - .func = func_ptr.value().val32, - .arg_values = arg_values_dev_addr.val32, - .arg_type_codes = arg_type_codes_dev_addr.val32, - .num_args = args.num_args, - }; - // Write the task. - DevSymbolWrite(runtime_symbol_map_, "utvm_task", task); - } else if (word_size_ == 8) { - UTVMTask64 task = { - .func = func_ptr.value().val64, - .arg_values = arg_values_dev_addr.val64, - .arg_type_codes = arg_type_codes_dev_addr.val64, - .num_args = args.num_args, - }; - // Write the task. - DevSymbolWrite(runtime_symbol_map_, "utvm_task", task); +void MicroSession::FlushTaskQueue() { + if (task_queue_.size() == 0) { + // nothing to run + return; + } + if (word_size_.bytes() == 4) { + FlushTaskQueuePriv(); + } else if (word_size_.bytes() == 8) { + FlushTaskQueuePriv(); } +} - DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; - DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; +template +void MicroSession::FlushTaskQueuePriv() { + std::vector prepped_tasks; + for (const auto& task : task_queue_) { + prepped_tasks.push_back(T(task)); + } + + // Flush `args` to device memory. + low_level_device()->Write( + batch_args_encoder_.start_addr(), + reinterpret_cast(batch_args_encoder_.data()), + batch_args_encoder_.buf_size()); + + // Flush `tasks` to device memory. + TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"]; + low_level_device()->Write( + dev_tasks_addr, + reinterpret_cast(prepped_tasks.data()), + prepped_tasks.size() * sizeof(T)); + DevSymbolWrite(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size()); + + TargetPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; + TargetPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; if (thumb_mode_) { + // TODO(areusch): should be |= utvm_init_addr += 1; } + std::chrono::time_point< + std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend; + tbegin = std::chrono::high_resolution_clock::now(); + // std::string tmp; + // while (tmp[0] != 'd' && tmp[0] != 'e') { + // std::cout << "How to proceed? [Debug / Execute] "; + // getline(std::cin, tmp); + // CHECK(std::cin.good()) << "Stdin closed"; + // tmp[0] = std::tolower(tmp[0]); + // } + // if (tmp[0] == 'd') { + // std::cout << "Launch debugger; [Enter] to resume automated execution"; + // getline(std::cin, tmp); + // } else { low_level_device()->Execute(utvm_init_addr, utvm_done_addr); + // } + tend = std::chrono::high_resolution_clock::now(); + // Check if there was an error during execution. If so, log it. CheckDeviceError(); - uint32_t task_time = DevSymbolRead(runtime_symbol_map_, "utvm_task_time"); - GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr); - return static_cast(task_time); + + if (use_device_timer_) { + uint64_t sum = 0; + std::vector times; + times.resize(task_queue_.size()); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], + times.data(), + task_queue_.size() * sizeof(uint32_t)); + int i = 0; + for (uint32_t time : times) { + LOG(INFO) << "Time " << i++ << ": " << time; + sum += time; + } + last_batch_time_ += static_cast(sum) / 1e3; + } else { + last_batch_time_ += std::chrono::duration_cast > + (tend - tbegin).count() * 1000; + // TODO(weberlo): Reading internal data structure is hacky. + uint64_t sum = 0; + std::vector times; + times.resize(task_queue_.size()); + low_level_device()->Read(runtime_symbol_map_["utvm_task_times"], + times.data(), + task_queue_.size() * sizeof(uint32_t)); + for (uint32_t time : times) { + sum += time; + } + last_batch_cycles_ += static_cast(sum); + } + + batch_args_encoder_.Clear(); + task_queue_.clear(); } BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { @@ -283,9 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); data_section.start = AllocateInSection(SectionKind::kData, data_section.size); bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); - CHECK(text_section.start != nullptr && rodata_section.start != nullptr && - data_section.start != nullptr && bss_section.start != nullptr) - << "not enough space to load module on device"; std::string relocated_bin = RelocateBinarySections( binary_path, @@ -305,7 +402,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); - SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; + SymbolMap symbol_map {relocated_bin, toolchain_prefix_, word_size_}; if (patch_dylib_pointers) { // Patch device lib pointers. @@ -323,7 +420,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d }; } -std::tuple MicroSession::EncoderAppend( +std::tuple MicroSession::EncoderAppend( TargetDataLayoutEncoder* encoder, const TVMArgs& args) { const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -341,12 +438,13 @@ std::tuple MicroSession::EncoderAppend( // order to prevent premature session destruction. void* old_data = base_arr_handle->data; // Mutate the array to unwrap the `data` field. - base_arr_handle->data = reinterpret_cast(old_data)->data; + MicroDevSpace* dev_arr_ptr = reinterpret_cast(old_data); + base_arr_handle->data = reinterpret_cast(dev_arr_ptr->data.value().uint64()); // Now, encode the unwrapped version. void* arr_ptr = nullptr; - if (word_size_ == 4) { + if (word_size_.bytes() == 4) { arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); - } else if (word_size_ == 8) { + } else if (word_size_.bytes() == 8) { arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); } // And restore the original wrapped version. @@ -371,7 +469,7 @@ std::tuple MicroSession::EncoderAppend( } template -DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) { +TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) { auto tvm_arr_slot = encoder->Alloc(); auto shape_slot = encoder->Alloc(arr.ndim); @@ -379,8 +477,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen // the device first. The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. shape_slot.WriteArray(arr.shape, arr.ndim); - DevPtr shape_dev_addr = shape_slot.start_addr(); - DevPtr strides_dev_addr = DevPtr(nullptr); + TargetPtr shape_dev_addr = shape_slot.start_addr(); + TargetPtr strides_dev_addr = TargetPtr(word_size_, nullptr); if (arr.strides != nullptr) { auto stride_slot = encoder->Alloc(arr.ndim); stride_slot.WriteArray(arr.strides, arr.ndim); @@ -388,13 +486,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen } T dev_arr( - TargetVal { .val64 = reinterpret_cast(arr.data) }, + TargetVal { word_size_.bits(), reinterpret_cast(arr.data) }, arr.ctx, arr.ndim, arr.dtype, shape_dev_addr.value(), strides_dev_addr.value(), - TargetVal { .val64 = arr.byte_offset }); + TargetVal { word_size_.bits(), arr.byte_offset }); CHECK(dev_arr.ctx.device_type == static_cast(kDLMicroDev)) << "attempt to write DLTensor with non-micro device type"; // Update the device type to CPU, because from the microcontroller's @@ -404,39 +502,70 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen return tvm_arr_slot.start_addr(); } +// TODO(weberlo): switch over entirely to error codes that expand to error +// messages on the host side. void MicroSession::CheckDeviceError() { - int32_t return_code = DevSymbolRead(runtime_symbol_map_, "utvm_return_code"); - - if (return_code) { - std::uintptr_t last_error = - DevSymbolRead(runtime_symbol_map_, "utvm_last_error"); - std::string last_error_str; - if (last_error) { - DevPtr last_err_addr = DevPtr(last_error); - last_error_str = ReadString(last_err_addr); + int32_t last_error = DevSymbolRead(runtime_symbol_map_, "utvm_last_error"); + + if (last_error) { + if (!use_device_timer_ && + (last_error == UTVM_ERR_TIMER_OVERFLOW || + last_error == UTVM_ERR_TIMER_NOT_IMPLEMENTED)) { + // these errors don't matter if we're not using the on-device timer + return; + } + std::string err_msg; + switch (last_error) { + case UTVM_ERR_NOT_FINISHED: + err_msg = "execution timed out"; + break; + case UTVM_ERR_TIMER_NOT_IMPLEMENTED: + err_msg = "timer is not implemented for the target device"; + break; + case UTVM_ERR_TIMER_OVERFLOW: + // TODO(weberlo): this should be remedied by using interrupts to accumulate the + // timer into a larger datatype (ARM timers are only 24 bits) + err_msg = "timer overflowed during execution"; + break; + case UTVM_ERR_WS_DOUBLE_FREE: + err_msg = "free called with no active workspace allocations"; + break; + case UTVM_ERR_WS_OUT_OF_SPACE: + err_msg = "ran out of space in workspace section"; + break; + case UTVM_ERR_WS_TOO_MANY_ALLOCS: + err_msg = "exceeded number of allocs the runtime can keep track of"; + break; + case UTVM_ERR_WS_ZERO_SIZE_ALLOC: + err_msg = "attempt to allocate scratchpad of size zero"; + break; + case UTVM_ERR_WS_UNALIGNED_START: + err_msg = "start of workspace section is not word-aligned"; + break; + case UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE: + err_msg = "scratchpad allocation size is not a multiple of the word size"; + break; + default: + err_msg = "unknown error code"; + break; } LOG(FATAL) << "error during micro function execution:\n" - << " return code: " << std::dec << return_code << "\n" - << " dev str addr: 0x" << std::hex << last_error << "\n" - << " dev str data: " << last_error_str << std::endl; + << " error ID: " << std::dec << last_error << std::endl + << " error message: " << err_msg; } } void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) { - DevPtr runtime_impl_addr = runtime_symbol_map_[func_name]; + TargetPtr runtime_impl_addr = runtime_symbol_map_[func_name]; if (thumb_mode_) { runtime_impl_addr += 1; } std::ostringstream func_name_underscore; func_name_underscore << func_name << "_"; - if (word_size_ == 4) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32); - } else if (word_size_ == 8) { - DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64); - } + DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr); } -std::string MicroSession::ReadString(DevPtr str_addr) { +std::string MicroSession::ReadString(TargetPtr str_addr) { std::ostringstream result; const size_t buf_size = 256; std::vector buf(buf_size, 0); @@ -454,27 +583,39 @@ std::string MicroSession::ReadString(DevPtr str_addr) { return result.str(); } -DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { +TargetPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { return GetAllocator(type)->Allocate(size); } -void MicroSession::FreeInSection(SectionKind type, DevPtr addr) { +void MicroSession::FreeInSection(SectionKind type, TargetPtr addr) { return GetAllocator(type)->Free(addr); } template T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { - DevPtr sym_addr = symbol_map[symbol]; + TargetPtr sym_addr = symbol_map[symbol]; T result; low_level_device()->Read(sym_addr, &result, sizeof(T)); return result; } +void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, + const std::string& symbol, + const TargetPtr& ptr) { + if (word_size_.bytes() == 4) { + DevSymbolWrite(symbol_map, symbol, ptr.value().uint32()); + } else if (word_size_.bytes() == 8) { + DevSymbolWrite(symbol_map, symbol, ptr.value().uint64()); + } else { + CHECK(false) << "Unsupported word size unexpectedly here"; + } +} + template void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, const std::string& symbol, const T& value) { - DevPtr sym_addr = symbol_map[symbol]; + TargetPtr sym_addr = symbol_map[symbol]; low_level_device()->Write(sym_addr, &value, sizeof(T)); } @@ -489,11 +630,55 @@ PackedFunc MicroSession::GetFunction( return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { MicroSession::ExitWithScope(); }); + // TODO(weberlo): add a `clear_batch_timer` func + } else if (name == "get_last_batch_time") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetLastBatchTime(); + }); + // TODO(weberlo): remove this func + } else if (name == "get_last_batch_cycles") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetLastBatchCycles(); + }); } else { return PackedFunc(); } } +TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator") +.set_body([](TVMArgs args, TVMRetValue* rv) { + PackedFunc pf = args[0]; + TVMContext ctx = args[1]; + uint64_t number = args[2]; + uint64_t repeat = args[3]; + + auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { + TVMRetValue temp; + std::ostringstream os; + + for (unsigned int i = 0; i < repeat; ++i) { + // start timing + CHECK(number < MicroSession::kTaskQueueCapacity) + << "`number` must be less than uTVM task queue capacity"; + for (unsigned int j = 0; j < number; ++j) { + pf.CallPacked(args, &temp); + } + ObjectPtr session = MicroSession::Current(); + DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + double time_per_batch = session->GetLastBatchTime() / number; + os.write(reinterpret_cast(&time_per_batch), sizeof(time_per_batch)); + } + std::string blob = os.str(); + TVMByteArray arr; + arr.size = blob.length(); + arr.data = blob.data(); + // return the time. + *rv = arr; + }; + *rv = PackedFunc(ftimer); +}); + + // create micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._CreateSession") .set_body([](TVMArgs args, TVMRetValue* rv) { @@ -501,25 +686,26 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") const std::string& binary_path = args[1]; const std::string& toolchain_prefix = args[2]; uint64_t text_start = args[3]; - size_t text_size = args[4]; + size_t text_size = uint64_t(args[4]); uint64_t rodata_start = args[5]; - size_t rodata_size = args[6]; + size_t rodata_size = uint64_t(args[6]); uint64_t data_start = args[7]; - size_t data_size = args[8]; + size_t data_size = uint64_t(args[8]); uint64_t bss_start = args[9]; - size_t bss_size = args[10]; + size_t bss_size = uint64_t(args[10]); uint64_t args_start = args[11]; - size_t args_size = args[12]; + size_t args_size = uint64_t(args[12]); uint64_t heap_start = args[13]; - size_t heap_size = args[14]; + size_t heap_size = uint64_t(args[14]); uint64_t workspace_start = args[15]; - size_t workspace_size = args[16]; + size_t workspace_size = uint64_t(args[16]); uint64_t stack_start = args[17]; - size_t stack_size = args[18]; - size_t word_size = args[19]; + size_t stack_size = uint64_t(args[18]); + TargetWordSize word_size{uint64_t(args[19])}; bool thumb_mode = args[20]; - const std::string& server_addr = args[21]; - int port = args[22]; + bool use_device_timer = args[21]; + const std::string& server_addr = args[22]; + int port = args[23]; ObjectPtr session = make_object( comms_method, binary_path, @@ -542,6 +728,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") stack_size, word_size, thumb_mode, + use_device_timer, server_addr, port); *rv = Module(session); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 9e844e8b2140..bf0996c3e8c4 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -52,6 +52,8 @@ namespace tvm { namespace runtime { +struct DevTask; + /*! * \brief session for facilitating micro device interaction */ @@ -66,6 +68,9 @@ class MicroSession : public ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + // todo having this decoupled from the value in utvm_runtime.c gives me stress dreams + static const size_t kTaskQueueCapacity = 20; + /*! * \return The type key of the executor. */ @@ -94,7 +99,7 @@ class MicroSession : public ModuleNode { * \param workspace_size workspace section size * \param stack_start stack section start address * \param stack_size stack section size - * \param word_size number of bytes in a word on the target device + * \param word_size_bytes number of bytes in a word on the target device * \param thumb_mode whether the target device requires a thumb-mode bit on function addresses * \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`) * \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`) @@ -119,8 +124,9 @@ class MicroSession : public ModuleNode { size_t workspace_size, uint64_t stack_start, size_t stack_size, - size_t word_size, + TargetWordSize word_size, bool thumb_mode, + bool use_device_timer, const std::string& server_addr, int port); @@ -137,7 +143,19 @@ class MicroSession : public ModuleNode { * \param args args to the packed function * \return elapsed time during function execution on the device */ - double PushToExecQueue(DevPtr func, const TVMArgs& args); + void PushToTaskQueue(TargetPtr func, const TVMArgs& args); + + /*! + * \brief serialize runtime metadata to the device for enqueued tasks and execute + * \return elapsed time during function execution on the device + */ + void FlushTaskQueue(); + + /*! + * \brief TODO + */ + template + void FlushTaskQueuePriv(); /*! * \brief loads binary onto device @@ -153,21 +171,21 @@ class MicroSession : public ModuleNode { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - DevPtr AllocateInSection(SectionKind type, size_t size); + TargetPtr AllocateInSection(SectionKind type, size_t size); /*! * \brief free prior allocation from section * \param type type of section to allocate in * \param addr device address of allocated memory */ - void FreeInSection(SectionKind type, DevPtr addr); + void FreeInSection(SectionKind type, TargetPtr addr); /*! * \brief read string from device to host * \param str_addr device address of first character of string * \return host copy of device string that was read */ - std::string ReadString(DevPtr str_addr); + std::string ReadString(TargetPtr str_addr); /*! * \brief read value of symbol from device memory @@ -178,6 +196,16 @@ class MicroSession : public ModuleNode { template T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol); + /*! + * \brief write pointer value into device memory corresponding to symbol + * \param symbol_map symbol map to read location of symbol from + * \param symbol name of symbol being written to + * \param ptr pointer value to write into symbol + */ + void DevSymbolWrite(const SymbolMap& symbol_map, + const std::string& symbol, + const TargetPtr& ptr); + /*! * \brief write value into device memory corresponding to symbol * \param symbol_map symbol map to read location of symbol from @@ -196,6 +224,18 @@ class MicroSession : public ModuleNode { return low_level_device_; } + const double GetLastBatchTime() { + double result = last_batch_time_; + last_batch_time_ = 0.0; + return result; + } + + const double GetLastBatchCycles() { + double result = last_batch_cycles_; + last_batch_cycles_ = 0.0; + return result; + } + private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -205,7 +245,7 @@ class MicroSession : public ModuleNode { std::shared_ptr section_allocators_[static_cast(SectionKind::kNumKinds)]; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; /*! \brief whether the target device requires a thumb-mode bit on function addresses * * ARM and other manufacturers use the lowest bit of a function address to determine @@ -213,8 +253,20 @@ class MicroSession : public ModuleNode { * results in more compact binaries. */ bool thumb_mode_; + /*! \brief TODO */ + bool use_device_timer_; /*! \brief symbol map for the device runtime */ SymbolMap runtime_symbol_map_; + /*! \brief TODO */ + std::vector task_queue_; + // TODO(weberlo): we don't even need an allocator mechanism for the args + // section. there's only ever one allocation. + /*! \brief TODO hack */ + TargetDataLayoutEncoder batch_args_encoder_; + /*! \brief TODO hack */ + double last_batch_time_; + /*! \brief TODO hack */ + double last_batch_cycles_; /*! * \brief patches a function pointer in this module to an implementation @@ -228,7 +280,8 @@ class MicroSession : public ModuleNode { * \param args args to be appended * \return device address of the allocated args */ - std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); + std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, + const TVMArgs& args); /*! * \brief appends a `DLTensor` to the host-side buffer of `encoder` @@ -237,7 +290,7 @@ class MicroSession : public ModuleNode { * \return device address of the allocated `DLTensor` */ template - DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr); + TargetPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr); /*! * \brief checks and logs if there was an error during the device's most recent execution @@ -274,7 +327,7 @@ class MicroSession : public ModuleNode { */ struct MicroDevSpace { /*! \brief data being wrapped */ - void* data; + TargetPtr data; /*! \brief shared ptr to session where this data is valid */ ObjectPtr session; }; @@ -291,18 +344,22 @@ struct TVMArray32 { TargetVal shape, TargetVal strides, TargetVal byte_offset) - : data(data.val32), + : data(data.uint32()), ctx(ctx), ndim(ndim), pad0(0), dtype(dtype), - shape(shape.val32), - strides(strides.val32), + shape(shape.uint32()), + strides(strides.uint32()), pad1(0), - byte_offset(byte_offset.val32), + byte_offset(byte_offset.uint32()), pad2(0) { } - /*! \brief opaque pointer to the allocated data */ + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ uint32_t data; /*! \brief The device context of the tensor */ DLContext ctx; @@ -337,16 +394,19 @@ struct TVMArray64 { TargetVal shape, TargetVal strides, TargetVal byte_offset) - : data(data.val64), + : data(data.uint64()), ctx(ctx), ndim(ndim), pad0(0), dtype(dtype), - shape(shape.val64), - strides(strides.val64), - byte_offset(byte_offset.val64) { } - - /*! \brief opaque pointer to the allocated data */ + shape(shape.uint64()), + strides(strides.uint64()), + byte_offset(byte_offset.uint64()) { } + /*! + * \brief The opaque data pointer points to the allocated data. + * This will be CUDA device pointer or cl_mem handle in OpenCL. + * This pointer is always aligns to 256 bytes as in CUDA. + */ uint64_t data; /*! \brief The device context of the tensor */ DLContext ctx; @@ -367,8 +427,26 @@ struct TVMArray64 { uint64_t byte_offset; }; +/*! \brief MicroTVM task to store in task queue before specializing to word size */ +struct DevTask { + /*! \brief Pointer to function to call for this task */ + TargetVal func; + /*! \brief Array of argument values */ + TargetVal arg_values; + /*! \brief Array of type codes for each argument value */ + TargetVal arg_type_codes; + /*! \brief Number of arguments */ + int32_t num_args; +}; + /*! \brief MicroTVM task for serialization to 32-bit devices */ typedef struct StructUTVMTask32 { + StructUTVMTask32(DevTask task) + : func(task.func.uint32()), + arg_values(task.arg_values.uint32()), + arg_type_codes(task.arg_type_codes.uint32()), + num_args(task.num_args) { } + /*! \brief Pointer to function to call for this task */ uint32_t func; /*! \brief Array of argument values */ @@ -377,10 +455,16 @@ typedef struct StructUTVMTask32 { uint32_t arg_type_codes; /*! \brief Number of arguments */ int32_t num_args; -} UTVMTask32; +} StructUTVMTask32; /*! \brief MicroTVM task for serialization to 64-bit devices */ typedef struct StructUTVMTask64 { + StructUTVMTask64(DevTask task) + : func(task.func.uint64()), + arg_values(task.arg_values.uint64()), + arg_type_codes(task.arg_type_codes.uint64()), + num_args(task.num_args) { } + /*! \brief Pointer to function to call for this task */ uint64_t func; /*! \brief Array of argument values */ @@ -389,7 +473,7 @@ typedef struct StructUTVMTask64 { uint64_t arg_type_codes; /*! \brief Number of arguments */ int32_t num_args; -} UTVMTask64; +} StructUTVMTask64; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index e5c83e590c36..0f21d6674740 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { socket_.SendCommand(); } - void Read(DevPtr addr, void* buf, size_t num_bytes) { + void Read(TargetPtr addr, void* buf, size_t num_bytes) override { if (num_bytes == 0) { return; } @@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } { - socket_.cmd_builder() << "ocd_echo $output"; + socket_.cmd_builder() << "return $output"; socket_.SendCommand(); const std::string& reply = socket_.last_reply(); @@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Write(DevPtr addr, const void* buf, size_t num_bytes) { + void Write(TargetPtr addr, const void* buf, size_t num_bytes) override { if (num_bytes == 0) { return; } @@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { } } - void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { + void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) override { socket_.cmd_builder() << "halt 0"; socket_.SendCommand(); @@ -207,12 +207,12 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { /*! \brief number of bytes in a word on the target device (64-bit) */ static const constexpr ssize_t kWordSize = 8; - // NOTE: OpenOCD will call any request larger than this constant an "absurd - // request". + // NOTE: The OS pipe buffer must be able to handle a line long enough to + // print this transfer request. /*! \brief maximum number of bytes allowed in a single memory transfer */ - static const constexpr ssize_t kMemTransferLimit = 64000; + static const constexpr ssize_t kMemTransferLimit = 8000; /*! \brief number of milliseconds to wait for function execution to halt */ - static const constexpr int kWaitTime = 10000; + static const constexpr int kWaitTime = 30000; }; const std::shared_ptr OpenOCDLowLevelDeviceCreate(const std::string& server_addr, diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index e0275165e774..c99d79652edb 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -30,7 +30,7 @@ namespace tvm { namespace runtime { -// TODO(weberlo): Handle endianness. +// TODO(weberlo, areusch): Handle endianness. /*! * \brief data encoder for uTVM that builds a host-side buffer @@ -50,7 +50,7 @@ class TargetDataLayoutEncoder { * \param size size (in bytes) of the memory region allocated for this slot * \param start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, TargetPtr start_addr); ~Slot(); @@ -71,7 +71,7 @@ class TargetDataLayoutEncoder { * \brief returns start address of the slot in device memory * \return device start address */ - DevPtr start_addr(); + TargetPtr start_addr(); /*! * \brief returns number of bytes allocated for this slot @@ -89,16 +89,17 @@ class TargetDataLayoutEncoder { /*! \brief size (in bytes) of the memory region allocated for this slot */ size_t size_; /*! \brief start address of the slot in the device's memory */ - DevPtr start_addr_; + TargetPtr start_addr_; }; /*! * \brief constructor * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size) - : buf_(std::vector()), curr_offset_(0), word_size_(word_size) { - start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_)); + explicit TargetDataLayoutEncoder(size_t capacity, TargetWordSize word_size) + : buf_(std::vector()), curr_offset_(0), + start_addr_(word_size, nullptr), + capacity_(capacity), word_size_(word_size) { } /*! @@ -108,14 +109,20 @@ class TargetDataLayoutEncoder { */ template Slot Alloc(size_t num_elems = 1) { - curr_offset_ = UpperAlignValue(curr_offset_, word_size_); + curr_offset_ = UpperAlignValue(curr_offset_, word_size_.bytes()); size_t size = sizeof(T) * num_elems; if (curr_offset_ + size > buf_.size()) { buf_.resize(curr_offset_ + size); } + CHECK(buf_.size() < capacity_) << "out of space in data encoder"; size_t slot_start_offset = curr_offset_; curr_offset_ += size; - return Slot(this, slot_start_offset, size, start_addr_ + slot_start_offset); + return Slot(this, slot_start_offset, size, start_addr() + slot_start_offset); + } + + void Clear() { + buf_.clear(); + curr_offset_ = 0; } /*! @@ -130,26 +137,39 @@ class TargetDataLayoutEncoder { * \brief returns current size of the encoder's buffer * \return buffer size */ - size_t buf_size() { + size_t buf_size() const { return buf_.size(); } + TargetPtr start_addr() const { + CHECK_NE(start_addr_.value().uint64(), 0) << "start addr uninitialized"; + return start_addr_; + } + + void set_start_addr(TargetPtr start_addr) { + CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty"; + start_addr_ = TargetPtr(word_size_, + UpperAlignValue(start_addr.value().uint64(), word_size_.bytes())); + } + private: /*! \brief in-memory backing buffer */ std::vector buf_; /*! \brief current offset */ size_t curr_offset_; /*! \brief start address of the encoder in device memory */ - DevPtr start_addr_; + TargetPtr start_addr_; + /*! \brief number of bytes available in device memory */ + size_t capacity_; /*! \brief number of bytes in a word on the target device */ - size_t word_size_; + TargetWordSize word_size_; }; template TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, - DevPtr start_addr) + TargetPtr start_addr) : parent_(parent), start_offset_(start_offset), curr_offset_(0), @@ -158,7 +178,10 @@ TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, template TargetDataLayoutEncoder::Slot::~Slot() { - CHECK(curr_offset_ == size_) << "unwritten space in slot"; + // TODO(weberlo, areusch): this can mask the exception thrown by slot allocation... even though + // that doesn't make sense. + CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset=" + << curr_offset_ << ", size=" << size_; } template @@ -177,7 +200,7 @@ void TargetDataLayoutEncoder::Slot::WriteValue(const T& val) { } template -DevPtr TargetDataLayoutEncoder::Slot::start_addr() { +TargetPtr TargetDataLayoutEncoder::Slot::start_addr() { return start_addr_; } diff --git a/src/runtime/micro/tcl_socket.cc b/src/runtime/micro/tcl_socket.cc index 64dfbf218388..24abe42f786f 100644 --- a/src/runtime/micro/tcl_socket.cc +++ b/src/runtime/micro/tcl_socket.cc @@ -45,6 +45,7 @@ void TclSocket::SendCommand() { const char terminate_token = kCommandTerminateToken; cmd_builder_ << terminate_token; std::string full_cmd = cmd_builder_.str(); + CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1) << "failed to send command"; cmd_builder_.str(std::string()); diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 43ca630f9496..ae293abfacdd 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -38,6 +38,7 @@ #include "../object_internal.h" #include "../../support/ring_buffer.h" #include "../../support/socket.h" +#include "../micro/micro_session.h" namespace tvm { namespace runtime { @@ -1246,43 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() { CHECK_EQ(state_, kRecvCode); } -PackedFunc MicroTimeEvaluator( - PackedFunc pf, - TVMContext ctx, - int number, - int repeat) { - auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable { - TVMRetValue temp; - std::ostringstream os; - // skip first time call, to activate lazy compilation components. - pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - for (int i = 0; i < repeat; ++i) { - double speed = 0.0; - for (int j = 0; j < number; ++j) { - pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - speed += (temp.operator double()) / number; - } - os.write(reinterpret_cast(&speed), sizeof(speed)); - } - std::string blob = os.str(); - TVMByteArray arr; - arr.size = blob.length(); - arr.data = blob.data(); - // return the time. - *rv = arr; - }; - return PackedFunc(ftimer); -} - PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, int min_repeat_ms) { if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { - return MicroTimeEvaluator(pf, ctx, number, repeat); + auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator"); + CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; + return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 6461908f1c4a..adb84e498e5d 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -840,6 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) { const VarNode* v = op->node.as(); CHECK(v); volatile_buf_.insert(v); + } else if (op->attr_key == tir::attr::pragma_import_c) { + const StringImmNode* value = op->value.as(); + CHECK(value != nullptr); + decl_stream << value->value; } this->PrintStmt(op->body); } diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index cbdec6201742..5e5db82eb931 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -23,8 +23,8 @@ #include #include #include -#include "codegen_c_host.h" #include "../build_common.h" +#include "codegen_c_host.h" namespace tvm { namespace codegen { @@ -35,9 +35,10 @@ CodeGenCHost::CodeGenCHost() { void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) { emit_asserts_ = emit_asserts; + declared_globals_.clear(); decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - decl_stream << "extern void* " << module_name_ << " = NULL;\n"; + decl_stream << "void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); } @@ -182,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT( int64_t num_args = end - begin; CHECK_GE(num_args, 0); std::string func_name = s->value; - std::string packed_func_name = GetUniqueName(func_name + "_packed"); - decl_stream << "static void* " << packed_func_name << " = NULL;\n"; + // NOTE: cannot rely on GetUnique for global decl_stream declarations + // because it is reset between AddFunction(). + std::string packed_func_name = func_name + "_packed"; + if (declared_globals_.insert(packed_func_name).second) { + // Still reserve the name among unique names. + CHECK(GetUniqueName(packed_func_name) == packed_func_name) << + "Expected name " << packed_func_name << " to not be taken"; + decl_stream << "static void* " << packed_func_name << " = NULL;\n"; + } this->PrintGetFuncFromBackend(func_name, packed_func_name); this->PrintFuncCall(packed_func_name, num_args); } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { @@ -241,7 +249,7 @@ runtime::Module BuildCHost(IRModule mod) { CodeGenCHost cg; cg.Init(output_ssa, emit_asserts); - for (auto kv : mod->functions) { + for (auto kv : mod->functions) { CHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); @@ -254,7 +262,7 @@ runtime::Module BuildCHost(IRModule mod) { TVM_REGISTER_GLOBAL("target.build.c") .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildCHost(args[0]); - }); + *rv = BuildCHost(args[0]); +}); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index 4f9a0a74511f..bec96860ad47 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -24,9 +24,10 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ #define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ -#include -#include +#include #include +#include "tvm/target/codegen.h" +#include "tvm/tir/expr.h" #include "codegen_c.h" namespace tvm { @@ -53,6 +54,8 @@ class CodeGenCHost final : public CodeGenC { private: std::string module_name_; + /* \brief tracks declared global variables which live despite GetUniqueName */ + std::set declared_globals_; /*! \brief whether to emit asserts in the resulting C code */ bool emit_asserts_; diff --git a/src/target/target.cc b/src/target/target.cc index a72ce1c5b3e4..2cb72a285a41 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name, t->keys_array.push_back("hexagon"); t->device_type = kDLHexagon; } else { - LOG(ERROR) << "Unknown target name " << target_name; + LOG(ERROR) << "Unknown target name " << target_name << "; falling back to stackvm"; return target::stackvm(); } diff --git a/tests/lint/add_asf_header.py b/tests/lint/add_asf_header.py index a44fbd3df1b5..21d25c25e573 100644 --- a/tests/lint/add_asf_header.py +++ b/tests/lint/add_asf_header.py @@ -181,7 +181,9 @@ def add_header(fname, header): skipline = False ext = os.path.splitext(fname)[1][1:] - if lines[0][:2] == "#!": + if not lines: + skipline = False # File is enpty + elif lines[0][:2] == "#!": skipline = True elif lines[0][:2] == "