diff --git a/apps/microtvm/zephyr_cmsisnn/src/main.c b/apps/microtvm/zephyr_cmsisnn/src/main.c index 274bd63d3ea5..31f6cd0cc1d0 100644 --- a/apps/microtvm/zephyr_cmsisnn/src/main.c +++ b/apps/microtvm/zephyr_cmsisnn/src/main.c @@ -34,7 +34,7 @@ extern float output_storage[12]; extern const size_t output_len; -static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE + 512]; +static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE]; tvm_workspace_t app_workspace; void TVMLogf(const char* msg, ...) { diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 3694d6bcef95..c6207e209be4 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -686,6 +686,35 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } + /*! + * \brief Calculate workspace sizes for PrimFuncs in the IRModule + */ + Map<String, FunctionInfo> CalculateWorkspaceSizes( + const IRModule& lowered_mod, const Map<String, FunctionInfo>& function_metadata) { + Executor executor_config = lowered_mod->GetAttr<Executor>(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16); + Map<String, FunctionInfo> updated_function_metadata; + for (const auto& kv : lowered_mod->functions) { + GlobalVar global_var = kv.first; + BaseFunc base_func = kv.second; + if (base_func->IsInstance<tir::PrimFuncNode>()) { + tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(base_func); + Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + if (function_metadata.count(global_var->name_hint)) { + updated_function_metadata.Set(global_var->name_hint, + function_metadata[global_var->name_hint]); + updated_function_metadata[global_var->name_hint]->workspace_sizes.Set(tgt, ws); + } else { + FunctionInfo finfo{{{tgt, ws}}, {}, {}, {{tgt, pfunc}}, {}}; + updated_function_metadata.Set(global_var->name_hint, finfo); + } + } + } + return updated_function_metadata; + } + /*! 
* brief Run USMP to plan memory for lowered IRModule */ @@ -694,17 +723,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { Integer workspace_byte_alignment = executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16); IRModule lowered_mod = mod->ShallowCopy(); + function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_); lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); - // Update workspace size based on the pool allocations. - for (const auto& kv : function_metadata_) { - if (lowered_mod->ContainGlobalVar(kv.first) && - lowered_mod->Lookup(kv.first)->IsInstance<tir::PrimFuncNode>()) { - tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(lowered_mod->Lookup(kv.first)); - Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value(); - const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); - kv.second->workspace_sizes.Set(tgt, ws); - } - } Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos = lowered_mod->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs); backend::FunctionInfo main_func_info = @@ -736,6 +756,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { Integer workspace_byte_alignment = executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16); IRModule lowered_mod = mod->ShallowCopy(); + function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_); // Running StorageRewrite just on the main function tir::PrimFunc tir_main_func = Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 4bdaef7a74ca..b355c440e006 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -242,12 +242,13 @@ def build_source( def verify_source( models: List[AOTCompiledTestModel], accel="ethos-u55-256", + enable_usmp=True, ): """ This method verifies the generated source from an NPU module by building it and running on an FVP. 
""" interface_api = "c" - test_runner = create_test_runner(accel) + test_runner = create_test_runner(accel, enable_usmp) run_and_check( models, test_runner, diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index e9c6da5be18a..7e3140ff514a 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -71,7 +71,7 @@ def test_forward_mobilenet_v1(accel_type, enable_usmp): compiled_models = infra.build_source( mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp ) - infra.verify_source(compiled_models, accel_type) + infra.verify_source(compiled_models, accel_type, enable_usmp=enable_usmp) if __name__ == "__main__": diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 451071b69dea..0f9736c4900b 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -265,21 +265,28 @@ def emit_data_linkage(output_file, data_linkage): def emit_main_prologue( - main_file, custom_prologue, workspace_bytes, data_linkage, compiled_models, interface_api + main_file, + custom_prologue, + workspace_bytes, + data_linkage, + compiled_models, + interface_api, + use_stack_allocator=True, ): - # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. 
- workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}" - if interface_api == "c": - for compiled_model in compiled_models: - model = compiled_model.model - workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE" - workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" - main_file.write(workspace_define) - emit_data_linkage(main_file, data_linkage) - main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") - main_file.write("tvm_workspace_t app_workspace;\n") - main_file.write( - """ + if use_stack_allocator: + workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}" + if interface_api == "c": + for compiled_model in compiled_models: + model = compiled_model.model + workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE" + # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. + workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" + main_file.write(workspace_define) + emit_data_linkage(main_file, data_linkage) + main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") + main_file.write("tvm_workspace_t app_workspace;\n") + main_file.write( + """ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr); } @@ -287,7 +294,25 @@ def emit_main_prologue( tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { return StackMemoryManager_Free(&app_workspace,ptr); } + """ + ) + else: + # An implementation is not needed for these if the stack allocator is not used + main_file.write( + """ +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return kTvmErrorNoError; +} +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return kTvmErrorNoError; +} + + """ + ) + main_file.write( + """ + void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); } void TVMLogf(const char* msg, ...) 
{ @@ -296,10 +321,10 @@ def emit_main_prologue( vfprintf(stdout, msg, args); va_end(args); } - + TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {} int main(){\n -""" + """ ) main_file.write(custom_prologue) @@ -511,6 +536,7 @@ def create_main( data_linkage, interface_api, workspace_bytes, + use_stack_allocator=True, ): file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() # create header file @@ -533,8 +559,10 @@ def create_main( data_linkage, compiled_models, interface_api, + use_stack_allocator, ) - emit_main_init_memory_manager(main_file) + if use_stack_allocator: + emit_main_init_memory_manager(main_file) if interface_api == "c": for compiled_model in compiled_models: @@ -709,11 +737,14 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - workspace_bytes = model.extra_memory_in_bytes - use_usmp = runner.pass_config.get("tir.usmp.enable", False) - if interface_api == "packed" and not use_usmp: + # The C interface API does not need the compiler-generated + # workspace to generate the test application, because + # the workspace size is codegen'd as a macro to + # tvmgen_<model_name>.h. 
+ if interface_api != "c": workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) + workspace_bytes += model.extra_memory_in_bytes for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) create_header_file( @@ -738,6 +769,10 @@ def run_and_check( data_linkage, ) + use_usmp = runner.pass_config.get("tir.usmp.enable", False) + # We only need the stack allocator if USMP is not used + use_stack_allocator = not use_usmp + create_main( "test.c", models, @@ -748,6 +783,7 @@ def run_and_check( data_linkage, interface_api, workspace_bytes, + use_stack_allocator, ) # Verify that compiles fine @@ -868,3 +904,22 @@ def generate_ref_data(mod, input_data, params=None, target="llvm"): output_tensor_names = main.attrs["output_tensor_names"] return dict(zip(output_tensor_names, out)) + + +def create_relay_module_and_inputs_from_tflite_file(tflite_model_file): + """A helper function to create a Relay IRModule with inputs + and params from a tflite file""" + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + mod, params = convert_to_relay(tflite_model_buf) + + inputs = dict() + for param in mod["main"].params: + name = str(param.name_hint) + data_shape = [int(i) for i in param.type_annotation.shape] + dtype = str(param.type_annotation.dtype) + in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype) + inputs[name] = data + + return mod, inputs, params diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index f4f0806dca52..87ebb5da3ba6 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -28,14 +28,18 @@ from tvm.relay.testing import byoc from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.backend import Executor, Runtime +from tvm.micro import model_library_format as mlf from aot_test_utils import ( AOTTestModel, AOT_DEFAULT_RUNNER, + 
AOT_CORSTONE300_RUNNER, + AOTDataLinkage, generate_ref_data, convert_to_relay, compile_and_run, compile_models, parametrize_aot_options, + create_relay_module_and_inputs_from_tflite_file, ) @@ -87,11 +91,16 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), inputs = {"data": input_data} output_list = generate_ref_data(mod, inputs, params) + data_linkage = None + if test_runner == AOT_CORSTONE300_RUNNER: + data_linkage = AOTDataLinkage(section=".data.tvm", alignment=8) + compile_and_run( AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), test_runner, interface_api, use_unpacked_api, + data_linkage=data_linkage, ) @@ -501,6 +510,10 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), inputs2 = {"data": input_data} output_list2 = generate_ref_data(mod2, inputs2, params2) + data_linkage = None + if test_runner == AOT_CORSTONE300_RUNNER: + data_linkage = AOTDataLinkage(section=".data.tvm", alignment=8) + compile_and_run( [ AOTTestModel( @@ -521,6 +534,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), test_runner, interface_api, use_unpacked_api, + data_linkage=data_linkage, ) @@ -541,13 +555,7 @@ def test_quant_mobilenet_tfl(): "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", "mobilenet_v1_1.0_224_quant.tflite", ) - with open(tflite_model_file, "rb") as f: - tflite_model_buf = f.read() - data_shape = (1, 224, 224, 3) - in_min, in_max = (0, 255) - data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") - mod, params = convert_to_relay(tflite_model_buf) - inputs = {"input": data} + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compile_and_run( AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), @@ -843,5 +851,68 @@ def representative_dataset(): assert output_name in source 
+@pytest.mark.parametrize( + "workspace_byte_alignment,main_workspace_size", + [ + (8, 55296), + (16, 55296), + (256, 57344), + ], +) +def test_workspace_calculation(workspace_byte_alignment, main_workspace_size): + mod, params = tvm.relay.testing.synthetic.get_workload() + target = "c" + runtime = Runtime("crt") + executor = Executor( + "aot", + { + "workspace-byte-alignment": workspace_byte_alignment, + }, + ) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + + mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata) + assert mlf_memory_map["main"][0]["workspace_size_bytes"] == main_workspace_size + + +@tvm.testing.requires_package("tflite") +@tvm.testing.requires_cmsisnn +def test_workspace_calculation_cmsis_nn(): + """This tests cmsis_nn codegen for workspace calculation. + This is tested specially because cmsis-nn codegen creates + multiple PrimFuncs per offloaded relay function in a non + -hierarchical manner.""" + pytest.importorskip("tflite") + + import tvm.relay.testing.tf as tf_testing + from tvm.relay.op.contrib import cmsisnn + + target = "c" + runtime = Runtime("crt") + executor = Executor("aot") + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/" + "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", + ) + mod, _, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod = cmsisnn.partition_for_cmsisnn(mod, params) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata) + assert mlf_memory_map["main"][0]["workspace_size_bytes"] == 12907328 + + if __name__ 
== "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py index a27609cc07ad..73b34700ee27 100644 --- a/tests/python/relay/aot/test_crt_aot_usmp.py +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -39,6 +39,7 @@ compile_models, parametrize_aot_options, run_and_check, + create_relay_module_and_inputs_from_tflite_file, ) @@ -202,23 +203,6 @@ def test_byoc_microtvm(merge_compiler_regions): ) -def _get_relay_module_and_inputs_from_tflite_file(tflite_model_file): - with open(tflite_model_file, "rb") as f: - tflite_model_buf = f.read() - mod, params = convert_to_relay(tflite_model_buf) - - inputs = dict() - for param in mod["main"].params: - name = str(param.name_hint) - data_shape = [int(i) for i in param.type_annotation.shape] - dtype = str(param.type_annotation.dtype) - in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max) - data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype) - inputs[name] = data - - return mod, inputs, params - - MOBILENET_V1_URL = ( "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", "mobilenet_v1_1.0_224_quant.tflite", @@ -253,7 +237,7 @@ def test_tflite_model_u1_usecase(model_url, usmp_algo, workspace_size): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compiled_test_mods = compile_models( @@ -324,7 +308,7 @@ def test_tflite_model_u3_usecase_single_external_pool(model_url, usmp_algo): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, 
inputs, params) compiled_test_mods = compile_models( @@ -384,7 +368,7 @@ def test_tflite_model_u3_usecase_two_external_pools(model_url, usmp_algo): model_url[0], model_url[1], ) - mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file) + mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file) output_list = generate_ref_data(mod, inputs, params) compiled_test_mods = compile_models( @@ -438,14 +422,14 @@ def test_tflite_model_u2_usecase_two_models_with_a_single_external_pool(model_ur model_urls[0][0], model_urls[0][1], ) - mod1, inputs1, params1 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file1) + mod1, inputs1, params1 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file1) output_list1 = generate_ref_data(mod1, inputs1, params1) tflite_model_file2 = tf_testing.get_workload_official( model_urls[1][0], model_urls[1][1], ) - mod2, inputs2, params2 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file2) + mod2, inputs2, params2 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file2) output_list2 = generate_ref_data(mod2, inputs2, params2) compiled_test_mods = compile_models( diff --git a/tests/scripts/task_demo_microtvm.sh b/tests/scripts/task_demo_microtvm.sh index 9ed9c671acc0..b5c18ec9e757 100755 --- a/tests/scripts/task_demo_microtvm.sh +++ b/tests/scripts/task_demo_microtvm.sh @@ -19,8 +19,7 @@ set -euxo pipefail pushd apps/microtvm/zephyr_cmsisnn -# Demo tests are disabled here due to https://github.com/apache/tvm/issues/10312 -# timeout 5m ./run_demo.sh + timeout 5m ./run_demo.sh popd pushd apps/microtvm/ethosu @@ -28,6 +27,6 @@ FVP_PATH="/opt/arm/FVP_Corstone_SSE-300_Ethos-U55" CMAKE_PATH="/opt/arm/cmake/bin/cmake" FREERTOS_PATH="/opt/freertos/FreeRTOSv202112.00" -# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH -# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH + 
timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH + timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH popd