Skip to content

Commit

Permalink
[AOT] BugFix of workspace calculation
Browse files Browse the repository at this point in the history
Following an investigation from apache#10022,
it turns out that the workspace
calculation currently assumes a single
lowered PrimFunc is produced per
primitive Relay Function.

However, the CMSIS-NN codegen turned out
to be an exception: it produces
multiple calls/PrimFuncs in place
of a single call to a single Relay PrimFunc.

This commit adds changes to workspace
calculation to be done on lowered IRModule.

Additionally, changes the test utils
not to generate any stack allocator code
when USMP is used, to make the tests more
strict.

Change-Id: I5202d9cc7c6a8c00c73791b82df062a8e13dd224
  • Loading branch information
manupak committed Feb 21, 2022
1 parent 2f93780 commit 1f84d4e
Show file tree
Hide file tree
Showing 8 changed files with 199 additions and 66 deletions.
2 changes: 1 addition & 1 deletion apps/microtvm/zephyr_cmsisnn/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ extern float output_storage[12];

extern const size_t output_len;

static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE + 512];
static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE];
tvm_workspace_t app_workspace;

void TVMLogf(const char* msg, ...) {
Expand Down
41 changes: 31 additions & 10 deletions src/relay/backend/aot_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,35 @@ class AOTExecutorCodegen : public MixedModeVisitor {
}
}

/*!
 * \brief Calculate workspace sizes for all PrimFuncs in the lowered IRModule.
 *
 * Workspace sizes are computed per lowered PrimFunc rather than per Relay
 * primitive function, because a single Relay function may lower to multiple
 * PrimFuncs (e.g. the CMSIS-NN codegen).
 *
 * \param lowered_mod Lowered IRModule; must carry the kExecutor attribute.
 * \param function_metadata Existing per-function metadata to be augmented.
 * \return A new map containing an entry for every PrimFunc, with
 *         workspace_sizes set for that PrimFunc's target.
 */
Map<String, FunctionInfo> CalculateWorkspaceSizes(
const IRModule& lowered_mod, const Map<String, FunctionInfo>& function_metadata) {
Executor executor_config = lowered_mod->GetAttr<Executor>(tvm::attr::kExecutor).value();
// Alignment defaults to 16 bytes when the executor does not specify one.
Integer workspace_byte_alignment =
executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
Map<String, FunctionInfo> updated_function_metadata;
for (const auto& kv : lowered_mod->functions) {
GlobalVar global_var = kv.first;
BaseFunc base_func = kv.second;
// Only TIR PrimFuncs have a measurable workspace; skip Relay functions etc.
if (base_func->IsInstance<tir::PrimFuncNode>()) {
tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(base_func);
Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value();
const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment);
if (function_metadata.count(global_var->name_hint)) {
// Carry over existing metadata and record this PrimFunc's workspace size.
updated_function_metadata.Set(global_var->name_hint,
function_metadata[global_var->name_hint]);
// NOTE(review): this mutates the underlying FunctionInfo node in place,
// which is the same node referenced by function_metadata — confirm this
// aliasing is intended.
updated_function_metadata[global_var->name_hint]->workspace_sizes.Set(tgt, ws);
} else {
// PrimFunc not present in the incoming metadata (e.g. produced by an
// external codegen): create a fresh FunctionInfo for it.
FunctionInfo finfo{{{tgt, ws}}, {}, {}, {{tgt, pfunc}}, {}};
updated_function_metadata.Set(global_var->name_hint, finfo);
}
}
}
return updated_function_metadata;
}

/*!
 * \brief Run USMP to plan memory for lowered IRModule
 */
Expand All @@ -694,17 +723,8 @@ class AOTExecutorCodegen : public MixedModeVisitor {
Integer workspace_byte_alignment =
executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
IRModule lowered_mod = mod->ShallowCopy();
function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_);
lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod);
// Update workspace size based on the pool allocations.
for (const auto& kv : function_metadata_) {
if (lowered_mod->ContainGlobalVar(kv.first) &&
lowered_mod->Lookup(kv.first)->IsInstance<tir::PrimFuncNode>()) {
tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(lowered_mod->Lookup(kv.first));
Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value();
const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment);
kv.second->workspace_sizes.Set(tgt, ws);
}
}
Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
lowered_mod->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
backend::FunctionInfo main_func_info =
Expand Down Expand Up @@ -736,6 +756,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
Integer workspace_byte_alignment =
executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
IRModule lowered_mod = mod->ShallowCopy();
function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_);
// Running StorageRewrite just on the main function
tir::PrimFunc tir_main_func =
Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
Expand Down
3 changes: 2 additions & 1 deletion tests/python/contrib/test_ethosu/infra.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,13 @@ def build_source(
def verify_source(
models: List[AOTCompiledTestModel],
accel="ethos-u55-256",
enable_usmp=True,
):
"""
This method verifies the generated source from an NPU module by building it and running on an FVP.
"""
interface_api = "c"
test_runner = create_test_runner(accel)
test_runner = create_test_runner(accel, enable_usmp)
run_and_check(
models,
test_runner,
Expand Down
2 changes: 1 addition & 1 deletion tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_forward_mobilenet_v1(accel_type, enable_usmp):
compiled_models = infra.build_source(
mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp
)
infra.verify_source(compiled_models, accel_type)
infra.verify_source(compiled_models, accel_type, enable_usmp=enable_usmp)


if __name__ == "__main__":
Expand Down
97 changes: 77 additions & 20 deletions tests/python/relay/aot/aot_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,29 +265,56 @@ def emit_data_linkage(output_file, data_linkage):


def emit_main_prologue(
main_file, custom_prologue, workspace_bytes, data_linkage, compiled_models, interface_api
main_file,
custom_prologue,
workspace_bytes,
data_linkage,
compiled_models,
interface_api,
use_stack_allocator=True,
):
# Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}"
if interface_api == "c":
for compiled_model in compiled_models:
model = compiled_model.model
workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE"
workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n"
main_file.write(workspace_define)
emit_data_linkage(main_file, data_linkage)
main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")
main_file.write("tvm_workspace_t app_workspace;\n")
main_file.write(
"""
if use_stack_allocator:
workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}"
if interface_api == "c":
for compiled_model in compiled_models:
model = compiled_model.model
workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE"
# Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n"
main_file.write(workspace_define)
emit_data_linkage(main_file, data_linkage)
main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")
main_file.write("tvm_workspace_t app_workspace;\n")
main_file.write(
"""
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
}
tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
return StackMemoryManager_Free(&app_workspace,ptr);
}
"""
)
else:
# An implementation is not needed for these if the stack allocator is not used
main_file.write(
"""
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
return kTvmErrorFunctionCallNotImplemented;
}
tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
return kTvmErrorFunctionCallNotImplemented;
}
"""
)
main_file.write(
"""
void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); }
void TVMLogf(const char* msg, ...) {
Expand All @@ -296,10 +323,10 @@ def emit_main_prologue(
vfprintf(stdout, msg, args);
va_end(args);
}
TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {}
int main(){\n
"""
"""
)
main_file.write(custom_prologue)

Expand Down Expand Up @@ -511,6 +538,7 @@ def create_main(
data_linkage,
interface_api,
workspace_bytes,
use_stack_allocator=True,
):
file_path = pathlib.Path(f"{output_path}/" + test_name).resolve()
# create header file
Expand All @@ -533,8 +561,10 @@ def create_main(
data_linkage,
compiled_models,
interface_api,
use_stack_allocator,
)
emit_main_init_memory_manager(main_file)
if use_stack_allocator:
emit_main_init_memory_manager(main_file)

if interface_api == "c":
for compiled_model in compiled_models:
Expand Down Expand Up @@ -709,11 +739,14 @@ def run_and_check(
t = tarfile.open(tar_file)
t.extractall(base_path)

workspace_bytes = model.extra_memory_in_bytes
use_usmp = runner.pass_config.get("tir.usmp.enable", False)
if interface_api == "packed" and not use_usmp:
# Interface C APIs does not need compiler generated
# workspace to generate the test application, because
# workspace size is codegen'd as a macro to
# tvmgen_<model_name>.h.
if interface_api != "c":
workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)

workspace_bytes += model.extra_memory_in_bytes
for key in model.inputs:
sanitized_tensor_name = re.sub(r"\W", "_", key)
create_header_file(
Expand All @@ -738,6 +771,10 @@ def run_and_check(
data_linkage,
)

use_usmp = runner.pass_config.get("tir.usmp.enable", False)
# We only need the stack allocator if USMP is not used
use_stack_allocator = not use_usmp

create_main(
"test.c",
models,
Expand All @@ -748,6 +785,7 @@ def run_and_check(
data_linkage,
interface_api,
workspace_bytes,
use_stack_allocator,
)

# Verify that compiles fine
Expand Down Expand Up @@ -868,3 +906,22 @@ def generate_ref_data(mod, input_data, params=None, target="llvm"):
output_tensor_names = main.attrs["output_tensor_names"]

return dict(zip(output_tensor_names, out))


def create_relay_module_and_inputs_from_tflite_file(tflite_model_file):
    """Create a Relay IRModule, random input tensors and params from a
    .tflite model file.

    Parameters
    ----------
    tflite_model_file : str
        Path to the .tflite model to convert.

    Returns
    -------
    mod, inputs, params : the converted Relay IRModule, a dict mapping each
        input tensor name to randomly generated data of the declared shape
        and (integer) dtype, and the converted params.
    """
    with open(tflite_model_file, "rb") as model_file:
        mod, params = convert_to_relay(model_file.read())

    inputs = {}
    for input_var in mod["main"].params:
        tensor_name = str(input_var.name_hint)
        shape = [int(dim) for dim in input_var.type_annotation.shape]
        dtype = str(input_var.type_annotation.dtype)
        # Assumes integer input dtypes (np.iinfo); randint's high bound is
        # exclusive, matching the original behavior.
        type_info = np.iinfo(dtype)
        inputs[tensor_name] = np.random.randint(
            type_info.min, high=type_info.max, size=shape, dtype=dtype
        )

    return mod, inputs, params
85 changes: 78 additions & 7 deletions tests/python/relay/aot/test_crt_aot.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,18 @@
from tvm.relay.testing import byoc
from tvm.relay.op.annotation import compiler_begin, compiler_end
from tvm.relay.backend import Executor, Runtime
from tvm.micro import model_library_format as mlf
from aot_test_utils import (
AOTTestModel,
AOT_DEFAULT_RUNNER,
AOT_CORSTONE300_RUNNER,
AOTDataLinkage,
generate_ref_data,
convert_to_relay,
compile_and_run,
compile_models,
parametrize_aot_options,
create_relay_module_and_inputs_from_tflite_file,
)


Expand Down Expand Up @@ -87,11 +91,16 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
inputs = {"data": input_data}
output_list = generate_ref_data(mod, inputs, params)

data_linkage = None
if test_runner == AOT_CORSTONE300_RUNNER:
data_linkage = AOTDataLinkage(section=".data.tvm", alignment=8)

compile_and_run(
AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params),
test_runner,
interface_api,
use_unpacked_api,
data_linkage=data_linkage,
)


Expand Down Expand Up @@ -501,6 +510,10 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
inputs2 = {"data": input_data}
output_list2 = generate_ref_data(mod2, inputs2, params2)

data_linkage = None
if test_runner == AOT_CORSTONE300_RUNNER:
data_linkage = AOTDataLinkage(section=".data.tvm", alignment=8)

compile_and_run(
[
AOTTestModel(
Expand All @@ -521,6 +534,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
test_runner,
interface_api,
use_unpacked_api,
data_linkage=data_linkage,
)


Expand All @@ -541,13 +555,7 @@ def test_quant_mobilenet_tfl():
"models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
"mobilenet_v1_1.0_224_quant.tflite",
)
with open(tflite_model_file, "rb") as f:
tflite_model_buf = f.read()
data_shape = (1, 224, 224, 3)
in_min, in_max = (0, 255)
data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8")
mod, params = convert_to_relay(tflite_model_buf)
inputs = {"input": data}
mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file)
output_list = generate_ref_data(mod, inputs, params)
compile_and_run(
AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params),
Expand Down Expand Up @@ -843,5 +851,68 @@ def representative_dataset():
assert output_name in source


@pytest.mark.parametrize(
    "workspace_byte_alignment,main_workspace_size",
    [(8, 55296), (16, 55296), (256, 57344)],
)
def test_workspace_calculation(workspace_byte_alignment, main_workspace_size):
    """Check that the main-function workspace size reported in the model
    library format memory map matches the expected value for each
    workspace byte alignment."""
    relay_mod, relay_params = tvm.relay.testing.synthetic.get_workload()
    executor = Executor("aot", {"workspace-byte-alignment": workspace_byte_alignment})
    # Vectorization is disabled because the "c" target does not support it.
    build_config = {"tir.disable_vectorize": True}
    with tvm.transform.PassContext(opt_level=3, config=build_config):
        lib = tvm.relay.build(
            relay_mod,
            "c",
            executor=executor,
            runtime=Runtime("crt"),
            params=relay_params,
        )

    memory_map = mlf._build_function_memory_map(lib.function_metadata)
    assert memory_map["main"][0]["workspace_size_bytes"] == main_workspace_size


@tvm.testing.requires_package("tflite")
@tvm.testing.requires_cmsisnn
def test_workspace_calculation_cmsis_nn():
    """Test workspace calculation with the CMSIS-NN codegen.

    Tested specifically because the CMSIS-NN codegen creates multiple
    PrimFuncs per offloaded Relay function in a non-hierarchical manner,
    so the workspace calculation must cover every lowered PrimFunc."""
    pytest.importorskip("tflite")

    import tvm.relay.testing.tf as tf_testing
    from tvm.relay.op.contrib import cmsisnn

    model_url = (
        "https://storage.googleapis.com/download.tensorflow.org/"
        "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
    )
    tflite_model_file = tf_testing.get_workload_official(
        model_url,
        "mobilenet_v1_1.0_224_quant.tflite",
    )
    relay_mod, _, relay_params = create_relay_module_and_inputs_from_tflite_file(
        tflite_model_file
    )
    relay_mod = cmsisnn.partition_for_cmsisnn(relay_mod, relay_params)
    # Vectorization is disabled because the "c" target does not support it.
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(
            relay_mod,
            "c",
            executor=Executor("aot"),
            runtime=Runtime("crt"),
            params=relay_params,
        )
    memory_map = mlf._build_function_memory_map(lib.function_metadata)
    assert memory_map["main"][0]["workspace_size_bytes"] == 12907328


if __name__ == "__main__":
sys.exit(pytest.main([__file__] + sys.argv[1:]))
Loading

0 comments on commit 1f84d4e

Please sign in to comment.