From 454fa3bfea7f73aa2edfd16737d35d1fe0582b8e Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Thu, 9 Jul 2020 16:46:47 +0100
Subject: [PATCH] [BYOC][Contrib] Arm Compute Library integration

Arm Compute Library (ACL) integration using the BYOC infrastructure. This will enable offloading select operators from a relay graph to ACL so we can achieve faster inference times on Arm CPUs due to hand-crafted optimized routines. The PR adds initial support for offloading FP32 conv2d, maxpool2d and reshape to ACL. ACL codegen is used to generate a JSON representation of an operator or 'ACL layer'; the ACL runtime then uses this representation to construct a layer, cache it and create a packed function for the graph runtime to call into.

RFC here: https://discuss.tvm.ai/t/rfc-byoc-arm-compute-library-integration/7082

Change-Id: If756dcea787ea346b1508e9a191b7eed7bd02b7f
---
 CMakeLists.txt | 3 +
 cmake/config.cmake | 12 +
 cmake/modules/contrib/ACL.cmake | 68 +++++
 python/tvm/relay/op/contrib/__init__.py | 1 +
 python/tvm/relay/op/contrib/acl.py | 125 ++++++++
 src/relay/backend/contrib/acl/README.md | 111 +++++++
 src/relay/backend/contrib/acl/acl_api.cc | 72 +++++
 src/relay/backend/contrib/acl/acl_api.h | 172 +++++++++++
 src/relay/backend/contrib/acl/codegen.cc | 287 ++++++++++++++++++
 src/relay/backend/contrib/acl/codegen_acl.h | 198 ++++++++++++
 src/runtime/contrib/acl/acl_allocator.cc | 73 +++++
 src/runtime/contrib/acl/acl_allocator.h | 141 +++++++++
 src/runtime/contrib/acl/acl_kernel.cc | 157 ++++++++++
 src/runtime/contrib/acl/acl_kernel.h | 129 ++++++++
 src/runtime/contrib/acl/acl_runtime.cc | 233 ++++++++++++++
 src/runtime/contrib/acl/acl_utils.cc | 65 ++++
 src/runtime/contrib/acl/acl_utils.h | 87 ++++++
 tests/python/contrib/test_acl/__init__.py | 17 ++
 .../python/contrib/test_acl/infrastructure.py | 162 ++++++++++
 tests/python/contrib/test_acl/test_conv2d.py | 202 ++++++++++++
 tests/python/contrib/test_acl/test_network.py | 76 +++++
 tests/python/contrib/test_acl/test_pooling.py | 121 ++++++++
 tests/python/contrib/test_acl/test_reshape.py | 91 ++++++
 tests/python/contrib/test_acl/test_runtime.py | 97 ++++++
 24 files changed, 2700 insertions(+)
 create mode 100644 cmake/modules/contrib/ACL.cmake
 create mode 100644 python/tvm/relay/op/contrib/acl.py
 create mode 100644 src/relay/backend/contrib/acl/README.md
 create mode 100644 src/relay/backend/contrib/acl/acl_api.cc
 create mode 100644 src/relay/backend/contrib/acl/acl_api.h
 create mode 100644 src/relay/backend/contrib/acl/codegen.cc
 create mode 100644 src/relay/backend/contrib/acl/codegen_acl.h
 create mode 100644 src/runtime/contrib/acl/acl_allocator.cc
 create mode 100644 src/runtime/contrib/acl/acl_allocator.h
 create mode 100644 src/runtime/contrib/acl/acl_kernel.cc
 create mode 100644 src/runtime/contrib/acl/acl_kernel.h
 create mode 100644 src/runtime/contrib/acl/acl_runtime.cc
 create mode 100644 src/runtime/contrib/acl/acl_utils.cc
 create mode 100644 src/runtime/contrib/acl/acl_utils.h
 create mode 100644 tests/python/contrib/test_acl/__init__.py
 create mode 100644 tests/python/contrib/test_acl/infrastructure.py
 create mode 100644 tests/python/contrib/test_acl/test_conv2d.py
 create mode 100644 tests/python/contrib/test_acl/test_network.py
 create mode 100644 tests/python/contrib/test_acl/test_pooling.py
 create mode 100644 tests/python/contrib/test_acl/test_reshape.py
 create mode 100644 tests/python/contrib/test_acl/test_runtime.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aaddebdfe3c57..bdb38e3fb8058 100644
--- a/CMakeLists.txt
+++ 
b/CMakeLists.txt @@ -70,6 +70,8 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF) tvm_option(USE_TFLITE "Build with tflite support" OFF) tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none) tvm_option(USE_COREML "Build with coreml support" OFF) +tvm_option(USE_ACL "Build with Arm Compute Library" OFF) +tvm_option(USE_ACL_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF) if(USE_CPP_RPC AND UNIX) message(FATAL_ERROR "USE_CPP_RPC is only supported with WIN32. Use the Makefile for non-Windows.") @@ -327,6 +329,7 @@ include(cmake/modules/contrib/HybridDump.cmake) include(cmake/modules/contrib/TFLite.cmake) include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/CoreML.cmake) +include(cmake/modules/contrib/ACL.cmake) include(CheckCXXCompilerFlag) if(NOT MSVC) diff --git a/cmake/config.cmake b/cmake/config.cmake index 1b196922ca059..e59690da2c04e 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -184,6 +184,18 @@ set(USE_SORT ON) # Whether use MKL-DNN (DNNL) codegen set(USE_DNNL_CODEGEN OFF) +# Whether to use ACL (Arm Compute Library) codegen +# We provide 2 separate flags since we cannot build the ACL runtime on x86. +# This is useful for cases where you want to cross-compile a relay graph +# on x86 then run on AArch. +# +# USE_ACL - Support for compiling a relay graph offloading supported +# operators to ACL. OFF/ON +# USE_ACL_GRAPH_RUNTIME - Run ACL annotated functions via the ACL +# runtime. OFF/ON/"path/to/ACL" +set(USE_ACL OFF) +set(USE_ACL_GRAPH_RUNTIME OFF) + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/ACL.cmake b/cmake/modules/contrib/ACL.cmake new file mode 100644 index 0000000000000..94db11d1fdf05 --- /dev/null +++ b/cmake/modules/contrib/ACL.cmake @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# We separate the codegen and runtime build since ACL can only be built +# for AArch. In the world where we take the cross compilation approach, +# which is common with arm devices, we need to be able to cross-compile +# a relay graph on x86 for AArch and then run the graph on AArch. +if(USE_ACL) + file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/acl/*.cc) + file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/acl/acl_runtime.cc) + list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE}) + message(STATUS "Build with ACL support...") +endif() + +if(USE_ACL_GRAPH_RUNTIME) + set(ACL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/acl) + # Detect custom ACL path. 
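+  # A path may be given instead of ON, e.g. set(USE_ACL_GRAPH_RUNTIME /path/to/acl)
+  # (hypothetical path); headers and libraries are then searched for under it.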
+ if (NOT USE_ACL_GRAPH_RUNTIME STREQUAL "ON") + set(ACL_PATH ${USE_ACL_GRAPH_RUNTIME}) + endif() + + file(GLOB ACL_CONTRIB_SRC src/runtime/contrib/acl/*) + file(GLOB ACL_API src/relay/backend/contrib/acl/acl_api.cc) + + set(ACL_INCLUDE_DIRS ${ACL_PATH}/include ${ACL_PATH}) + include_directories(${ACL_INCLUDE_DIRS}) + + find_library(EXTERN_ACL_COMPUTE_LIB + NAMES arm_compute libarm_compute + HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build" + ) + find_library(EXTERN_ACL_COMPUTE_CORE_LIB + NAMES arm_compute_core libarm_compute_core + HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build" + ) + find_library(EXTERN_ACL_COMPUTE_GRAPH_LIB + NAMES arm_compute_graph libarm_compute_graph + HINTS "${ACL_PATH}" "${ACL_PATH}/lib" "${ACL_PATH}/build" + ) + + list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_LIB}) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_CORE_LIB}) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) + list(APPEND RUNTIME_SRCS ${ACL_CONTRIB_SRC}) + list(APPEND RUNTIME_SRCS ${ACL_API}) + message(STATUS "Build with ACL graph runtime support: " + ${EXTERN_ACL_COMPUTE_LIB} ", \n" + ${EXTERN_ACL_COMPUTE_CORE_LIB} ", \n" + ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) + + # Set flag to detect ACL graph runtime support. + add_definitions(-DTVM_GRAPH_RUNTIME_ACL) +endif() diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 0e1b4b024a5aa..fad7183d92987 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -18,5 +18,6 @@ """Contrib modules.""" from .register import get_pattern_table, register_pattern_table +from .acl import * from .dnnl import * from .coreml import * diff --git a/python/tvm/relay/op/contrib/acl.py b/python/tvm/relay/op/contrib/acl.py new file mode 100644 index 0000000000000..8207575460450 --- /dev/null +++ b/python/tvm/relay/op/contrib/acl.py @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""ACL library supported operators.""" +import tvm +from tvm.relay import transform +from tvm.relay.build_module import bind_params_by_name + +from ...dataflow_pattern import wildcard, is_op, is_constant +from .register import register_pattern_table + + +def is_acl_runtime_present(): + """Check if the ACL graph runtime is present. + + Returns + ------- + ret: bool + True if present, False if not. + """ + return tvm.get_global_func("relay.op.is_acl_runtime_enabled", True) + + +def partition_for_acl(mod, params=None): + """Partition the graph greedily offloading supported + operators to ACL. + + Parameters + ---------- + mod : Module + The module to run passes on. + params : dict[str, NDArray] + Constant input parameters. 
+ + Returns + ------- + ret : annotated and partitioned module. + """ + if params: + mod['main'] = bind_params_by_name(mod['main'], params) + + seq = tvm.transform.Sequential([transform.MergeComposite(pattern_table()), + transform.AnnotateTarget('acl'), + transform.PartitionGraph()]) + + return seq(mod) + + +@register_pattern_table("acl") +def pattern_table(): + """Get the ACL pattern table.""" + + def conv_pattern(): + """Create a convolution pattern. + + Returns + ------- + pattern : dataflow_pattern.AltPattern + Denotes the convolution pattern. + """ + pattern = is_op('nn.pad')(wildcard()) | wildcard() + pattern = is_op('nn.conv2d')(pattern, is_constant()) + pattern = pattern.optional(lambda x: is_op('nn.bias_add')(x, is_constant())) + pattern = pattern.optional(is_op('nn.relu')) + return pattern + + def check_conv(extract): + """Check conv pattern is supported by ACL.""" + call = extract + while call.op.name != "nn.conv2d": + call = call.args[0] + return conv2d(call.attrs, call.args) + + return [('acl.conv2d', conv_pattern(), check_conv)] + + +def _register_external_op_helper(op_name, supported=True): + @tvm.ir.register_op_attr(op_name, "target.acl") + def _func_wrapper(attrs, args): + return supported + + return _func_wrapper + + +_register_external_op_helper("reshape") + + +@tvm.ir.register_op_attr("nn.conv2d", "target.acl") +def conv2d(attrs, args): + """Check if the external ACL codegen for conv2d should be used.""" + + # ACL only supports group size of 1 + if attrs.groups != 1: + return False + + # ACL only supports NHWC layout + if attrs.data_layout != "NHWC": + return False + + return True + + +@tvm.ir.register_op_attr("nn.max_pool2d", "target.acl") +def max_pool2d(attrs, args): + """Check if the external ACL codegen for maxpool2d should be used.""" + + # ACL only supports NHWC layout + if attrs.layout != "NHWC": + return False + + return True diff --git a/src/relay/backend/contrib/acl/README.md b/src/relay/backend/contrib/acl/README.md new file mode 100644 index 0000000000000..111f64c2c1f28 --- /dev/null +++ b/src/relay/backend/contrib/acl/README.md @@ -0,0 +1,111 @@ + + +# Relay Arm® Compute Library Integration +Arm Compute Library (ACL) is an open source project that provides accelerated kernels for Arm CPU's +and GPU's. Currently the integration offloads operators to ACL to use hand-crafted assembler +routines in the library. By offloading select operators from a relay graph to ACL we can achieve +a performance boost on such devices. + +## Building with ACL support +The current implementation has two separate build options in cmake. The reason for this split is +because ACL cannot be used on an x86 machine. However, we still want to be able compile an ACL +runtime module on an x86 machine. + +* USE_ACL=ON/OFF - Enabling this flag will add support for compiling an ACL runtime module. +* USE_GRAPH_RUNTIME_ACL=ON/OFF/path-to-acl - Enabling this flag will allow the graph runtime to +compute the ACL offloaded functions. + +These flags can be used in different scenarios depending on your setup. For example, if you want +to compile ACL on an x86 machine and then run the module on a remote Arm device via RPC, you will +need to use USE_ACL=ON on the x86 machine and USE_GRAPH_RUNTIME_ACL=ON on the remote AArch64 +device. +## Usage +_Note:_ this may not stay up-to-date with changes to the API. +1. Create a relay graph. This may be a single operator or a whole graph. The intention is that any +relay graph can be input. 
The ACL integration will only pick supported operators to be offloaded +whilst the rest will be computed via TVM. (For this example we will use a single +max_pool2d operator). + ``` + import tvm + from tvm import relay + + data_type = "float32" + data_shape = (1, 14, 14, 512) + strides = (2, 2) + padding = (0, 0, 0, 0) + pool_size = (2, 2) + layout = "NHWC" + output_shape = (1, 7, 7, 512) + + data = relay.var('data', shape=data_shape, dtype=data_type) + out = relay.nn.max_pool2d(data, pool_size=pool_size, strides=strides, + layout=layout, padding=padding) + module = tvm.IRModule.from_expr(out) + ``` +2. Annotate and partition the graph for ACL. + ``` + module = relay.transform.AnnotateTarget("acl")(module) + module = relay.transform.PartitionGraph()(module) + ``` +3. Build the Relay graph. + ``` + target = "llvm -target=aarch64-linux-gnu -mattr=+neon" + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, params = relay.build(module, target=target) + ``` +4. Export the module. + ``` + lib_path = '~/lib_acl.so' + cross_compile = 'aarch64-linux-gnu-c++' + lib.export_library(lib_path, cc=cross_compile) + ``` + 5. Run Inference. This must be on an Arm device. If compiling on x86 device and running on aarch64 + consider using the RPC mechanism. + ``` + tvm.runtime.load_module('lib_acl.so') + gen_module = tvm.contrib.graph_runtime.create(json, lib, ctx) + + d_data = np.random.uniform(0, 1, data_shape).astype(data_type) + map_inputs = {'data': d_data} + gen_module.map_inputs(**map_inputs) + gen_module.run() + ``` + +## More examples +The example above only shows a basic example of how ACL can be used for offloading a single +Maxpool2D. If you would like to see more examples for each implemented operator and for +networks refer to the tests: `tests/python/contrib/test_acl`. Here you can modify +`infrastructure.py` to use the remote device you have setup. + +## Adding a new operator +Adding a new operator requires changes to a series of places. This section will give a hint on +what needs to be changed and where, it will not however dive into the complexities for an +individual operator. This is left to the developer. + +There are a series of files we need to make changes to: +* `python/relay/op/contrib/acl.py` In this file we define the operators we wish to offload using the +`op.register` decorator. This will mean the annotation pass recognizes this operator as ACL +offloadable. +* `src/relay/backend/contrib/acl/codegen_acl.h` Implement `Make[OpName]` method. This is where we +declare how the operator should be represented by JSON. This will be used to create the ACL module. +* `src/runtime/contrib/acl/acl_kernel.h` Implement `Create[OpName]Layer` method. This is where we +define how the JSON representation can be used to create an ACL function. We simply define how to +translate from the JSON representation to ACL API. +* `tests/python/contrib/test_acl` Add unit tests for the given operator. diff --git a/src/relay/backend/contrib/acl/acl_api.cc b/src/relay/backend/contrib/acl/acl_api.cc new file mode 100644 index 0000000000000..5e3aa9c5679ef --- /dev/null +++ b/src/relay/backend/contrib/acl/acl_api.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/acl/acl_api.cc + * \brief A common JSON interface between relay and the ACL runtime module. + */ + +#include "acl_api.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace acl { + +std::pair> DeserializeSubgraph( + std::string* serialized_function) { + dmlc::MemoryStringStream mstrm(serialized_function); + dmlc::Stream* strm = &mstrm; + std::string serialized_json; + strm->Read(&serialized_json); + std::istringstream is(serialized_json); + dmlc::JSONReader reader(&is); + JSONSubGraph function; + function.Load(&reader); + std::vector constants; + size_t const_count; + strm->Read(&const_count); + for (size_t i = 0; i < const_count; i++) { + runtime::NDArray temp; + temp.Load(strm); + constants.push_back(temp); + } + return std::make_pair(function, constants); +} + +std::string SerializeSubgraph(const JSONSubGraph& subgraph, + const std::vector& constants) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + subgraph.Save(&writer); + std::string serialized_subgraph; + dmlc::MemoryStringStream mstrm(&serialized_subgraph); + dmlc::Stream* strm = &mstrm; + strm->Write(os.str()); + strm->Write(constants.size()); + for (const auto& it : constants) { + it.Save(strm); + } + return serialized_subgraph; +} + +} // namespace acl +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/acl/acl_api.h b/src/relay/backend/contrib/acl/acl_api.h new file mode 100644 index 0000000000000..60ea03e5b3fe4 --- /dev/null +++ b/src/relay/backend/contrib/acl/acl_api.h @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/acl/acl_api.h + * \brief A common JSON interface between relay and the ACL runtime module. 
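+ *
+ * The JSON describes a single operator ("node") together with its input/output
+ * tensors and attributes; constant NDArrays are serialized after the JSON when a
+ * subgraph is saved.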
+ */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_ACL_ACL_API_H_ +#define TVM_RELAY_BACKEND_CONTRIB_ACL_ACL_API_H_ + +#include +#include +#include + +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace acl { + +DMLC_JSON_ENABLE_ANY(std::vector, IntVector); +DMLC_JSON_ENABLE_ANY(int, Int); +DMLC_JSON_ENABLE_ANY(size_t, Size_t); +DMLC_JSON_ENABLE_ANY(std::string, String); + +/*! + * JSON interface for ACL tensor. + */ +class JSONTensor { + public: + JSONTensor() = default; + explicit JSONTensor(std::vector shape) : type("var"), shape(std::move(shape)) {} + + JSONTensor(std::string type, std::vector shape) + : type(std::move(type)), shape(std::move(shape)) {} + + void Save(dmlc::JSONWriter* writer) const { + writer->BeginObject(); + writer->WriteObjectKeyValue("type", type); + writer->WriteObjectKeyValue("shape", shape); + writer->EndObject(); + } + + void Load(dmlc::JSONReader* reader) { + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("type", &type); + helper.DeclareField("shape", &shape); + helper.ReadAllFields(reader); + } + + /*! \brief The type of the tensor var/const. */ + std::string type; + /*! \brief The shape of the tensor. */ + std::vector shape; +}; + +/*! + * JSON interface for an ACL operator. + */ +class JSONOp { + public: + JSONOp() = default; + explicit JSONOp(std::string name) : name(std::move(name)) {} + + void Save(dmlc::JSONWriter* writer) const { + auto op_attrs = attrs; + op_attrs["num_inputs"] = dmlc::any(inputs.size()); + op_attrs["num_outputs"] = dmlc::any(outputs.size()); + writer->BeginObject(); + writer->WriteObjectKeyValue("name", name); + writer->WriteObjectKeyValue("inputs", inputs); + writer->WriteObjectKeyValue("outputs", outputs); + writer->WriteObjectKeyValue("attrs", op_attrs); + writer->EndObject(); + } + + void Load(dmlc::JSONReader* reader) { + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("name", &name); + helper.DeclareField("inputs", &inputs); + helper.DeclareField("outputs", &outputs); + helper.DeclareField("attrs", &attrs); + helper.ReadAllFields(reader); + } + + /*! The name of the operator. */ + std::string name; + /*! The required variable inputs to the operator. */ + std::vector inputs; + /*! The required outputs to the operator. */ + std::vector outputs; + /*! The attributes of the operator e.g. padding, strides, etc. */ + std::unordered_map attrs; +}; + +/*! + * JSON interface for a series of ACL ops. + */ +class JSONSubGraph { + public: + JSONSubGraph() = default; + explicit JSONSubGraph(JSONOp op) : op(std::move(op)) {} + + void Save(dmlc::JSONWriter* writer) const { + writer->BeginObject(); + writer->WriteObjectKeyValue("node", op); + writer->EndObject(); + } + + void Load(dmlc::JSONReader* reader) { + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("node", &op); + helper.ReadAllFields(reader); + } + + /*! \brief JSON op to be serialized. */ + JSONOp op; +}; + +/*! + * \brief Deserialize a function (or subgraph). The function is serialized in the + * format: Serialized JSON (using dmlc::JSONWriter), number of constants, serialized + * NDArray constants. + * + * \param serialized_function Pointer to a serialized function (or subgraph). + * \return A pair consisting of deserialized json subgraph object and deserialized + * NDArray. + */ +std::pair> DeserializeSubgraph( + std::string* serialized_function); + +/*! + * \brief Serialize a single subgraph which can be saved to disk. 
+ * + * A subgraph is serialized so that the output is as follows: + * - Serialized JSON. + * - Number of constant tensors. + * - Serialized constant tensors. + * + * \param subgraph JSON subgraph representation. + * \constants Serialized JSON constants. + */ +std::string SerializeSubgraph(const JSONSubGraph& subgraph, + const std::vector& constants); + +} // namespace acl +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_ACL_ACL_API_H_ diff --git a/src/relay/backend/contrib/acl/codegen.cc b/src/relay/backend/contrib/acl/codegen.cc new file mode 100644 index 0000000000000..1c61a6b09fce4 --- /dev/null +++ b/src/relay/backend/contrib/acl/codegen.cc @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/acl/codegen_acl.cc + * \brief Implementation of the Relay -> ACL JSON schema compiler. + */ +#include +#include + +#include "../../utils.h" +#include "codegen_acl.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace acl { + +void CodegenACL::VisitLeaf(const Expr& expr) { + if (expr->IsInstance()) { + const auto* constant_node = expr.as(); + this->constants_.push_back(constant_node->data); + } else if (!expr->IsInstance()) { + // Don't enter functions + MixedModeVisitor::VisitLeaf(expr); + } +} + +void CodegenACL::VisitExpr_(const CallNode* node) { + Call call = GetRef(node); + if (this->layer_table_.find(call) == this->layer_table_.end()) { + for (const auto& arg : call->args) { + this->VisitExpr(arg); + } + // Determine call -> ACL mapping + JSONOp layer; + if (IsAclFunc(node, "acl.conv2d") || backend::IsOp(node, "nn.conv2d")) { + layer = MakeConvolutionOp(call); + } else if (backend::IsOp(node, "nn.max_pool2d")) { + layer = MakeMaxPool2DOp(call); + } else if (backend::IsOp(node, "reshape")) { + layer = MakeReshapeOp(call); + } else { + LOG(FATAL) << "Unsupported op: " << AsText(node->op, false); + } + this->layer_table_[call] = layer; + } +} + +runtime::Module CodegenACL::CreateRuntimeModule(const ObjectRef& ref) { + std::vector> serialized_functions; + if (ref->IsInstance()) { + IRModule mod; + Function func = Downcast(ref); + auto name_node = func->GetAttr(tvm::attr::kGlobalSymbol); + CHECK(name_node.defined()) << "Failed to retrieve external symbol"; + mod->Add(GlobalVar(name_node.value()), func); + mod = this->PreProcessModule(mod); + for (const auto& it : mod->functions) { + this->SerializeFunction(it.second, &serialized_functions); + } + } else { + LOG(FATAL) << "The input ref is expected to be a Relay function."; + } + std::string data; + dmlc::MemoryStringStream fs(&data); + dmlc::SeekStream* strm = &fs; + strm->Write(serialized_functions.size()); + for (const auto& it : serialized_functions) { + 
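// Each entry is the external symbol name followed by its serialized (JSON and constants) payload. +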
strm->Write(it.first); + strm->Write(it.second); + } + strm->Seek(0); + std::string make_acl_module = "runtime.module.loadbinary_acl"; + auto pf = tvm::runtime::Registry::Get(make_acl_module); + if (pf) { + return (*pf)(strm); + } else { + return runtime::Module(); + } +} + +JSONSubGraph CodegenACL::CreateJSONSubgraph(const Function& func) { + Expr body = func->body; + this->layer_table_.clear(); + this->constants_.clear(); + this->VisitExpr(body); + std::vector ops; + for (const auto& it : this->layer_table_) { + ops.push_back(it.second); + } + CHECK_EQ(layer_table_.size(), 1) << "ACL codegen expects only a single op per function."; + return JSONSubGraph(ops[0]); +} + +void CodegenACL::SerializeFunction( + const ObjectRef& ref, std::vector>* serialized_functions) { + Function func = Downcast(ref); + JSONSubGraph subgraph = this->CreateJSONSubgraph(func); + const auto name_node = func->GetAttr(tvm::attr::kGlobalSymbol); + CHECK(name_node != "") << "Fail to retrieve external symbol"; + std::string serialized_pair = SerializeSubgraph(subgraph, this->constants_); + serialized_functions->emplace_back(name_node.value(), serialized_pair); +} + +IRModule CodegenACL::PreProcessModule(const IRModule& mod) { + IRModule preprocessed_module; + tvm::Map> desired_layouts = { + {"nn.conv2d", {String("NHWC"), String("OHWI")}}}; + preprocessed_module = transform::ConvertLayout(desired_layouts)(mod); + preprocessed_module = transform::FoldConstant()(preprocessed_module); + return preprocessed_module; +} + +JSONOp CodegenACL::MakeConvolutionOp(const Call& call) { + JSONOp op("conv2d"); + const CallNode* pad = nullptr; + const CallNode* conv; + const CallNode* bias = nullptr; + bool has_activation = false; + if (call->op->IsInstance()) { + Expr composite_conv = GetCompositeExpr(call); + // Unpack composite function + const auto* current_call = composite_conv.as(); + if (backend::IsOp(current_call, "nn.relu")) { + has_activation = true; + current_call = current_call->args[0].as(); + } + if (backend::IsOp(current_call, "nn.bias_add")) { + bias = current_call; + current_call = current_call->args[0].as(); + } + CHECK(backend::IsOp(current_call, "nn.conv2d")); + conv = current_call; + if (!current_call->args.empty() && current_call->args[0]->IsInstance()) { + current_call = current_call->args[0].as(); + if (backend::IsOp(current_call, "nn.pad")) { + pad = current_call; + } + } + } else { + conv = call.as(); + } + const auto* conv_attr = conv->attrs.as(); + CHECK(conv_attr); + CHECK(conv_attr->kernel_layout == "OHWI") + << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + if (pad) { + op.inputs.push_back(MakeJSONTensor(pad->args[0])); + } else { + op.inputs.push_back(MakeJSONTensor(conv->args[0])); + } + op.inputs.push_back(MakeJSONConstTensor(conv->args[1])); + op.outputs.push_back(MakeJSONTensor(GetRef(conv))); + if (bias) { + op.inputs.push_back(MakeJSONConstTensor(bias->args[1])); + } + // It seems there are two different methods for padding a convolution: + // - using nn.pad operator before convolution + // - using conv2d_attrs to add padding + // + // Cover both cases here. 
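+    // If an explicit nn.pad precedes the convolution its pad_width takes precedence
+    // over the conv2d padding attribute.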
+ std::vector padding; + if (pad) { + const auto* pad_attr = pad->attrs.as(); + CHECK(pad_attr); + padding = GetPadVector(pad_attr->pad_width); + } else { + padding = GetPadVector(conv_attr->padding); + } + op.attrs["padding"] = padding; + op.attrs["groups"] = conv_attr->groups; + op.attrs["strides"] = ToVector(conv_attr->strides); + if (has_activation) op.attrs["activation_type"] = std::string("relu"); + return op; +} + +JSONOp CodegenACL::MakeMaxPool2DOp(const Call& call) { + JSONOp op("max_pool"); + const auto* attr = call->attrs.as(); + CHECK(attr); + op.inputs.push_back(MakeJSONTensor(call->args[0])); + op.outputs.push_back(MakeJSONTensor(call)); + op.attrs["padding"] = GetPadVector(attr->padding); + op.attrs["strides"] = ToVector(attr->strides); + op.attrs["pooling_type"] = std::string("max"); + op.attrs["pool_size"] = ToVector(attr->pool_size); + return op; +} + +JSONOp CodegenACL::MakeReshapeOp(const Call& call) { + JSONOp op("reshape"); + const auto* attr = call->attrs.as(); + CHECK(attr); + op.inputs.push_back(MakeJSONTensor(call->args[0])); + op.outputs.push_back(MakeJSONTensor(call)); + return op; +} + +JSONTensor CodegenACL::MakeJSONTensor(const Expr& expr) { + const auto* ttnode = expr->checked_type().as(); + CHECK(ttnode); + std::vector shape = ToVector(ttnode->shape); + return JSONTensor("var", shape); +} + +JSONTensor CodegenACL::MakeJSONConstTensor(const Expr& expr) { + const auto* ttnode = expr->checked_type().as(); + CHECK(ttnode); + std::vector shape = ToVector(ttnode->shape); + VisitExpr(expr); + return JSONTensor("const", shape); +} + +bool CodegenACL::IsAclFunc(const CallNode* call, const std::string& op_name) const { + if (call->op->IsInstance()) { + Function func = Downcast(call->op); + CHECK(func.defined()); + auto name_node = func->GetAttr(attr::kComposite); + return name_node.value() == op_name; + } + return false; +} + +Expr CodegenACL::GetCompositeExpr(const Call& call) { + Function composite_function = Downcast(call->op); + Expr composite_expr = composite_function->body; + CHECK(composite_expr->IsInstance()); + return composite_expr; +} + +std::vector CodegenACL::ToVector(const Array& array) { + std::vector stl_vector; + for (auto it : array) { + const auto* val = it.as(); + CHECK(val); + stl_vector.push_back(val->value); + } + return stl_vector; +} + +std::vector CodegenACL::GetPadVector(const Array>& pad) { + // TVM nn.pad: top, bottom, left, right -> ACL Pad: left, right, top, bottom + auto acl_pad = {pad[2][0], pad[2][1], pad[1][0], pad[1][1]}; + return ToVector(acl_pad); +} + +std::vector CodegenACL::GetPadVector(const Array& pad) { + Array acl_pad; + switch (pad.size()) { + case 1: + acl_pad = {pad[0], pad[0], pad[0], pad[0]}; + break; + case 2: + // TVM Pad: height, width -> ACL Pad: left, right, top, bottom + acl_pad = {pad[1], pad[1], pad[0], pad[0]}; + break; + case 4: + // TVM Pad: top, left, bottom, right -> ACL Pad: left, right, top, bottom + acl_pad = {pad[1], pad[3], pad[0], pad[2]}; + break; + default: + LOG(FATAL) << "Unsupported padding dimensions"; + } + return ToVector(acl_pad); +} + +} // namespace acl +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/acl/codegen_acl.h b/src/relay/backend/contrib/acl/codegen_acl.h new file mode 100644 index 0000000000000..23efb09521b2a --- /dev/null +++ b/src/relay/backend/contrib/acl/codegen_acl.h @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/acl/codegen_acl.h + * \brief The Relay -> ACL JSON schema compiler. + */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_ACL_CODEGEN_ACL_H_ +#define TVM_RELAY_BACKEND_CONTRIB_ACL_CODEGEN_ACL_H_ + +#include +#include +#include + +#include +#include +#include +#include + +#include "acl_api.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace acl { + +/*! + * \brief Generates an ACLModule from a relay expression. This "compilation" + * does not require ACL since the actual conversion using ACL APIs is + * deferred until creation of the runtime. This step simply serializes the + * relay program into a JSON string. + */ +class CodegenACL : public MixedModeVisitor { + public: + CodegenACL() = default; + void VisitExpr_(const CallNode* node) final; + void VisitLeaf(const Expr& expr) final; + + /*! + * \brief Create a runtime module for ACL. + * + * This consists of a series of "serialized functions" which each represent a + * subgraph to be computed by ACL and will each be executed independently from + * one another. Each function consists of serialized JSON describing the subgraph + * and serialized constant tensors. + * + * \note The ACL runtime module only currently supports a single operator per + * subgraph currently. + * + * \param ref The ext_func Relay expression/module to be executed using extern ops. + * \return A runtime module. + */ + runtime::Module CreateRuntimeModule(const ObjectRef& ref); + + /*! + * \brief Create a JSON representation of a subgraph. + * + * \param func The function to be represented. + * \return A JSON representation of the function. + */ + JSONSubGraph CreateJSONSubgraph(const Function& func); + + private: + /*! + * \brief Serialize a single subgraph which can be saved to disk. + * + * A subgraph is serialized so that the output is as follows. + * - Serialized JSON. + * - Number of constant tensors. + * - Serialized constant tensors. + * + * \param ref Reference to the function to be serialized. + * \param serialized_functions A vector of serialized functions to add to. + */ + void SerializeFunction(const ObjectRef& ref, + std::vector>* serialized_functions); + + /*! + * \brief Pre-process a module containing functions ready for ACL codegen. + * + * For now we enforce OHWI kernel layout and fold the transforms away. + * + * \param mod The module to be pre-processed. + * \return The processed module. + */ + IRModule PreProcessModule(const IRModule& mod); + + /*! + * \brief Create a JSON representation of an operator. + * + * \param call The call to be represented. + * \return A JSON representation of a specific operator. + */ + JSONOp MakeConvolutionOp(const Call& call); + static JSONOp MakeMaxPool2DOp(const Call& call); + static JSONOp MakeReshapeOp(const Call& call); + + /*! 
+ * \brief Make a JSON representation of a (constant)tensor. + * + * \param expr Expression of a tensor to be represented. + * \return A JSON representation of a tensor. + */ + static JSONTensor MakeJSONTensor(const Expr& expr); + JSONTensor MakeJSONConstTensor(const Expr& expr); + + /*! + * \brief Check whether CallNode is a composite function and has the same + * op_name. + * + * \param call The current call node. + * \param op_name The expected name of the call node to check. + * \return True if the call node is composite and has the same name as + * op_name, False otherwise. + */ + bool IsAclFunc(const CallNode* call, const std::string& op_name) const; + + /*! + * \brief Get composite expression from call node. + * + * \param call The call node to get expression from. + * \return Expression for composite function. + */ + static Expr GetCompositeExpr(const Call& call); + + /*! + * \brief Convert a relay array to std::vector. + * + * \param array A relay array to be converted. + * \return std::vector. + */ + static std::vector ToVector(const Array& array); + + /*! + * \brief Create a padding vector compatible with ACL. + * + * Currently TVM has many ways to pad a an operator, so each method is taken care of here. + * + * \param pad Padding array. + * \return ACL compatible padding vector. + */ + static std::vector GetPadVector(const Array>& pad); + static std::vector GetPadVector(const Array& pad); + + /*! \brief A vector of constants to be serialized after the JSON representation is constructed. */ + std::vector constants_; + /*! \brief A look-up table from Expr to JSONOp. */ + std::map layer_table_; +}; + +/*! + * \brief The external ACL compiler/codegen tool. It takes a Relay + * expression/module and compiles it into a runtime module. + */ +runtime::Module ACLCompiler(const ObjectRef& ref) { + CodegenACL acl_codegen; + return acl_codegen.CreateRuntimeModule(ref); +} + +TVM_REGISTER_GLOBAL("relay.ext.acl").set_body_typed(ACLCompiler); + +/*! + * \brief Check whether ACL graph runtime is used. + * \return True if ACL graph runtime is enabled, False if not. + */ +inline constexpr bool IsACLRuntimeEnabled() { +#if TVM_GRAPH_RUNTIME_ACL + return true; +#else + return false; +#endif +} + +TVM_REGISTER_GLOBAL("relay.op.is_acl_runtime_enabled").set_body_typed(IsACLRuntimeEnabled); + +} // namespace acl +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_ACL_CODEGEN_ACL_H_ diff --git a/src/runtime/contrib/acl/acl_allocator.cc b/src/runtime/contrib/acl/acl_allocator.cc new file mode 100644 index 0000000000000..b72ec9552130c --- /dev/null +++ b/src/runtime/contrib/acl/acl_allocator.cc @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/runtime/contrib/acl/acl_allocator.cc + * \brief ACL Allocator implementation that requests memory from TVM. + */ + +#include "acl_allocator.h" + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +void* ACLAllocator::allocate(size_t size, size_t alignment) { + CHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; + return this->device_api_->AllocWorkspace(this->ctx_, size, {}); +} + +void ACLAllocator::free(void* ptr) { this->device_api_->FreeWorkspace(this->ctx_, ptr); } + +std::unique_ptr ACLAllocator::make_region(size_t size, size_t alignment) { + return acl::support::cpp14::make_unique(size, alignment); +} + +ACLMemoryRegion::ACLMemoryRegion(size_t size, size_t alignment) : IMemoryRegion(size) { + CHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; + this->ptr_ = this->device_api_->AllocDataSpace(this->ctx_, size, alignment, {}); +} + +ACLMemoryRegion::ACLMemoryRegion(void* ptr, size_t size) + : IMemoryRegion(size), is_subregion_(true) { + if (size != 0) { + this->ptr_ = ptr; + } +} + +ACLMemoryRegion::~ACLMemoryRegion() { + if (!is_subregion_) { + this->device_api_->FreeDataSpace(this->ctx_, this->ptr_); + } +} + +std::unique_ptr ACLMemoryRegion::extract_subregion(size_t offset, size_t size) { + if (this->ptr_ != nullptr && (offset < _size) && (_size - offset >= size)) { + return acl::support::cpp14::make_unique( + static_cast(this->ptr_) + offset, size); + } else { + return nullptr; + } +} + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/acl/acl_allocator.h b/src/runtime/contrib/acl/acl_allocator.h new file mode 100644 index 0000000000000..d608645947891 --- /dev/null +++ b/src/runtime/contrib/acl/acl_allocator.h @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/acl/acl_allocator.h + * \brief ACL Allocator implementation that requests memory from TVM. + */ + +#ifndef TVM_RUNTIME_CONTRIB_ACL_ACL_ALLOCATOR_H_ +#define TVM_RUNTIME_CONTRIB_ACL_ACL_ALLOCATOR_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +namespace acl = arm_compute; + +/*! + * \brief Override ACL memory allocator and replace with TVM workspace based allocation. + */ +class ACLAllocator : public arm_compute::IAllocator { + public: + ACLAllocator() = default; + + /*! + * \brief Allocate bytes to ACL runtime. + * + * Specific implementation requests memory from TVM using their device api. + * + * \param size Size to allocate. + * \param alignment Alignment that the returned pointer should comply with. + * \return A pointer to the allocated memory. 
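+ * \note The alignment argument is not forwarded; the TVM workspace allocation is
+ * returned as provided by the device API.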
+ */ + void* allocate(size_t size, size_t alignment) override; + + /*! + * \brief Free memory from ACL runtime. + * + * \param ptr Pointer to workspace to free. + */ + void free(void* ptr) override; + + /*! + * \brief Create self-managed memory region. + * + * \param size Size of the memory region. + * \param alignment Alignment of the memory region. + * \return The memory region object. + */ + std::unique_ptr make_region(size_t size, size_t alignment) override; + + private: + /*! \brief Always allocate data in the context of the current CPU. */ + const TVMContext ctx_{kDLCPU, 0}; + /*! \brief Device API which allows requests for memory from TVM. */ + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); +}; + +/*! + * \brief Memory region that can request TVM memory for ACL to use. + */ +class ACLMemoryRegion : public arm_compute::IMemoryRegion { + public: + ACLMemoryRegion(size_t size, size_t alignment); + ACLMemoryRegion(void* ptr, size_t size); + + ~ACLMemoryRegion() override; + + /*! \brief Prevent instances of this class from being copied (As this class contains + * pointers). */ + ACLMemoryRegion(const ACLMemoryRegion&) = delete; + /*! \brief Default move constructor. */ + ACLMemoryRegion(ACLMemoryRegion&&) = default; + /*! \brief Prevent instances of this class from being copied (As this class + * contains pointers) */ + ACLMemoryRegion& operator=(const ACLMemoryRegion&) = delete; + /*! Default move assignment operator. */ + ACLMemoryRegion& operator=(ACLMemoryRegion&&) = default; + + void* buffer() override { return this->ptr_; } + + const void* buffer() const override { return this->ptr_; } + + /*! + * \brief Extract a sub-region from the memory. + * + * \warning Ownership is maintained by the parent memory, + * while a wrapped raw memory region is returned by this function. + * Thus parent memory should not be released before this. + * + * \param offset Offset to the region. + * \param size Size of the region. + * \return A wrapped memory sub-region with no ownership of the + * underlying memory. + */ + std::unique_ptr extract_subregion(size_t offset, size_t size) override; + + private: + /*! \brief Points to a region of memory allocated by TVM. */ + void* ptr_; + /*! \brief A subregion doesn't manage TVM memory so we don't need to free it. */ + bool is_subregion_ = false; + /*! \brief Always allocate data in the context of the current CPU. */ + const TVMContext ctx_{kDLCPU, 0}; + /*! \brief Device API which allows requests for memory from TVM. */ + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); +}; + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_ACL_ACL_ALLOCATOR_H_ diff --git a/src/runtime/contrib/acl/acl_kernel.cc b/src/runtime/contrib/acl/acl_kernel.cc new file mode 100644 index 0000000000000..a87b1b525e2e5 --- /dev/null +++ b/src/runtime/contrib/acl/acl_kernel.cc @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/acl/acl_kernel.cc + * \brief TVM compatible wrappers for ACL kernels. + */ + +#include "acl_kernel.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +CachedLayer::CachedLayer(const api::JSONSubGraph& function, const std::vector& constants, + ACLAllocator* allocator, + const std::shared_ptr& mm) + : constants_(constants), allocator_(allocator) { + api::JSONOp op = function.op; + // Make tensors + int const_tensor_idx = 0; + for (const auto& it : op.inputs) { + if (it.type == "const") { + this->function_.const_inputs.push_back(MakeTensor(it, constants[const_tensor_idx++]->data)); + } else if (it.type == "var") { + this->function_.inputs.push_back(MakeTensor(it)); + } else { + LOG(FATAL) << "Unsupported tensor type"; + } + } + for (const auto& it : op.outputs) { + this->function_.outputs.push_back(MakeTensor(it)); + } + // Create layer + if (op.name == "conv2d") { + CreateConvolution2DLayer(&this->function_, function.op, mm); + this->is_mm_ = true; + } else if (op.name == "max_pool") { + CreateMaxPoolLayer(&this->function_, function.op); + } else if (op.name == "reshape") { + CreateReshapeLayer(&this->function_, function.op); + } else { + LOG(FATAL) << "Operator not yet supported"; + } + // Prepare function + this->function_.function->prepare(); +} + +bool CachedLayer::Inference(const std::vector& inputs, + const std::vector& outputs) { + for (size_t i = 0; i < inputs.size(); i++) { + CheckACLError(function_.inputs[i].allocator()->import_memory(inputs[i]->data)); + } + for (size_t i = 0; i < outputs.size(); i++) { + CheckACLError(function_.outputs[i].allocator()->import_memory(outputs[i]->data)); + } + + this->function_.function->run(); + return true; +} + +size_t CachedLayer::GetNumInputs() const { return this->function_.inputs.size(); } + +void CachedLayer::CreateConvolution2DLayer(CacheItems* cache, const api::JSONOp& params, + const std::shared_ptr& mm) { + auto padding = dmlc::get>(params.attrs.at("padding")); + auto strides = dmlc::get>(params.attrs.at("strides")); + auto groups = dmlc::get(params.attrs.at("groups")); + + CHECK(groups == 1) << "ACL NEON Convolution only supports group size of 1"; + + acl::PadStrideInfo pad_stride_info = + acl::PadStrideInfo(strides[0], strides[1], padding[0], padding[1], padding[2], padding[3], + acl::DimensionRoundingType::FLOOR); + acl::ActivationLayerInfo act_info = acl::ActivationLayerInfo(); + if (params.attrs.find("activation_type") != params.attrs.end()) { + auto activation_function = dmlc::get(params.attrs.at("activation_type")); + + if (activation_function == "relu") { + act_info = acl::ActivationLayerInfo(acl::ActivationLayerInfo::ActivationFunction::RELU); + } else { + LOG(FATAL) << "Unsupported activation function"; + } + } + + auto function = std::make_shared(mm); + function->configure(&cache->inputs[0], &cache->const_inputs[0], + cache->const_inputs.size() > 1 ? 
&cache->const_inputs[1] : nullptr, + &cache->outputs[0], pad_stride_info, acl::WeightsInfo(), acl::Size2D(1U, 1U), + act_info); + + cache->function = function; +} + +void CachedLayer::CreateMaxPoolLayer(CacheItems* cache, const api::JSONOp& params) { + auto padding = dmlc::get>(params.attrs.at("padding")); + auto strides = dmlc::get>(params.attrs.at("strides")); + auto pool_size = dmlc::get>(params.attrs.at("pool_size")); + auto pooling_type = dmlc::get(params.attrs.at("pooling_type")); + + acl::PoolingType pool_type; + if (pooling_type == "max") { + pool_type = acl::PoolingType::MAX; + } else { + LOG(FATAL) << "Pooling type not supported"; + } + + acl::PadStrideInfo pad_stride_info = + acl::PadStrideInfo(strides[0], strides[1], padding[0], padding[1], padding[2], padding[3], + acl::DimensionRoundingType::FLOOR); + acl::PoolingLayerInfo pool_info = acl::PoolingLayerInfo( + pool_type, acl::Size2D(pool_size[0], pool_size[1]), acl::DataLayout::NHWC, pad_stride_info); + + auto function = std::make_shared(); + function->configure(&cache->inputs[0], &cache->outputs[0], pool_info); + + cache->function = function; +} + +void CachedLayer::CreateReshapeLayer(CacheItems* cache, const api::JSONOp& params) { + auto function = std::make_shared(); + function->configure(&cache->inputs[0], &cache->outputs[0]); + + cache->function = function; +} + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/acl/acl_kernel.h b/src/runtime/contrib/acl/acl_kernel.h new file mode 100644 index 0000000000000..8ab8eaf229109 --- /dev/null +++ b/src/runtime/contrib/acl/acl_kernel.h @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/acl/acl_kernel.h + * \brief Use ACL library kernels, we create an interface to these. + */ + +#ifndef TVM_RUNTIME_CONTRIB_ACL_ACL_KERNEL_H_ +#define TVM_RUNTIME_CONTRIB_ACL_ACL_KERNEL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../relay/backend/contrib/acl/acl_api.h" +#include "acl_allocator.h" +#include "acl_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +namespace api = relay::contrib::acl; +namespace acl = arm_compute; + +/*! + * \brief ACL objects we cache in order to avoid needing to construct + * a new layer each time. + */ +struct CacheItems { + std::shared_ptr function; + std::vector inputs; + std::vector const_inputs; + std::vector outputs; +}; + +/*! + * \brief A cached ACL layer containing a single ACL function. + */ +class CachedLayer { + public: + /*! + * \brief Create an ACL layer from a JSON representation. 
Also prepare + * the layer for execution - this will perform actions such as pre- + * transposing of weights. + * + * \note The naming suggests a subgraph directly maps to a layer. + * In general this is not true, but since ACL only expects subgraphs + * consisting of a single op it is. + * + * \param function A JSON representation of a subgraph. + * \param constants The constants used in the subgraph. + * \param allocator ACL can request memory from TVM. + */ + CachedLayer(const api::JSONSubGraph& function, const std::vector& constants, + ACLAllocator* allocator, const std::shared_ptr& mm); + + /*! + * \brief Run inference on the ACL layer. + * + * \param inputs The inputs for the layer. + * \param outputs The outputs for the layer. + * \return True if success, False if not successful. + */ + bool Inference(const std::vector& inputs, const std::vector& outputs); + + /*! + * \brief Get the number of inputs the layer takes. + * + * \return Number of inputs. + */ + size_t GetNumInputs() const; + + /*! + * \brief Check if the layer requires working memory to be allocated. + * + * \return True if it does, False if not. + */ + bool IsMemoryManaged() const { return this->is_mm_; } + + private: + /*! \brief Constant tensors used in the layer. */ + std::vector constants_; + /*! \brief Cache ACL function and tensors for execution. */ + CacheItems function_; + /*! \brief ACL Allocator to request auxiliary memory from TVM. */ + ACLAllocator* allocator_; + /*! \brief Check if the function requires working memory to be allocated. */ + bool is_mm_ = false; + + /*! \brief Create individual ACL layer. */ + static void CreateConvolution2DLayer(CacheItems* cache, const api::JSONOp& params, + const std::shared_ptr& mm); + static void CreateMaxPoolLayer(CacheItems* cache, const api::JSONOp& params); + static void CreateReshapeLayer(CacheItems* cache, const api::JSONOp& params); +}; + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_ACL_ACL_KERNEL_H_ diff --git a/src/runtime/contrib/acl/acl_runtime.cc b/src/runtime/contrib/acl/acl_runtime.cc new file mode 100644 index 0000000000000..1c372fe2c7e01 --- /dev/null +++ b/src/runtime/contrib/acl/acl_runtime.cc @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include "../../../relay/backend/contrib/acl/acl_api.h" +#include "../../file_util.h" + +#ifdef TVM_GRAPH_RUNTIME_ACL +#include +#include +#include + +#include "acl_allocator.h" +#include "acl_kernel.h" +#endif + +namespace tvm { +namespace runtime { + +namespace api = relay::contrib::acl; + +class ACLModule : public ModuleNode { + public: + /*! + * \brief The ACL runtime module. 
Deserialize the provided functions + * on creation and store in the layer cache. + * + * \param serialized_graphs A vector of (external symbol, serialized JSON subgraph) pairs. + */ + explicit ACLModule(const std::vector>& serialized_functions) { +#ifdef TVM_GRAPH_RUNTIME_ACL + auto lifetime_mgr = std::make_shared(); + auto pool_mgr = std::make_shared(); + auto mm = std::make_shared(lifetime_mgr, pool_mgr); + int num_pools = 0; +#endif + + for (const auto& it : serialized_functions) { + std::string serialized_function = it.second; + auto ds = api::DeserializeSubgraph(&serialized_function); + this->deserialized_functions_.emplace_back(it.first, ds); + +#ifdef TVM_GRAPH_RUNTIME_ACL + this->subgraph_cache_[it.first] = + std::make_shared(ds.first, ds.second, &this->allocator_, mm); + if (this->subgraph_cache_[it.first]->IsMemoryManaged()) num_pools++; +#endif + } +#ifdef TVM_GRAPH_RUNTIME_ACL + // Allocate working memory for layers. + if (num_pools > 0) mm->populate(this->allocator_, num_pools); +#endif + } + + /*! + * \brief Get a PackedFunc from the ACL module. + * + * \param name The name of the function. + * \param sptr_to_self The ObjectPtr that points to this module node. + * \return The function pointer when it is found, otherwise, PackedFunc(nullptr). + */ + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { +#ifdef TVM_GRAPH_RUNTIME_ACL + if (this->subgraph_cache_.find(name) != this->subgraph_cache_.end()) { + return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) { + *rv = tvm::runtime::ACLModule::Inference(args, this->subgraph_cache_[name].get()); + }); + } +#endif + return PackedFunc(nullptr); + } + + /*! + * \brief The type key of the module. + * + * \return module type key. + */ + const char* type_key() const override { return "acl"; } + + /*! + * \brief Unpack inputs and outputs and run inference on a given layer. + * + * \param args Access inputs and outputs. + * \param function The layer to execute inference on. + * \return Status of inference. + */ +#ifdef TVM_GRAPH_RUNTIME_ACL + static bool Inference(tvm::runtime::TVMArgs args, contrib::acl::CachedLayer* function) { + // Unpack parameters + int argc = 0; + std::vector inputs; + for (size_t i = 0; i < function->GetNumInputs(); i++) { + inputs.push_back(args[argc++]); + } + std::vector outputs; + for (; argc < args.size(); argc++) { + outputs.push_back(args[argc]); + } + return function->Inference(inputs, outputs); + } +#endif + + /*! + * \brief Save a compiled network to a binary stream, which can then be + * serialized to disk. + * + * \param stream The stream to save the binary. + */ + void SaveToBinary(dmlc::Stream* stream) final { + stream->Write(this->deserialized_functions_.size()); + for (const auto& it : this->deserialized_functions_) { + stream->Write(it.first); + std::pair> subgraph_pair = it.second; + std::string serialized_function = + api::SerializeSubgraph(subgraph_pair.first, subgraph_pair.second); + stream->Write(serialized_function); + } + } + + /*! + * \brief Load a compiled network from stream. + * + * \param strm The binary stream to load. + * \return The created ACL module. 
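+ * \note The stream layout mirrors SaveToBinary above: the number of functions is
+ *       read first, followed by each function's external symbol and its
+ *       serialized JSON subgraph.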
+ */ + static Module LoadFromBinary(void* strm) { + auto stream = static_cast(strm); + size_t func_count; + stream->Read(&func_count); + std::vector> serialized_functions; + for (unsigned int i = 0; i < func_count; i++) { + std::string ext_symbol; + std::string serialized_function; + stream->Read(&ext_symbol); + stream->Read(&serialized_function); + serialized_functions.emplace_back(std::make_pair(ext_symbol, serialized_function)); + } + auto n = make_object(serialized_functions); + return Module(n); + } + + /*! + * \brief Save a module to a specified path. + * + * \param path Where to save the serialized module. + * \param format The format of the file. + */ + void SaveToFile(const std::string& path, const std::string& format) override { + std::string data; + dmlc::MemoryStringStream writer(&data); + dmlc::SeekStream* strm = &writer; + SaveToBinary(strm); + SaveBinaryToFile(path, data); + } + + /*! + * \brief Create a module from a file. + * + * \param path The path of the file containing the serialized module. + * \return The created ACL module. + */ + static Module LoadFromFile(const std::string& path) { + std::string data; + LoadBinaryFromFile(path, &data); + dmlc::MemoryStringStream reader(&data); + return LoadFromBinary(&reader); + } + + /*! + * \brief Get the JSON generated by codegen. + * + * \param format the format to return (only JSON for the time being) + * \return A string of JSON. + */ + std::string GetSource(const std::string& format) override { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + for (const auto& it : deserialized_functions_) { + writer.WriteObjectKeyValue(it.first, it.second.first); + } + writer.EndObject(); + return os.str(); + } + + private: + /* \brief A vector of (external symbol, serialized JSON subgraph) pairs. */ + std::vector>>> + deserialized_functions_; + +#ifdef TVM_GRAPH_RUNTIME_ACL + /* \brief A map between ext_symbols (function names) and an ACL subgraph. + * \note Currently only a single op per subgraph is supported. Hence mapping to + * cached layer.*/ + std::map> subgraph_cache_; + /*! \brief Allow ACL functions to request auxiliary memory from TVM. */ + contrib::acl::ACLAllocator allocator_; +#endif +}; + +TVM_REGISTER_GLOBAL("runtime.module.loadfile_acl").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = ACLModule::LoadFromFile(args[0]); +}); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_acl").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = ACLModule::LoadFromBinary(args[0]); +}); + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/acl/acl_utils.cc b/src/runtime/contrib/acl/acl_utils.cc new file mode 100644 index 0000000000000..6e29cc384d404 --- /dev/null +++ b/src/runtime/contrib/acl/acl_utils.cc @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/acl/acl_utils.cc + * \brief Utils and common functions for the interface. + */ + +#include "acl_utils.h" + +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +void CheckACLError(arm_compute::Status status) { + CHECK(status.error_code() == arm_compute::ErrorCode::OK) << "ACL: " << status.error_description(); +} + +acl::Tensor MakeTensor(const api::JSONTensor& tensor_rep, void* data) { + acl::Tensor tensor; + acl::TensorInfo info = MakeTensorInfo(tensor_rep); + tensor.allocator()->init(info); + if (data != nullptr) { + CheckACLError(tensor.allocator()->import_memory(data)); + } + return tensor; +} + +acl::TensorInfo MakeTensorInfo(const api::JSONTensor& tensor_rep) { + return acl::TensorInfo(MakeTensorShape(tensor_rep.shape), 1, acl::DataType::F32, + acl::DataLayout::NHWC); +} + +arm_compute::TensorShape MakeTensorShape(const std::vector& shape) { + arm_compute::TensorShape acl_shape; + for (unsigned int i = shape.size(); i > 0; --i) { + acl_shape.set(shape.size() - i, shape[i - 1]); + } + return acl_shape; +} + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/acl/acl_utils.h b/src/runtime/contrib/acl/acl_utils.h new file mode 100644 index 0000000000000..111121d48308e --- /dev/null +++ b/src/runtime/contrib/acl/acl_utils.h @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/acl/acl_utils.h + * \brief Utils and common functions for the interface. + */ + +#ifndef TVM_RUNTIME_CONTRIB_ACL_ACL_UTILS_H_ +#define TVM_RUNTIME_CONTRIB_ACL_ACL_UTILS_H_ + +#include +#include + +#include + +#include "../../../relay/backend/contrib/acl/acl_api.h" + +namespace tvm { +namespace runtime { +namespace contrib { +namespace acl { + +namespace api = relay::contrib::acl; +namespace acl = arm_compute; + +/*! + * \brief Check if there are any errors from acl and forward them to TVM. + * + * \param status status of called function. + * + * Status values: + * - 0 => OK + * - 1 => RUNTIME_ERROR + * - 2 => UNSUPPORTED_EXTENSION_USE + */ +void CheckACLError(acl::Status status); + +/*! + * \brief Make an acl tensor from JSON tensor representation. + * + * \param tensor_rep A JSON tensor representation. + * \param data (optional) Initialize the tensor with memory. + * \return arm_compute::Tensor. + */ +acl::Tensor MakeTensor(const api::JSONTensor& tensor_rep, void* data = nullptr); + +/*! + * \brief Make an acl tensor info object from JSON tensor + * representation. + * + * \param tensor_rep A JSON tensor representation. 
+ * \return arm_compute::TensorInfo. + */ +acl::TensorInfo MakeTensorInfo(const api::JSONTensor& tensor_rep); + +/*! + * \brief Convert vector object to acl TensorShape. + * \note This requires reversing the given vector. + * + * \param shape The shape of the tensor as a vector. + * \return acl TensorShape. + */ +acl::TensorShape MakeTensorShape(const std::vector& shape); + +} // namespace acl +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_ACL_ACL_UTILS_H_ diff --git a/tests/python/contrib/test_acl/__init__.py b/tests/python/contrib/test_acl/__init__.py new file mode 100644 index 0000000000000..a8671172febde --- /dev/null +++ b/tests/python/contrib/test_acl/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Infrastructure and tests for ACL""" diff --git a/tests/python/contrib/test_acl/infrastructure.py b/tests/python/contrib/test_acl/infrastructure.py new file mode 100644 index 0000000000000..04c5d2784c28a --- /dev/null +++ b/tests/python/contrib/test_acl/infrastructure.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from itertools import zip_longest, combinations +import json + +import tvm +from tvm import relay +from tvm import rpc +from tvm.contrib import graph_runtime +from tvm.relay.op.contrib import acl +from tvm.contrib import util + + +class Device: + """Adjust the following settings to connect to and use a remote device for tests.""" + use_remote = False + target = "llvm -target=aarch64-linux-gnu -mattr=+neon" + # Enable cross compilation when connecting a remote device from a non-arm platform. + cross_compile = None + # cross_compile = "aarch64-linux-gnu-g++" + + def __init__(self): + """Keep remote device for lifetime of object.""" + self.device = self._get_remote() + + @classmethod + def _get_remote(cls): + """Get a remote (or local) device to use for testing.""" + if cls.use_remote: + # Here you may adjust settings to run the ACL unit tests via a remote + # device using the RPC mechanism. 
Use this in the case you want to compile + # an ACL module on a different machine to what you run the module on i.e. + # x86 -> AArch64. + # + # Use the following to connect directly to a remote device: + # device = rpc.connect( + # hostname="0.0.0.0", + # port=9090) + # + # Or connect via a tracker: + # device = tvm.autotvm.measure.request_remote( + # host="0.0.0.0", + # port=9090, + # device_key="device_key", + # timeout=1000) + # + # return device + raise NotImplementedError( + "Please adjust these settings to connect to your remote device.") + else: + device = rpc.LocalSession() + return device + + +def skip_runtime_test(): + """Skip test if it requires the runtime and it's not present.""" + # ACL codegen not present. + if not tvm.get_global_func("relay.ext.acl", True): + print("Skip because ACL codegen is not available.") + return True + + # Remote device is in use or ACL runtime not present + if not Device.use_remote and not acl.is_acl_runtime_present(): + print("Skip because runtime isn't present or a remote device isn't being used.") + return True + + +def skip_codegen_test(): + """Skip test if it requires the ACL codegen and it's not present.""" + if not tvm.get_global_func("relay.ext.acl", True): + print("Skip because ACL codegen is not available.") + return True + + +def build_module(mod, target, params=None, enable_acl=True): + """Build module with option to build for ACL.""" + if isinstance(mod, tvm.relay.expr.Call): + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + if enable_acl: + mod = acl.partition_for_acl(mod, params) + return relay.build(mod, target=target, params=params) + + +def build_and_run(mod, inputs, outputs, params, device, enable_acl=True, no_runs=1): + """Build and run the relay module.""" + graph, lib, params = build_module(mod, device.target, params, enable_acl) + lib = update_lib(lib, device.device, device.cross_compile) + gen_module = graph_runtime.create(graph, lib, ctx=device.device.cpu(0)) + gen_module.set_input(**inputs) + gen_module.set_input(**params) + for _ in range(no_runs): + gen_module.run() + out = [gen_module.get_output(i) for i in range(outputs)] + return out + + +def update_lib(lib, device, cross_compile): + """Export the library to the remote/local device.""" + lib_name = "mod.so" + temp = util.tempdir() + lib_path = temp.relpath(lib_name) + if cross_compile: + lib.export_library(lib_path, cc=cross_compile) + else: + lib.export_library(lib_path) + device.upload(lib_path) + lib = device.load_module(lib_name) + return lib + + +def verify(answers, atol, rtol): + """Compare the array of answers. 
Each entry is a list of outputs.""" + if len(answers) < 2: + raise RuntimeError( + f"No results to compare: expected at least two, found {len(answers)}") + for answer in zip_longest(*answers): + for outs in combinations(answer, 2): + tvm.testing.assert_allclose( + outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol) + + +def extract_acl_modules(module): + """Get the ACL module(s) from llvm module.""" + return list(filter(lambda mod: mod.type_key == "acl", + module.imported_modules)) + + +def verify_codegen(module, known_good_codegen, num_acl_modules, + target="llvm -target=aarch64-linux-gnu -mattr=+neon"): + """Check acl codegen against a known good output.""" + _, module, _ = build_module(module, target) + acl_modules = extract_acl_modules(module) + + assert len(acl_modules) == num_acl_modules, \ + f"The number of ACL modules produced ({len(acl_modules)}) does not " \ + f"match the expected value ({num_acl_modules})." + + for mod in acl_modules: + source = mod.get_source() + source_json = json.loads(source) + func_name = list(source_json.keys())[0] + codegen = source_json[func_name]["node"] + + assert codegen == known_good_codegen, \ + f"The JSON produced by codegen does not match the expected result. \n" \ + f"Actual={json.dumps(codegen, sort_keys=True, indent=2)} \n" \ + f"Expected={json.dumps(known_good_codegen, sort_keys=True, indent=2)}" diff --git a/tests/python/contrib/test_acl/test_conv2d.py b/tests/python/contrib/test_acl/test_conv2d.py new file mode 100644 index 0000000000000..a2724315c4e8e --- /dev/null +++ b/tests/python/contrib/test_acl/test_conv2d.py @@ -0,0 +1,202 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
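+# Note: the expected-codegen helper below derives the output spatial size with the
+# usual convolution arithmetic. For example, with a 25x25 NHWC input, a 3x3 kernel,
+# (1, 1) padding and unit strides:
+#   output_height = (25 - 3 + 1 + 1) / 1 + 1 = 25
+#   output_width  = (25 - 3 + 1 + 1) / 1 + 1 = 25
+# giving an output shape of (1, 25, 25, channels).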
+"""ACL Integration conv2d tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \ + verify, verify_codegen +from .infrastructure import Device + + +def _get_model(shape, kernel_size, padding, strides, + dilation, groups, dtype, channels, + var_names, has_bias=False, has_activation=False, has_pad=False): + """Return a model and any parameters it may have""" + a = relay.var(next(var_names), shape=shape, dtype=dtype) + if has_pad: + p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0)) + a = relay.nn.pad(a, pad_width=p) + padding = (0, 0, 0, 0) + else: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + shape = (shape[0], shape[1] + padding[0] * 2, + shape[2] + padding[1] * 2, shape[3]) + weight_shape = (kernel_size, kernel_size, shape[3] // groups, channels) + w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) + weights = relay.const(w, dtype) + out = relay.nn.conv2d( + a, + weights, + kernel_size=(kernel_size, kernel_size), + data_layout="NHWC", + kernel_layout="HWIO", + dilation=(1, 1), + strides=strides, + padding=padding, + groups=groups, + channels=channels + ) + params = {"w": w} + if has_bias: + b = tvm.nd.array(np.random.uniform(-128, 127, weight_shape[3]).astype(dtype)) + biasc = relay.const(b, dtype) + out = relay.nn.bias_add(out, biasc, axis=3) + params["b"] = b + if has_activation: + out = relay.nn.relu(out) + return out, params + + +def _get_expected_codegen(shape, kernel_size, padding, strides, + dilation, groups, dtype, channels, + has_bias=False, has_activation=False): + codegen = { + "name": "conv2d", + "inputs": [], + "outputs": [], + "attrs": { + "groups": ["Int", 1], + "num_inputs": ["Size_t", 2], + "num_outputs": ["Size_t", 1] + } + } + + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + # Transpose padding to match ACL format + padding = (padding[1], padding[3], padding[0], padding[2]) + weight_shape = (channels, kernel_size, kernel_size, shape[3] // groups) + output_height = ((shape[1] - kernel_size + padding[2] + padding[3]) / strides[0]) + 1 + output_width = ((shape[2] - kernel_size + padding[0] + padding[1]) / strides[1]) + 1 + output_shape = (1, int(output_height), int(output_width), channels) + + codegen["attrs"]["padding"] = ["IntVector", list(padding)] + codegen["attrs"]["strides"] = ["IntVector", list(strides)] + if has_activation: + codegen["attrs"]["activation_type"] = ["String", "relu"] + + inputs = [{"type": "var", "shape": list(shape)}, + {"type": "const", "shape": list(weight_shape)}] + if has_bias: + inputs.append({"type": "const", "shape": [weight_shape[0]]}) + outputs = [{"type": "var", "shape": list(output_shape)}] + + codegen["inputs"] = inputs + codegen["outputs"] = outputs + codegen["attrs"]["num_inputs"] = ["Size_t", len(inputs)] + codegen["attrs"]["num_outputs"] = ["Size_t", len(outputs)] + + return codegen + + +def test_conv2d(): + if skip_runtime_test(): + return + + device = Device() + + shape = (1, 25, 25, 1) + dtype = "float32" + + inputs = { + "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), + } + + for kernel_size in [2, 3]: + outputs = [] + func, params = _get_model(shape, kernel_size, + (0, 0), (1, 1), 1, 1, + dtype, 1, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, + params, device, + enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + for pad_ksize in 
[((1, 1), 3), ((2, 2), 5), ((2, 1), 3)]: + outputs = [] + func, params = _get_model(shape, pad_ksize[1], pad_ksize[0], + (1, 1), 1, 1, dtype, 1, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, + params, device, + enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + for strides in [(1, 1), (2, 2)]: + outputs = [] + func, params = _get_model(shape, 2, (0, 0), strides, + 1, 1, dtype, 1, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, + params, device, + enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + # Test composite convolution: (has_pad, has_bias, has_activation). + for composite in [(False, True, False), (False, False, True), (False, True, True), + (True, False, False)]: + outputs = [] + func, params = _get_model(shape, 2, (1, 1), (1, 1), + 1, 1, dtype, 1, iter(inputs), + has_pad=composite[0], + has_bias=composite[1], + has_activation=composite[2]) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, + params, device, + enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + +def test_codegen_conv2d(): + if skip_codegen_test(): + return + + shape = (1, 25, 25, 1) + dtype = "float32" + inputs = {"a"} + + for pad_ksize in [((1, 1), 3), ((2, 1), 3)]: + args = (shape, pad_ksize[1], pad_ksize[0], (1, 1), 1, 1, dtype, 1) + func, params = _get_model(*args, var_names=iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + # Test composite convolution: (has_pad, has_bias, has_activation). + for composite in [(False, True, False), (False, False, True), (False, True, True), + (True, False, False)]: + args = (shape, 2, (1, 1), (1, 1), 1, 1, dtype, 1) + func, params = _get_model(*args, var_names=iter(inputs), + has_pad=composite[0], + has_bias=composite[1], + has_activation=composite[2]) + exp_codegen = _get_expected_codegen(*args, + has_bias=composite[1], + has_activation=composite[2], + ) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_conv2d() + test_codegen_conv2d() diff --git a/tests/python/contrib/test_acl/test_network.py b/tests/python/contrib/test_acl/test_network.py new file mode 100644 index 0000000000000..e5afe905228f1 --- /dev/null +++ b/tests/python/contrib/test_acl/test_network.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
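+# Note: these tests pull VGG16 and MobileNet through the Keras frontend, so they
+# additionally require Keras with the pretrained 'imagenet' weights available.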
+"""ACL network tests.""" + +import numpy as np + +from tvm import relay + +from .infrastructure import skip_runtime_test, build_and_run, verify +from .infrastructure import Device + + +def _build_and_run_keras_network(mod, params, inputs, device): + """Helper function to build and run a network from the Keras frontend.""" + data = {} + for name, shape in inputs.items(): + data[name] = np.random.uniform(-128, 127, shape).astype("float32") + + outputs = [] + for acl in [False, True]: + outputs.append(build_and_run(mod, data, 1, params, + device, enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + +def test_vgg16(): + if skip_runtime_test(): + return + + device = Device() + + def get_model(): + from keras.applications import VGG16 + vgg16 = VGG16(include_top=True, weights='imagenet', + input_shape=(224, 224, 3), classes=1000) + inputs = {vgg16.input_names[0]: (1, 224, 224, 3)} + mod, params = relay.frontend.from_keras(vgg16, inputs, layout="NHWC") + return mod, params, inputs + + _build_and_run_keras_network(*get_model(), device=device) + + +def test_mobilenet(): + if skip_runtime_test(): + return + + device = Device() + + def get_model(): + from keras.applications import MobileNet + mobilenet = MobileNet(include_top=True, weights='imagenet', + input_shape=(224, 224, 3), classes=1000) + inputs = {mobilenet.input_names[0]: (1, 224, 224, 3)} + mod, params = relay.frontend.from_keras(mobilenet, inputs, layout="NHWC") + return mod, params, inputs + + _build_and_run_keras_network(*get_model(), device=device) + + +if __name__ == "__main__": + test_vgg16() + test_mobilenet() diff --git a/tests/python/contrib/test_acl/test_pooling.py b/tests/python/contrib/test_acl/test_pooling.py new file mode 100644 index 0000000000000..8fb1e93d6ac07 --- /dev/null +++ b/tests/python/contrib/test_acl/test_pooling.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""ACL Integration pooling tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \ + verify, verify_codegen +from .infrastructure import Device + + +def _get_model(shape, typef, sizes, strides, padding, + ceil_mode, var_names): + """Return a model and any parameters it may have.""" + var = relay.var(next(var_names), shape=shape, dtype="float32") + pool = typef(var, pool_size=sizes, strides=strides, padding=padding, + ceil_mode=ceil_mode, layout="NHWC") + return pool + + +def _get_expected_codegen(shape, typef, sizes, strides, padding, + ceil_mode): + codegen = { + "name": "max_pool", + "inputs": [], + "outputs": [], + "attrs": { + "pooling_type": ["String", "max"] + } + } + + if len(padding) == 2: + padding = (padding[1], padding[1], padding[0], padding[0]) + # Transpose padding to match ACL format + padding = (padding[1], padding[3], padding[0], padding[2]) + output_height = ((shape[1] - sizes[0] + padding[2] + padding[3]) / strides[0]) + 1 + output_width = ((shape[2] - sizes[1] + padding[0] + padding[1]) / strides[1]) + 1 + output_shape = (1, int(output_height), int(output_width), shape[3]) + + if typef == relay.nn.max_pool2d: + pooling_type = "max" + else: + raise NotImplementedError(f"No conversion from {typef} to pooling_type string.") + + codegen["attrs"]["padding"] = ["IntVector", list(padding)] + codegen["attrs"]["strides"] = ["IntVector", list(strides)] + codegen["attrs"]["pool_size"] = ["IntVector", list(sizes)] + codegen["attrs"]["pooling_type"] = ["String", pooling_type] + + inputs = [{"type": "var", "shape": list(shape)}] + outputs = [{"type": "var", "shape": list(output_shape)}] + + codegen["inputs"] = inputs + codegen["outputs"] = outputs + codegen["attrs"]["num_inputs"] = ["Size_t", len(inputs)] + codegen["attrs"]["num_outputs"] = ["Size_t", len(outputs)] + + return codegen + + +def test_pooling(): + if skip_runtime_test(): + return + + device = Device() + + for size in [(2, 2), (3, 3)]: + for stride in [(2, 2)]: + shape = (1, size[0] + stride[0] * 5, + size[1] + stride[1] * 5, 16) + + inputs = { + "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype("float32")), + } + + outputs = [] + func = _get_model(shape, relay.nn.max_pool2d, size, + stride, (0, 0), True, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl)) + verify(outputs, atol=0.001, rtol=0.001) + + +def test_codegen_pooling(): + if skip_codegen_test(): + return + + inputs = {"a"} + + for size in [(2, 2), (3, 3)]: + for stride in [(2, 2)]: + shape = (1, size[0] + stride[0] * 5, + size[1] + stride[1] * 5, 16) + args = (shape, relay.nn.max_pool2d, size, + stride, (0, 0), True) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_pooling() + test_codegen_pooling() diff --git a/tests/python/contrib/test_acl/test_reshape.py b/tests/python/contrib/test_acl/test_reshape.py new file mode 100644 index 0000000000000..81192cdf992c8 --- /dev/null +++ b/tests/python/contrib/test_acl/test_reshape.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""ACL Integration reshape tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \ + verify, verify_codegen +from .infrastructure import Device + + +def _get_model(input_shape, output_shape, var_names): + """Return a model and any parameters it may have.""" + a = relay.var(next(var_names), shape=input_shape, dtype="float32") + reshape = relay.reshape(a, output_shape) + return reshape + + +def _get_expected_codegen(input_shape, output_shape): + codegen = { + "name": "reshape", + "inputs": [], + "outputs": [], + "attrs": {} + } + + inputs = [{"type": "var", "shape": list(input_shape)}] + outputs = [{"type": "var", "shape": list(output_shape)}] + + codegen["inputs"] = inputs + codegen["outputs"] = outputs + codegen["attrs"]["num_inputs"] = ["Size_t", len(inputs)] + codegen["attrs"]["num_outputs"] = ["Size_t", len(outputs)] + + return codegen + + +def test_reshape(): + if skip_runtime_test(): + return + + device = Device() + + inputs = { + "a": tvm.nd.array( + np.random.uniform(-128, 127, (1, 1, 1, 1000)).astype("float32")) + } + + for shape in [(1, 1000), (10, 10, 10)]: + outputs = [] + func = _get_model(inputs["a"].shape, shape, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl)) + verify(outputs, atol=1e-7, rtol=1e-7) + + +def test_codegen_reshape(): + if skip_codegen_test(): + return + + shape = (1, 1, 1, 1000) + inputs = {"a"} + + for new_shape in [(1, 1000), (10, 10, 10)]: + args = (shape, new_shape) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_reshape() + test_codegen_reshape() diff --git a/tests/python/contrib/test_acl/test_runtime.py b/tests/python/contrib/test_acl/test_runtime.py new file mode 100644 index 0000000000000..7b332730e9538 --- /dev/null +++ b/tests/python/contrib/test_acl/test_runtime.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
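+# Note: the ACL runtime currently supports a single operator per subgraph, so the
+# two offloaded reshapes in test_multiple_ops below are expected to be lowered to
+# two separate ACL functions.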
+"""ACL runtime tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import skip_runtime_test, build_and_run, verify +from .infrastructure import Device + + +def test_multiple_ops(): + """ + Test multiple operators destined for acl. + ACL will expect these ops as in 2 separate functions. + """ + if skip_runtime_test(): + return + + device = Device() + + def get_model(input_shape, var_names): + """Return a model and any parameters it may have.""" + a = relay.var(next(var_names), shape=input_shape, dtype="float32") + out = relay.reshape(a, (1, 1, 1000)) + out = relay.reshape(out, (1, 1000)) + return out + + inputs = { + "a": tvm.nd.array(np.random.uniform(0, 1, (1, 1, 1, 1000)).astype("float32")) + } + + outputs = [] + for acl in [False, True]: + func = get_model(inputs["a"].shape, iter(inputs)) + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl)) + verify(outputs, atol=0.002, rtol=0.01) + + +def test_multiple_runs(): + """ + Test that multiple runs of an operator work. + Note: the result isn't checked. + """ + if skip_runtime_test(): + return + + device = Device() + + def get_model(): + a = relay.var("a", shape=(1, 28, 28, 512), dtype="float32") + w = tvm.nd.array(np.ones((256, 1, 1, 512), dtype="float32")) + weights = relay.const(w, "float32") + conv = relay.nn.conv2d( + a, + weights, + kernel_size=(1, 1), + data_layout="NHWC", + kernel_layout="OHWI", + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1) + ) + params = {"w": w} + return conv, params + + inputs = { + "a": tvm.nd.array(np.ones((1, 28, 28, 512), dtype="float32")), + } + + func, params = get_model() + build_and_run(func, inputs, 1, + params, device, + enable_acl=True, + no_runs=3) + + +if __name__ == "__main__": + test_multiple_ops() + test_multiple_runs()