From 7bdd494c2ccfd10dbd65b5379ac9125c46b979c8 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 14 Jul 2020 10:28:46 +0100 Subject: [PATCH 1/4] [BYOC][ACL] Support asymmetric per-layer quantization Adds support for asymmetric per-layer quantization in the ACL runtime. This includes support for qnn.conv2d, nn.maxpool2d and reshape. Reflected these changes in codegen and runtime tests. Change-Id: I8f610bd37af1e3740fd48c2d502bcc4727d9d712 --- docs/deploy/arm_compute_lib.rst | 23 + .../tvm/relay/op/contrib/arm_compute_lib.py | 50 ++- python/tvm/relay/qnn/op/layout_conversions.py | 17 +- .../contrib/arm_compute_lib/codegen.cc | 91 ++-- .../contrib/arm_compute_lib/acl_runtime.cc | 102 +++-- .../contrib/arm_compute_lib/acl_utils.cc | 65 ++- .../contrib/arm_compute_lib/acl_utils.h | 52 ++- .../test_arm_compute_lib/infrastructure.py | 30 +- .../test_arm_compute_lib/test_conv2d.py | 392 +++++++++++++----- .../test_arm_compute_lib/test_network.py | 93 ++++- .../test_arm_compute_lib/test_pooling.py | 60 +-- .../test_arm_compute_lib/test_reshape.py | 45 +- 12 files changed, 760 insertions(+), 260 deletions(-) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 28abc9ce6e8f..d74c093c1b81 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -121,6 +121,29 @@ networks refer to the tests: `tests/python/contrib/test_arm_compute_lib`. Here y `infrastructure.py` to use the remote device you have setup. +Operator support +---------------- ++--------------+-------------------------------------------------------------------------+ +| Relay Node | Remarks | ++==============+=========================================================================+ +| nn.conv2d | fp32: | +| | Simple: nn.conv2d | +| | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? | ++--------------+-------------------------------------------------------------------------+ +| qnn.conv2d | uint8: | +| | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | ++--------------+-------------------------------------------------------------------------+ +| nn.maxpool2d | fp32, uint8 | ++--------------+-------------------------------------------------------------------------+ +| reshape | fp32, uint8 | ++--------------+-------------------------------------------------------------------------+ + +.. note:: + A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this + as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in + the series of operators that make up a composite operator. + + Adding a new operator --------------------- Adding a new operator requires changes to a series of places. This section will give a hint on diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index e5b2af5e9cd3..2f031b39e461 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -81,6 +81,23 @@ def conv_pattern(): pattern = pattern.optional(is_op('nn.relu')) return pattern + def qnn_conv_pattern(): + """Create a quantized convolution pattern. + + Returns + ------- + pattern : dataflow_pattern.AltPattern + Denotes the convolution pattern. 
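+
+        Notes
+        -----
+        Matches nn.pad?, qnn.conv2d, nn.bias_add?, nn.relu? followed by a
+        required qnn.requantize, where '?' denotes an optional operator.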
+ """ + pattern = is_op('nn.pad')(wildcard()) | wildcard() + pattern = is_op('qnn.conv2d')( + pattern, is_constant(), is_constant(), is_constant(), is_constant(), is_constant()) + pattern = pattern.optional(lambda x: is_op('nn.bias_add')(x, is_constant())) + pattern = pattern.optional(is_op('nn.relu')) + pattern = is_op('qnn.requantize')( + pattern, wildcard(), wildcard(), is_constant(), is_constant()) + return pattern + def check_conv(extract): """Check conv pattern is supported by ACL.""" call = extract @@ -88,7 +105,17 @@ def check_conv(extract): call = call.args[0] return conv2d(call.attrs, call.args) - return [('arm_compute_lib.conv2d', conv_pattern(), check_conv)] + def check_qnn_conv(extract): + """Check qnn conv pattern is supported by ACL.""" + if extract.attrs.out_dtype != "uint8": + return False + call = extract + while call.op.name != "qnn.conv2d": + call = call.args[0] + return qnn_conv2d(call.attrs, call.args) + + return [('arm_compute_lib.conv2d', conv_pattern(), check_conv), + ('arm_compute_lib.qnn_conv2d', qnn_conv_pattern(), check_qnn_conv)] def _register_external_op_helper(op_name, supported=True): @@ -115,7 +142,24 @@ def conv2d(attrs, args): if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype != "float32": return False kernel_typ = args[1].checked_type - if kernel_typ.dtype != "float32": + if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": + return False + return True + + +def qnn_conv2d(attrs, args): + """Check if the external ACL codegen for qnn.conv2d should be used.""" + if attrs.groups != 1: + return False + if attrs.data_layout != "NHWC": + return False + if attrs.out_dtype != "int32" and attrs.out_dtype != "": + return False + data_typ = args[0].checked_type + if len(data_typ.shape) != 4 or data_typ.shape[0] != 1 or data_typ.dtype != "uint8": + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8": return False return True @@ -126,6 +170,6 @@ def max_pool2d(attrs, args): if attrs.layout != "NHWC": return False typ = args[0].checked_type - if typ.dtype != "float32": + if typ.dtype not in ["float32", "uint8"]: return False return True diff --git a/python/tvm/relay/qnn/op/layout_conversions.py b/python/tvm/relay/qnn/op/layout_conversions.py index caa4c56f5abb..391714ac0427 100644 --- a/python/tvm/relay/qnn/op/layout_conversions.py +++ b/python/tvm/relay/qnn/op/layout_conversions.py @@ -20,6 +20,8 @@ from tvm.relay.op import op as reg +from ...op.strategy.generic import is_depthwise_conv2d + @reg.register_convert_op_layout("qnn.conv2d") def convert_qnn_conv2d(attrs, inputs, tinfos, desired_layouts): @@ -51,11 +53,20 @@ def convert_qnn_conv2d(attrs, inputs, tinfos, desired_layouts): new_attrs = dict(attrs) new_attrs['data_layout'] = desired_data_layout + if desired_kernel_layout != "default": + new_attrs['kernel_layout'] = desired_kernel_layout + return relay.qnn.op.conv2d(*inputs, **new_attrs) + if desired_data_layout == 'NCHW': - if desired_kernel_layout != "default": - new_attrs['kernel_layout'] = desired_kernel_layout + new_attrs['kernel_layout'] = 'OIHW' + return relay.qnn.op.conv2d(*inputs, **new_attrs) + if desired_data_layout == 'NHWC': + # Check for depthwise convolution. 
+ if is_depthwise_conv2d(inputs[0].shape, attrs['data_layout'], inputs[1].shape, + attrs['kernel_layout'], attrs['groups']): + new_attrs['kernel_layout'] = 'HWOI' else: - new_attrs['kernel_layout'] = 'OIHW' + new_attrs['kernel_layout'] = 'HWIO' return relay.qnn.op.conv2d(*inputs, **new_attrs) raise ValueError('Layout %s is not yet supported' % desired_data_layout) diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index 8edbc15401bc..a6761e2347d7 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -49,6 +49,18 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { public: ACLJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + /*! + * \brief A series of operators that form a composite + * convolution. Supports both nn.conv2d and qnn.conv2d. + */ + struct CompositeConvNodes { + const CallNode* pad = nullptr; + const CallNode* conv = nullptr; + const CallNode* bias = nullptr; + const CallNode* activation = nullptr; + const CallNode* requantize = nullptr; + }; + /*! * \brief Visit call nodes and generate appropriate JSON node. * @@ -68,7 +80,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { CHECK(comp.defined()) << "Arm Compute Library JSON runtime only supports composite functions."; const std::string name = comp.value(); std::shared_ptr json_node; - if (name == "arm_compute_lib.conv2d") { + if (name == "arm_compute_lib.conv2d" || name == "arm_compute_lib.qnn_conv2d") { json_node = CreateCompositeConvJSONNode(cn); } else { LOG(FATAL) << "Unrecognized Arm Compute Library pattern: " << name; @@ -78,57 +90,83 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { private: /*! - * \brief Create a JSON representation of a composite convolution. + * \brief Extract convolution nodes from a composite function. * - * \param call The call to be represented. - * \return A JSON representation of a specific operator. + * \param cn The call node of the composite function. + * \return Extracted composite convolution nodes. 
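+   * \note Members left as nullptr indicate operators that are absent from
+   * the composite function.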
*/ - std::shared_ptr CreateCompositeConvJSONNode(const CallNode* cn) { - const std::string name = "nn.conv2d"; - const CallNode* pad = nullptr; - const CallNode* conv = nullptr; - const CallNode* bias = nullptr; - bool has_activation = false; - - // Unpack composite function + static CompositeConvNodes UnpackCompositeConvolution(const CallNode* cn) { + CompositeConvNodes nodes{}; const auto* fn = cn->op.as(); CHECK(fn); const auto* current_call = fn->body.as(); + if (backend::IsOp(current_call, "qnn.requantize")) { + nodes.requantize = current_call; + current_call = current_call->args[0].as(); + } if (backend::IsOp(current_call, "nn.relu")) { - has_activation = true; + nodes.activation = current_call; current_call = current_call->args[0].as(); } if (backend::IsOp(current_call, "nn.bias_add")) { - bias = current_call; + nodes.bias = current_call; current_call = current_call->args[0].as(); } - CHECK(backend::IsOp(current_call, "nn.conv2d")); - conv = current_call; + if (nodes.requantize) { + CHECK(backend::IsOp(current_call, "qnn.conv2d")); + } else { + CHECK(backend::IsOp(current_call, "nn.conv2d")); + } + nodes.conv = current_call; if (!current_call->args.empty() && current_call->args[0]->IsInstance()) { current_call = current_call->args[0].as(); if (backend::IsOp(current_call, "nn.pad")) { - pad = current_call; + nodes.pad = current_call; } } + return nodes; + } + + /*! + * \brief Create a JSON representation of a composite convolution. + * + * \param call The call to be represented. + * \return A JSON representation of a specific operator. + */ + std::shared_ptr CreateCompositeConvJSONNode(const CallNode* cn) { + CompositeConvNodes nodes = UnpackCompositeConvolution(cn); + std::string name = "nn.conv2d"; - const auto* conv_attr = conv->attrs.as(); + const auto* conv_attr = nodes.conv->attrs.as(); CHECK(conv_attr); CHECK(conv_attr->kernel_layout == "OHWI") << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + // Inputs must be added in the same order they appear in the relay graph. std::vector inputs; inputs.push_back(VisitExpr(cn->args[0])[0]); - inputs.push_back(VisitExpr(conv->args[1])[0]); - if (bias) { - inputs.push_back(VisitExpr(bias->args[1])[0]); + inputs.push_back(VisitExpr(nodes.conv->args[1])[0]); + if (nodes.requantize) { + name = "qnn.conv2d"; + inputs.push_back(VisitExpr(nodes.conv->args[2])[0]); // input zero-point + inputs.push_back(VisitExpr(nodes.conv->args[3])[0]); // kernel zero-point + inputs.push_back(VisitExpr(nodes.conv->args[4])[0]); // input scale + inputs.push_back(VisitExpr(nodes.conv->args[5])[0]); // kernel scale + } + if (nodes.bias) { + inputs.push_back(VisitExpr(nodes.bias->args[1])[0]); + } + if (nodes.requantize) { + inputs.push_back(VisitExpr(nodes.requantize->args[3])[0]); // output scale + inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]); // output zero-point } auto json_node = std::make_shared(name, "kernel", inputs, 1); - SetCallNodeAttribute(json_node, conv); + SetCallNodeAttribute(json_node, nodes.conv); // Override attributes - if (pad) { - const auto* pad_attr = pad->attrs.as(); + if (nodes.pad) { + const auto* pad_attr = nodes.pad->attrs.as(); CHECK(pad_attr); auto p = pad_attr->pad_width; // Convert to TVM layout for now, conversion to ACL layout takes place in runtime. 
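+      // The flattened padding order used here is (top, left, bottom, right);
+      // the runtime converts this to ACL's own padding layout.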
@@ -141,7 +179,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { padding_attr.emplace_back(padding); json_node->SetAttr("padding", padding_attr); } - if (has_activation) { + if (nodes.activation) { std::vector activation_type = {"relu"}; std::vector act_attr; act_attr.emplace_back(activation_type); @@ -161,7 +199,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { */ IRModule PreProcessModule(const IRModule& mod) { IRModule preprocessed_module; - tvm::Map> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}}}; + tvm::Map> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}}, + {"qnn.conv2d", {"NHWC", "OHWI"}}}; preprocessed_module = transform::ConvertLayout(desired_layouts)(mod); preprocessed_module = transform::FoldConstant()(preprocessed_module); return preprocessed_module; diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index e8cdef743eb4..7b65692ea160 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -25,7 +25,6 @@ #include #include -#include "../../file_util.h" #include "../json/json_node.h" #include "../json/json_runtime.h" @@ -116,19 +115,6 @@ class ACLRuntime : public JSONRuntimeBase { void BuildEngine() { std::shared_ptr mm = MakeMemoryManager(); int num_pools = 0; - - for (size_t i = 0; i < input_nodes_.size(); ++i) { - uint32_t nid = input_nodes_[i]; - const auto& node = nodes_[nid]; - if (node.GetOpType() == "input") { - layer_.inputs.push_back(MakeTensor(node)); - } else if (node.GetOpType() == "const") { - uint32_t eid = EntryID(nid, 0); - void* data = data_entry_[eid]->data; - layer_.const_inputs.push_back(MakeTensor(node, data)); - } - } - bool found_kernel_node = false; for (size_t nid = 0; nid < nodes_.size(); ++nid) { const auto& node = nodes_[nid]; @@ -139,7 +125,7 @@ class ACLRuntime : public JSONRuntimeBase { if (node.GetOpType() == "kernel") { found_kernel_node = true; auto op_name = node.GetOpName(); - if ("nn.conv2d" == op_name) { + if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) { CreateConvolution2DLayer(&layer_, node, mm); num_pools++; } else if ("nn.max_pool2d" == op_name) { @@ -163,10 +149,48 @@ class ACLRuntime : public JSONRuntimeBase { struct CachedLayer { std::shared_ptr function; std::vector inputs; - std::vector const_inputs; std::vector outputs; }; + /*! + * \brief Create an ACL tensor given the JSON representation. + * + * \param tensor The tensor to represent. + * \param scale (optional) The scale of the tensor as an input. + * \param offset (optional) The offset of the tensor as an input. + * \return ACL Tensor. + */ + arm_compute::Tensor GetACLTensor(const JSONGraphNodeEntry& tensor, + JSONGraphNodeEntry* scale = nullptr, + JSONGraphNodeEntry* offset = nullptr) { + JSONGraphNode node = nodes_[tensor.id_]; + void* node_data = nullptr; + if (node.GetOpType() == "const") { + node_data = data_entry_[EntryID(tensor)]->data; + } + return GetACLTensor(node, scale, offset, node_data); + } + + /*! + * \brief Create an ACL tensor given the JSON representation. + * + * \param node The tensor to represent. + * \param scale (optional) The scale of the tensor as an input. + * \param offset (optional) The offset of the tensor as an input. + * \param data (optional) Constant data of input node. + * \return ACL Tensor. 
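+   * \note When both scale and offset are provided, the returned tensor is
+   * created with quantization info.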
+ */ + arm_compute::Tensor GetACLTensor(const JSONGraphNode& node, JSONGraphNodeEntry* scale = nullptr, + JSONGraphNodeEntry* offset = nullptr, void* data = nullptr) { + const DLTensor* scale_data = nullptr; + const DLTensor* offset_data = nullptr; + if (scale && offset) { + scale_data = data_entry_[EntryID(*scale)]; + offset_data = data_entry_[EntryID(*offset)]; + } + return MakeTensor(node, data, scale_data, offset_data); + } + /*! * \brief Create a 2D convolution layer. * @@ -174,13 +198,12 @@ class ACLRuntime : public JSONRuntimeBase { * \param node The JSON representation of the operator. * \param mm The ACL conv2d layer can request auxiliary memory from TVM. */ - static void CreateConvolution2DLayer( - CachedLayer* layer, const JSONGraphNode& node, - const std::shared_ptr& mm) { + void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node, + const std::shared_ptr& mm) { std::vector padding = node.GetAttr>("padding"); std::vector strides = node.GetAttr>("strides"); std::vector dilation = node.GetAttr>("dilation"); - arm_compute::PadStrideInfo pad_stride_info = ToACLPadStride(padding, strides); + arm_compute::PadStrideInfo pad_stride_info = MakePadStride(padding, strides); int groups = std::stoi(node.GetAttr>("groups")[0]); CHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1."; @@ -198,13 +221,30 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1])); - layer->outputs.push_back(MakeOutputTensor(node.GetOpShape()[0])); + // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases. + std::vector inputs = node.GetInputs(); + size_t num_inputs = inputs.size(); + bool has_bias; + if (node.GetOpName() == "qnn.conv2d") { + has_bias = num_inputs == 9; + layer->inputs.push_back(GetACLTensor(inputs[0], &inputs[4], &inputs[2])); + layer->inputs.push_back(GetACLTensor(inputs[1], &inputs[5], &inputs[3])); + if (has_bias) { + layer->inputs.push_back(GetACLTensor(inputs[6])); + } + layer->outputs.push_back(GetACLTensor(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); + } else { + has_bias = num_inputs == 3; + for (const auto& i : inputs) { + layer->inputs.push_back(GetACLTensor(i)); + } + layer->outputs.push_back(GetACLTensor(node)); + } auto function = std::make_shared(mm); - function->configure(&layer->inputs[0], &layer->const_inputs[0], - layer->const_inputs.size() > 1 ? &layer->const_inputs[1] : nullptr, - &layer->outputs[0], pad_stride_info, arm_compute::WeightsInfo(), - dilation_2d, act_info); + function->configure(&layer->inputs[0], &layer->inputs[1], + has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info, + arm_compute::WeightsInfo(), dilation_2d, act_info); layer->function = function; } @@ -216,10 +256,10 @@ class ACLRuntime : public JSONRuntimeBase { * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. * \param node The JSON representation of the operator. 
*/ - static void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) { + void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) { std::vector padding = node.GetAttr>("padding"); std::vector strides = node.GetAttr>("strides"); - arm_compute::PadStrideInfo pad_stride_info = ToACLPadStride(padding, strides); + arm_compute::PadStrideInfo pad_stride_info = MakePadStride(padding, strides); auto attr_pool_size = node.GetAttr>("pool_size"); int pool_size_h = std::stoi(attr_pool_size[0]); @@ -236,7 +276,8 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::PoolingLayerInfo(pool_type, arm_compute::Size2D(pool_size_h, pool_size_w), arm_compute::DataLayout::NHWC, pad_stride_info); - layer->outputs.push_back(MakeOutputTensor(node.GetOpShape()[0])); + layer->inputs.push_back(GetACLTensor(node.GetInputs()[0])); + layer->outputs.push_back(GetACLTensor(node)); auto function = std::make_shared(); function->configure(&layer->inputs[0], &layer->outputs[0], pool_info); @@ -249,8 +290,9 @@ class ACLRuntime : public JSONRuntimeBase { * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. * \param node The JSON representation of the operator. */ - static void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node) { - layer->outputs.push_back(MakeOutputTensor(node.GetOpShape()[0])); + void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node) { + layer->inputs.push_back(GetACLTensor(node.GetInputs()[0])); + layer->outputs.push_back(GetACLTensor(node)); auto function = std::make_shared(); function->configure(&layer->inputs[0], &layer->outputs[0]); layer->function = function; diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index ad278ba31c8d..2da7ffde7428 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -38,10 +38,12 @@ void CheckACLError(const arm_compute::Status& status) { CHECK(status.error_code() == arm_compute::ErrorCode::OK) << "ACL: " << status.error_description(); } -arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data) { - CHECK(tensor_rep.GetOpType() == "input" || tensor_rep.GetOpType() == "const"); +arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data, const DLTensor* scale, + const DLTensor* offset) { arm_compute::Tensor tensor; - arm_compute::TensorInfo info = MakeTensorInfo(tensor_rep.GetOpShape()[0]); + std::vector shape = tensor_rep.GetOpShape()[0]; + DLDataType dtype = tensor_rep.GetOpDataType()[0]; + arm_compute::TensorInfo info = MakeTensorInfo(shape, dtype, scale, offset); tensor.allocator()->init(info); if (data != nullptr) { CheckACLError(tensor.allocator()->import_memory(data)); @@ -49,24 +51,25 @@ arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data) { return tensor; } -arm_compute::Tensor MakeOutputTensor(const std::vector& shape) { - arm_compute::Tensor tensor; - tensor.allocator()->init(MakeTensorInfo(shape)); - return tensor; -} - -arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape) { - arm_compute::TensorShape acl_shape = MakeTensorShape(shape); - return arm_compute::TensorInfo(acl_shape, 1, arm_compute::DataType::F32, - arm_compute::DataLayout::NHWC); -} - -arm_compute::TensorShape MakeTensorShape(const std::vector& shape) { +arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const DLDataType& dtype, + const DLTensor* scale, const DLTensor* offset) { 
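+  // ACL stores tensor shapes with the dimensions reversed relative to TVM,
+  // hence the reversed copy below.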
arm_compute::TensorShape acl_shape; for (unsigned int i = shape.size(); i > 0; --i) { acl_shape.set(shape.size() - i, shape[i - 1]); } - return acl_shape; + arm_compute::DataType acl_dtype = MakeDataType(dtype); + arm_compute::TensorInfo info(acl_shape, 1, acl_dtype, arm_compute::DataLayout::NHWC); + + if (scale != nullptr && offset != nullptr) { + std::vector scale_data = GetVectorFromDLTensor(scale); + std::vector offset_data = GetVectorFromDLTensor(offset); + CHECK(scale_data.size() == 1 && offset_data.size() == 1) + << "Currently only per-layer quantization is supported in the Arm Compute Library runtime."; + arm_compute::QuantizationInfo qinfo(scale_data[0], offset_data[0]); + info.set_quantization_info(qinfo); + } + + return info; } std::shared_ptr MakeMemoryManager() { @@ -75,8 +78,8 @@ std::shared_ptr MakeMemoryManager() { return std::make_shared(lifetime_mgr, pool_mgr); } -arm_compute::PadStrideInfo ToACLPadStride(const std::vector& pad, - const std::vector& stride) { +arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, + const std::vector& stride) { int pad_0 = 0, pad_1 = 0, pad_2 = 0, pad_3 = 0; int stride_0 = std::stoi(stride[0]), stride_1 = std::stoi(stride[1]); size_t size = pad.size(); @@ -108,6 +111,30 @@ arm_compute::PadStrideInfo ToACLPadStride(const std::vector& pad, arm_compute::DimensionRoundingType::FLOOR); } +arm_compute::DataType MakeDataType(const DLDataType& data_type) { + if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) { + return arm_compute::DataType::F32; + } else if (data_type.code == DLDataTypeCode::kDLUInt && data_type.bits == 8) { + return arm_compute::DataType::QASYMM8; + } else if (data_type.code == DLDataTypeCode::kDLInt && data_type.bits == 32) { + return arm_compute::DataType::S32; + } else { + LOG(FATAL) << "Datatype " << data_type << " unsupported by ACL runtime"; + return arm_compute::DataType::UNKNOWN; + } +} + +template +std::vector GetVectorFromDLTensor(const DLTensor* tensor) { + CHECK(tensor) << "Cannot convert a nullptr"; + int len = 1; + for (int i = 0; i < tensor->ndim; i++) { + len *= tensor->shape[i]; + } + T* data = static_cast(tensor->data); + return std::vector(data, data + len); +} + } // namespace contrib } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h index 6a9278022e7a..67eebada4ea5 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.h +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h @@ -58,35 +58,26 @@ void CheckACLError(const arm_compute::Status& status); * * \param tensor_rep A JSON tensor representation. * \param data (optional) Initialize the tensor with memory. + * \param scale (optional) The quantization scale. + * \param offset (optional) The quantization offset. * \return arm_compute::Tensor. */ -arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data = nullptr); - -/*! - * \brief Make an acl tensor from type and shape, without having a JSON representation. - * - * \param shape The shape of the tensor to create. - * \return arm_compute::Tensor. - */ -arm_compute::Tensor MakeOutputTensor(const std::vector& shape); +arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data = nullptr, + const DLTensor* scale = nullptr, const DLTensor* offset = nullptr); /*! * \brief Make an acl tensor info object from JSON tensor * representation. * * \param shape The shape of the tensor to create. + * \param dtype The data type of the tensor to create. 
+ * \param scale (optional) The quantization scale. + * \param offset (optional) The quantization offset. * \return arm_compute::TensorInfo. */ -arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape); - -/*! - * \brief Convert vector object to acl TensorShape. - * \note This requires reversing the given vector. - * - * \param shape The shape of the tensor as a vector. - * \return arm_compute::TensorShape. - */ -arm_compute::TensorShape MakeTensorShape(const std::vector& shape); +arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const DLDataType& dtype, + const DLTensor* scale = nullptr, + const DLTensor* offset = nullptr); /*! * \brief Create a memory manager for use with a layer that @@ -103,8 +94,27 @@ std::shared_ptr MakeMemoryManager(); * \param stride The stride vector. * \return arm_compute::PadStrideInfo */ -arm_compute::PadStrideInfo ToACLPadStride(const std::vector& pad, - const std::vector& stride); +arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, + const std::vector& stride); + +/*! + * \brief Convert DLDataType to arm_compute::DataType. + * + * \param data_type The data type to convert. + * \return arm_compute::DataType. + */ +arm_compute::DataType MakeDataType(const DLDataType& data_type); + +/*! + * \brief Get a vector from DLTensor data. + * \note Performs a copy of data. + * + * \tparam T The type of the vector. + * \param tensor The tensor to convert. + * \return Vector of type T. + */ +template +std::vector GetVectorFromDLTensor(const DLTensor* tensor); } // namespace contrib } // namespace runtime diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index ea486b09da59..dfded94b7b11 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -17,6 +17,8 @@ from itertools import zip_longest, combinations import json +import numpy as np + import tvm from tvm import relay from tvm import rpc @@ -154,13 +156,18 @@ def update_lib(lib, device, cross_compile): return lib -def verify(answers, atol, rtol): +def verify(answers, atol, rtol, verify_saturation=False): """Compare the array of answers. Each entry is a list of outputs.""" if len(answers) < 2: raise RuntimeError( f"No results to compare: expected at least two, found {len(answers)}") for answer in zip_longest(*answers): for outs in combinations(answer, 2): + if verify_saturation: + assert np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size, \ + "Output is saturated: {}".format(outs[0]) + assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \ + "Output is saturated: {}".format(outs[0]) tvm.testing.assert_allclose( outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol) @@ -195,3 +202,24 @@ def verify_codegen(module, known_good_codegen, num_acl_modules, f"The JSON produced by codegen does not match the expected result. 
\n" \ f"Actual={codegen_str} \n" \ f"Expected={known_good_codegen_str}" + + +def generate_trials(space, r_factor=3): + """Generate a list of trials given series of parameters.""" + np.random.seed(0) + max_len = 1 + for option in space: + max_len = max(max_len, len(option)) + + num_trials = r_factor * max_len + trials = [] + for i in range(num_trials): + trial = [] + for option in space: + if i % len(option) == 0: + np.random.shuffle(option) + trial.append(option[i % len(option)]) + + trials.append(trial) + + return trials diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py index 8765878c9571..96206ffb9631 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py +++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py @@ -16,19 +16,20 @@ # under the License. """Arm Compute Library integration conv2d tests.""" +import random import numpy as np import tvm from tvm import relay from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \ - verify, verify_codegen + verify, verify_codegen, generate_trials from .infrastructure import Device -def _get_model(shape, kernel_size, padding, strides, - dilation, groups, dtype, channels, - var_names, has_bias=False, has_activation=False, has_pad=False): +def _get_model(shape, kernel_h, kernel_w, padding, strides, + dilation, groups, dtype, channels, var_names, + has_bias=False, has_activation=False, has_pad=False): """Return a model and any parameters it may have""" a = relay.var(next(var_names), shape=shape, dtype=dtype) if has_pad: @@ -40,20 +41,21 @@ def _get_model(shape, kernel_size, padding, strides, padding = (padding[0], padding[1], padding[0], padding[1]) shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3]) - weight_shape = (kernel_size, kernel_size, shape[3] // groups, channels) + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) weights = relay.const(w, dtype) out = relay.nn.conv2d( a, weights, - kernel_size=(kernel_size, kernel_size), + kernel_size=(kernel_h, kernel_w), data_layout="NHWC", kernel_layout="HWIO", - dilation=(1, 1), + dilation=dilation, strides=strides, padding=padding, groups=groups, - channels=channels + channels=channels, + out_dtype=dtype ) params = {"w": w} if has_bias: @@ -66,59 +68,171 @@ def _get_model(shape, kernel_size, padding, strides, return out, params -def _get_expected_codegen(shape, kernel_size, padding, strides, +def _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels): + """Get output qnn parameters given input and kernel paramters.""" + input_max = input_sc * (255 - input_zp) + input_min = - input_sc * input_zp + kernel_max = kernel_sc * (255 - kernel_zp) + kernel_min = - kernel_sc * kernel_zp + output_limits = [kernel_max * kernel_h * kernel_w * channels * input_max, + kernel_min * kernel_h * kernel_w * channels * input_max, + kernel_min * kernel_h * kernel_w * channels * input_min, + kernel_max * kernel_h * kernel_w * channels * input_min] + output_max = max(output_limits) + output_min = min(output_limits) + output_sc = (output_max - output_min) / 255 + output_zp = - int(output_min / output_sc) + return output_zp, output_sc + + +def _get_qnn_model(shape, kernel_h, kernel_w, + padding, strides, dilation, groups, dtype, + channels, input_zp, input_sc, + kernel_zp, kernel_sc, output_zp, + output_sc, var_names, has_bias=False, + 
has_activation=False, has_pad=False): + """Return a model and any parameters it may have.""" + a = relay.var(next(var_names), shape=shape, dtype=dtype) + if has_pad: + p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0)) + a = relay.nn.pad(a, pad_width=p, pad_value=input_zp, pad_mode="constant") + padding = (0, 0, 0, 0) + else: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + shape = (shape[0], shape[1] + padding[0] * 2, + shape[2] + padding[1] * 2, shape[3]) + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype)) + weights = relay.const(w, dtype) + out = relay.qnn.op.conv2d( + a, + weights, + input_zero_point=relay.const(input_zp, "int32"), + kernel_zero_point=relay.const(kernel_zp, "int32"), + input_scale=relay.const(input_sc, "float32"), + kernel_scale=relay.const(kernel_sc, "float32"), + kernel_size=(kernel_h, kernel_w), + data_layout="NHWC", + kernel_layout="HWIO", + dilation=dilation, + strides=strides, + padding=padding, + groups=groups, + channels=channels, + out_dtype="int32" + ) + params = {"w": w} + if has_bias: + b = tvm.nd.array(np.random.uniform(0, 255, weight_shape[3]).astype("int32")) + biasc = relay.const(b, "int32") + out = relay.nn.bias_add(out, biasc, axis=3) + params['b'] = b + if has_activation: + out = relay.nn.relu(out) + req = relay.qnn.op.requantize( + out, + relay.const(input_sc * kernel_sc, 'float32'), # input scale + relay.const(0, 'int32'), # input zero point + relay.const(output_sc, 'float32'), # output scale + relay.const(output_zp, 'int32'), # output zero point + out_dtype="uint8" + ) + return req, params + + +def _get_expected_codegen(shape, kernel_h, kernel_w, padding, strides, dilation, groups, dtype, channels, has_bias=False, has_activation=False): if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) - weight_shape = (channels, kernel_size, kernel_size, shape[3] // groups) - output_height = ((shape[1] - kernel_size + padding[0] + padding[2]) / strides[0]) + 1 - output_width = ((shape[2] - kernel_size + padding[1] + padding[3]) / strides[1]) + 1 + weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups) + output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1 + output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1 output_shape = (1, int(output_height), int(output_width), channels) + out_dtype = "int32" if dtype == "uint8" else "float32" node = { - "op": "kernel", - "name": "nn.conv2d", - "inputs": [[0, 0, 0], [1, 0, 0]], - "attrs": { - "groups": [["1"]], - "num_inputs": str(3 if has_bias else 2), - "num_outputs": "1", - "data_layout": [["NHWC"]], - "kernel_layout": [["OHWI"]], - "channels": [["1"]], - "dilation": [["1", "1"]], - "out_layout": [[""]], - "out_dtype": [[""]], - "kernel_size": [[str(kernel_size), str(kernel_size)]], - "shape": [[list(output_shape)]], - "dtype": [[dtype]], - "padding": [[str(p) for p in padding]], - "strides": [[str(s) for s in strides]] - }, - } + "op": "kernel", + "name": "nn.conv2d", + "inputs": [], + "attrs": { + "groups": [["1"]], + "num_outputs": "1", + "data_layout": [["NHWC"]], + "kernel_layout": [["OHWI"]], + "channels": [[str(channels)]], + "dilation": [[str(dilation[0]), str(dilation[1])]], + "out_layout": [[""]], + "out_dtype": [[out_dtype]], + "kernel_size": [[str(kernel_h), str(kernel_w)]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "padding": [[str(p) for p in 
padding]], + "strides": [[str(s) for s in strides]] + }, + } if has_activation: node["attrs"]["activation_type"] = [["relu"]] - input = { + inputs = [{ "op": "input", "name": "", - "attrs": {"shape": [[list(shape)]], "dtype": [["float32"]]}} - kernel = { + "attrs": { + "shape": [[list(shape)]], + "dtype": [[str(dtype)]] + }}, { "op": "const", "name": "", - "attrs": {"shape": [[list(weight_shape)]], "dtype": [["float32"]]}} + "attrs": { + "shape": [[list(weight_shape)]], + "dtype": [[str(dtype)]] + }}] + + # qnn.conv2d params, input and kernel + if dtype == "uint8": + node["name"] = "qnn.conv2d" + for param_dtype in ["int32", "float32"]: + for _ in range(2): + inputs.append({ + "op": "const", + "name": "", + "attrs": { + "shape": [[[]]], + "dtype": [[param_dtype]] + } + }) if has_bias: - bias = { + bias_dtype = "int32" if dtype == "uint8" else "float32" + inputs.append({ "op": "const", "name": "", - "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [["float32"]]}} - node["inputs"].append([2, 0, 0]) - return [input, kernel, bias, node] - else: - return [input, kernel, node] + "attrs": { + "shape": [[[weight_shape[0]]]], + "dtype": [[bias_dtype]]} + }) + + # qnn.conv2d params, output + if dtype == "uint8": + for param_dtype in ["float32", "int32"]: + inputs.append({ + "op": "const", + "name": "", + "attrs": { + "shape": [[[]]], + "dtype": [[param_dtype]] + } + }) + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs def test_conv2d(): @@ -127,51 +241,31 @@ def test_conv2d(): device = Device() np.random.seed(0) + r = random.Random(0) - shape = (1, 14, 14, 32) + kernel_hs = [1, 2, 3, 5] + kernel_ws = [1, 2, 3, 5] + pad = [(1, 1), (2, 2), (2, 1)] + strides = [(1, 1), (2, 2)] + dilation = [(1, 1)] + # composite operator (pad, bias, activation) + composite = [(False, False, False), (False, True, False), (False, False, True), + (False, True, True), (True, False, False)] dtype = "float32" + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) - inputs = { - "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), - } - - for kernel_size in [1, 2, 3]: - outputs = [] - func, params = _get_model(shape, kernel_size, - (0, 0), (1, 1), 1, 1, - dtype, 1, iter(inputs)) - for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, - params, device, - enable_acl=acl)[0]) - verify(outputs, atol=0.002, rtol=0.01) - - for pad_ksize in [((1, 1), 3), ((2, 2), 5), ((2, 1), 3)]: - outputs = [] - func, params = _get_model(shape, pad_ksize[1], pad_ksize[0], - (1, 1), 1, 1, dtype, 1, iter(inputs)) - for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, - params, device, - enable_acl=acl)[0]) - verify(outputs, atol=0.002, rtol=0.01) - - for strides in [(1, 1), (2, 2)]: + for kernel_h, kernel_w, pad, stride, dilation, composite in trials: + out_channels = r.randint(4, 16) + groups = 1 + shape = (1,) + tuple(np.random.randint(low=max(kernel_hs + kernel_ws), high=32, size=(3,))) outputs = [] - func, params = _get_model(shape, 2, (0, 0), strides, - 1, 1, dtype, 1, iter(inputs)) - for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, - params, device, - enable_acl=acl)[0]) - verify(outputs, atol=0.002, rtol=0.01) + inputs = { + "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), + } - # Test composite convolution: (has_pad, has_bias, has_activation). 
- for composite in [(False, True, False), (False, False, True), (False, True, True), - (True, False, False)]: - outputs = [] - func, params = _get_model(shape, 2, (1, 1), (1, 1), - 1, 1, dtype, 1, iter(inputs), + func, params = _get_model(shape, kernel_h, kernel_w, + pad, stride, dilation, groups, + dtype, out_channels, iter(inputs), has_pad=composite[0], has_bias=composite[1], has_activation=composite[2]) @@ -186,19 +280,28 @@ def test_codegen_conv2d(): if skip_codegen_test(): return - shape = (1, 25, 25, 1) + np.random.seed(0) + r = random.Random(0) + + kernel_hs = [1, 2, 3, 5] + kernel_ws = [1, 2, 3, 5] + pad = [(1, 1), (2, 2), (2, 1)] + strides = [(1, 1), (2, 2)] + dilation = [(1, 1)] + # composite operator (pad, bias, activation) + composite = [(False, False, False), (False, True, False), (False, False, True), + (False, True, True), (True, False, False)] dtype = "float32" - inputs = {"a"} + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + + for kernel_h, kernel_w, pad, stride, dilation, composite in trials: + out_channels = r.randint(4, 16) + groups = 1 + shape = (1,) + tuple(r.randint(a=max(kernel_hs + kernel_ws), b=32) for _ in range(3)) + inputs = {"a"} + + args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) - for pad_ksize in [((1, 1), 3), ((2, 1), 3)]: - args = (shape, pad_ksize[1], pad_ksize[0], (1, 1), 1, 1, dtype, 1) - func, params = _get_model(*args, var_names=iter(inputs)) - exp_codegen = _get_expected_codegen(*args) - verify_codegen(func, exp_codegen, 1) - # Test composite convolution: (has_pad, has_bias, has_activation). - for composite in [(False, True, False), (False, False, True), (False, True, True), - (True, False, False)]: - args = (shape, 2, (1, 1), (1, 1), 1, 1, dtype, 1) func, params = _get_model(*args, var_names=iter(inputs), has_pad=composite[0], has_bias=composite[1], @@ -209,6 +312,109 @@ def test_codegen_conv2d(): verify_codegen(func, exp_codegen, 1) +def test_qnn_conv2d(): + if skip_runtime_test(): + return + + device = Device() + np.random.seed(0) + r = random.Random(0) + + kernel_hs = [1, 2, 3, 5] + kernel_ws = [1, 2, 3, 5] + pad = [(1, 1), (2, 2)] + strides = [(1, 1), (2, 2)] + dilation = [(1, 1)] + # composite operator (pad, bias, activation) + composite = [(False, False, False), (False, True, False), (False, False, True), + (False, True, True), (True, False, False)] + dtype = "uint8" + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + + for kernel_h, kernel_w, pad, stride, dilation, composite in trials: + out_channels = r.randint(4, 16) + groups = 1 + shape = (1,) + tuple(np.random.randint(low=max(kernel_hs + kernel_ws), high=32, size=(3,))) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype)) + } + + input_zp = r.randint(0, 255) + input_sc = r.random() * 2 + kernel_zp = r.randint(0, 255) + kernel_sc = r.random() * 2 + output_zp, output_sc = _get_qnn_params(input_zp, input_sc, + kernel_zp, kernel_sc, + kernel_h, kernel_w, shape[3]) + + func, params = _get_qnn_model(shape, kernel_h, kernel_w, + pad, stride, dilation, groups, + dtype, out_channels, + input_zp, input_sc, + kernel_zp, kernel_sc, + output_zp, output_sc, + iter(inputs), + has_pad=composite[0], + has_bias=composite[1], + has_activation=composite[2]) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, + params, device, + enable_acl=acl)[0]) + verify(outputs, atol=1, rtol=0) + + +def test_codegen_qnn_conv2d(): + if 
skip_codegen_test(): + return + + np.random.seed(0) + r = random.Random(0) + + kernel_hs = [1, 2, 3, 5] + kernel_ws = [1, 2, 3, 5] + pad = [(1, 1), (2, 2), (2, 1)] + strides = [(1, 1), (2, 2)] + dilation = [(1, 1)] + # composite operator (pad, bias, activation) + composite = [(False, False, False), (False, True, False), (False, False, True), + (False, True, True), (True, False, False)] + dtype = "uint8" + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + + for kernel_h, kernel_w, pad, stride, dilation, composite in trials: + out_channels = r.randint(4, 16) + groups = 1 + shape = (1,) + tuple(r.randint(a=max(kernel_hs + kernel_ws), b=32) for _ in range(3)) + inputs = {"a"} + + input_zp = r.randint(0, 255) + input_sc = r.random() * 2 + kernel_zp = r.randint(0, 255) + kernel_sc = r.random() * 2 + output_zp, output_sc = _get_qnn_params(input_zp, input_sc, + kernel_zp, kernel_sc, + kernel_h, kernel_w, shape[3]) + + args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) + + func, params = _get_qnn_model(*args, + input_zp=input_zp, input_sc=input_sc, + kernel_zp=kernel_zp, kernel_sc=kernel_sc, + output_zp=output_zp, output_sc=output_sc, + var_names=iter(inputs), + has_pad=composite[0], + has_bias=composite[1], + has_activation=composite[2]) + exp_codegen = _get_expected_codegen(*args, + has_bias=composite[1], + has_activation=composite[2]) + verify_codegen(func, exp_codegen, 1) + + if __name__ == "__main__": test_conv2d() + test_qnn_conv2d() test_codegen_conv2d() + test_codegen_qnn_conv2d() diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 8648a017ad4a..1ba6ca724f04 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -24,12 +24,17 @@ from .infrastructure import Device -def _build_and_run_keras_network(mod, params, inputs, device, tvm_ops, acl_partitions): - """Helper function to build and run a network from the Keras frontend.""" +def _build_and_run_network(mod, params, inputs, device, tvm_ops, acl_partitions, atol, rtol): + """Helper function to build and run a network.""" data = {} np.random.seed(0) - for name, shape in inputs.items(): - data[name] = np.random.uniform(-128, 127, shape).astype("float32") + + for name, (shape, dtype) in inputs.items(): + if dtype == "uint8": + low, high = 0, 255 + else: + low, high = -127, 128 + data[name] = np.random.uniform(low, high, shape).astype(dtype) outputs = [] for acl in [False, True]: @@ -37,7 +42,40 @@ def _build_and_run_keras_network(mod, params, inputs, device, tvm_ops, acl_parti device, enable_acl=acl, tvm_ops=tvm_ops, acl_partitions=acl_partitions)[0]) - verify(outputs, atol=0.002, rtol=0.01) + verify(outputs, atol=atol, rtol=rtol, verify_saturation=False) + + +def _get_tflite_model(tflite_model_path, inputs_dict): + """Convert TFlite graph to relay.""" + import tflite.Model + + with open(tflite_model_path, 'rb') as f: + tflite_model_buffer = f.read() + + try: + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0) + except AttributeError: + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0) + shape_dict = {} + dtype_dict = {} + for input in inputs_dict: + input_shape, input_dtype = inputs_dict[input] + shape_dict[input] = input_shape + dtype_dict[input] = input_dtype + + return relay.frontend.from_tflite( + tflite_model, + shape_dict=shape_dict, + dtype_dict=dtype_dict + ) 
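+
+# A minimal usage sketch for the helper above (illustrative only; the model
+# path and input name are hypothetical):
+#
+#   mod, params = _get_tflite_model(
+#       "model.tflite", {"input": ((1, 224, 224, 3), "uint8")})
+#   outputs = build_and_run(mod, inputs, 1, params, Device(), enable_acl=True)
+#
+# where `inputs` maps "input" to a tvm.nd.array of uint8 data.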
+ + +def _get_keras_model(keras_model, inputs_dict): + """Convert Keras graph to relay.""" + inputs = {} + for name, (shape, _) in inputs_dict.items(): + inputs[keras_model.input_names[0]] = shape + return relay.frontend.from_keras(keras_model, inputs, layout="NHWC") def test_vgg16(): @@ -50,12 +88,13 @@ def get_model(): from keras.applications import VGG16 vgg16 = VGG16(include_top=True, weights='imagenet', input_shape=(224, 224, 3), classes=1000) - inputs = {vgg16.input_names[0]: (1, 224, 224, 3)} - mod, params = relay.frontend.from_keras(vgg16, inputs, layout="NHWC") + inputs = {vgg16.input_names[0]: ((1, 224, 224, 3), "float32")} + mod, params = _get_keras_model(vgg16, inputs) return mod, params, inputs - _build_and_run_keras_network(*get_model(), device=device, - tvm_ops=10, acl_partitions=18) + _build_and_run_network(*get_model(), device=device, + tvm_ops=10, acl_partitions=18, + atol=0.002, rtol=0.01) def test_mobilenet(): @@ -68,14 +107,42 @@ def get_model(): from keras.applications import MobileNet mobilenet = MobileNet(include_top=True, weights='imagenet', input_shape=(224, 224, 3), classes=1000) - inputs = {mobilenet.input_names[0]: (1, 224, 224, 3)} - mod, params = relay.frontend.from_keras(mobilenet, inputs, layout="NHWC") + inputs = {mobilenet.input_names[0]: ((1, 224, 224, 3), "float32")} + mod, params = _get_keras_model(mobilenet, inputs) + return mod, params, inputs + + _build_and_run_network(*get_model(), device=device, + tvm_ops=74, acl_partitions=17, + atol=0.002, rtol=0.01) + + +def test_quantized_mobilenet(): + if skip_runtime_test(): + return + + import tvm.relay.testing.tf as tf_testing + + device = Device() + + def get_model(): + model_path = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/" \ + "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", + ) + inputs = {"input": ((1, 224, 224, 3), "uint8")} + mod, params = _get_tflite_model( + model_path, + inputs_dict=inputs + ) return mod, params, inputs - _build_and_run_keras_network(*get_model(), device=device, - tvm_ops=74, acl_partitions=17) + _build_and_run_network(*get_model(), device=device, + tvm_ops=45, acl_partitions=16, + atol=8, rtol=0) if __name__ == "__main__": test_vgg16() test_mobilenet() + test_quantized_mobilenet() diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index aac77959aeb6..792483b45b16 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -26,17 +26,17 @@ from .infrastructure import Device -def _get_model(shape, typef, sizes, strides, padding, +def _get_model(shape, dtype, typef, sizes, strides, padding, ceil_mode, var_names): """Return a model and any parameters it may have.""" - var = relay.var(next(var_names), shape=shape, dtype="float32") + var = relay.var(next(var_names), shape=shape, dtype=dtype) pool = typef(var, pool_size=sizes, strides=strides, padding=padding, ceil_mode=ceil_mode, layout="NHWC") return pool -def _get_expected_codegen(shape, typef, sizes, strides, padding, - ceil_mode): +def _get_expected_codegen(shape, dtype, typef, sizes, strides, + padding, ceil_mode): if len(padding) == 2: padding = (padding[1], padding[1], padding[0], padding[0]) output_height = ((shape[1] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1 @@ -52,7 +52,7 @@ def _get_expected_codegen(shape, typef, sizes, strides, padding, 
"num_outputs": "1", "layout": [["NHWC"]], "shape": [[list(output_shape)]], - "dtype": [["float32"]], + "dtype": [[dtype]], "padding": [[str(p) for p in padding]], "strides": [[str(s) for s in strides]], "pool_size": [[str(s) for s in sizes]], @@ -63,7 +63,7 @@ def _get_expected_codegen(shape, typef, sizes, strides, padding, input = { "op": "input", "name": "", - "attrs": {"shape": [[list(shape)]], "dtype": [["float32"]]}} + "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} return [input, node] @@ -74,22 +74,23 @@ def test_pooling(): device = Device() np.random.seed(0) - for size in [(2, 2), (3, 3)]: - for stride in [(2, 2)]: - shape = (1, size[0] + stride[0] * 5, - size[1] + stride[1] * 5, 16) + for dtype, low, high, atol, rtol in [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 0, 0)]: + for size in [(2, 2), (3, 3)]: + for stride in [(2, 2)]: + shape = (1, size[0] + stride[0] * 5, + size[1] + stride[1] * 5, 16) - inputs = { - "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype("float32")), - } + inputs = { + "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + } - outputs = [] - func = _get_model(shape, relay.nn.max_pool2d, size, - stride, (0, 0), True, iter(inputs)) - for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, None, device, - enable_acl=acl)[0]) - verify(outputs, atol=0.001, rtol=0.001) + outputs = [] + func = _get_model(shape, dtype, relay.nn.max_pool2d, size, + stride, (0, 0), True, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl)[0]) + verify(outputs, atol=atol, rtol=rtol) def test_codegen_pooling(): @@ -98,15 +99,16 @@ def test_codegen_pooling(): inputs = {"a"} - for size in [(2, 2), (3, 3)]: - for stride in [(2, 2)]: - shape = (1, size[0] + stride[0] * 5, - size[1] + stride[1] * 5, 16) - args = (shape, relay.nn.max_pool2d, size, - stride, (0, 0), True) - func = _get_model(*args, iter(inputs)) - exp_codegen = _get_expected_codegen(*args) - verify_codegen(func, exp_codegen, 1) + for dtype in ["float32", "uint8"]: + for size in [(2, 2), (3, 3)]: + for stride in [(2, 2)]: + shape = (1, size[0] + stride[0] * 5, + size[1] + stride[1] * 5, 16) + args = (shape, dtype, relay.nn.max_pool2d, size, + stride, (0, 0), True) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) if __name__ == "__main__": diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index cb9f2954170e..98e5ae6f2f43 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -26,14 +26,14 @@ from .infrastructure import Device -def _get_model(input_shape, output_shape, var_names): +def _get_model(input_shape, output_shape, dtype, var_names): """Return a model and any parameters it may have.""" - a = relay.var(next(var_names), shape=input_shape, dtype="float32") + a = relay.var(next(var_names), shape=input_shape, dtype=dtype) reshape = relay.reshape(a, output_shape) return reshape -def _get_expected_codegen(input_shape, output_shape): +def _get_expected_codegen(input_shape, output_shape, dtype): node = { "op": "kernel", "name": "reshape", @@ -43,7 +43,7 @@ def _get_expected_codegen(input_shape, output_shape): "num_outputs": "1", "newshape": [[str(s) for s in output_shape]], "shape": [[list(output_shape)]], - "dtype": [["float32"]], + "dtype": [[dtype]], "reverse": 
[["0"]] }, } @@ -51,7 +51,7 @@ def _get_expected_codegen(input_shape, output_shape): input = { "op": "input", "name": "", - "attrs": {"shape": [[list(input_shape)]], "dtype": [["float32"]]}} + "attrs": {"shape": [[list(input_shape)]], "dtype": [[dtype]]}} return [input, node] @@ -63,18 +63,19 @@ def test_reshape(): device = Device() np.random.seed(0) - inputs = { - "a": tvm.nd.array( - np.random.uniform(-128, 127, (1, 1, 1, 1000)).astype("float32")) - } + for dtype, low, high, atol, rtol in [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 0, 0)]: + inputs = { + "a": tvm.nd.array( + np.random.uniform(low, high, (1, 1, 1, 1000)).astype(dtype)) + } - for shape in [(1, 1000), (10, 10, 10)]: - outputs = [] - func = _get_model(inputs["a"].shape, shape, iter(inputs)) - for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, None, device, - enable_acl=acl)[0]) - verify(outputs, atol=1e-7, rtol=1e-7) + for shape in [(1, 1000), (10, 10, 10)]: + outputs = [] + func = _get_model(inputs["a"].shape, shape, dtype, iter(inputs)) + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl)[0]) + verify(outputs, atol=1e-7, rtol=1e-7) def test_codegen_reshape(): @@ -83,12 +84,12 @@ def test_codegen_reshape(): shape = (1, 1, 1, 1000) inputs = {"a"} - - for new_shape in [(1, 1000), (10, 10, 10)]: - args = (shape, new_shape) - func = _get_model(*args, iter(inputs)) - exp_codegen = _get_expected_codegen(*args) - verify_codegen(func, exp_codegen, 1) + for dtype in ["float32", "uint8"]: + for new_shape in [(1, 1000), (10, 10, 10)]: + args = (shape, new_shape, dtype) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) if __name__ == "__main__": From 63574f6546619a3ebeb90a8cc68dcae0091112fb Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 23 Jul 2020 17:46:41 +0100 Subject: [PATCH 2/4] Address comments Change-Id: I4f9e3e7dbf6053066927cf07c4c19ecc88572e9d --- docs/deploy/arm_compute_lib.rst | 2 + .../contrib/arm_compute_lib/codegen.cc | 10 ++-- .../contrib/arm_compute_lib/acl_runtime.cc | 49 ++++++++++--------- .../contrib/arm_compute_lib/acl_utils.cc | 22 +++++---- .../contrib/arm_compute_lib/acl_utils.h | 19 +++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index d74c093c1b81..6d9e2d8b5959 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -129,9 +129,11 @@ Operator support | nn.conv2d | fp32: | | | Simple: nn.conv2d | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? 
| +| | (only groups = 1 supported) | +--------------+-------------------------------------------------------------------------+ | qnn.conv2d | uint8: | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | +| | (only groups = 1 supported) | +--------------+-------------------------------------------------------------------------+ | nn.maxpool2d | fp32, uint8 | +--------------+-------------------------------------------------------------------------+ diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index a6761e2347d7..08004a85880c 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -53,7 +53,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { * \brief A series of operators that form a composite * convolution. Supports both nn.conv2d and qnn.conv2d. */ - struct CompositeConvNodes { + struct CompositeConvNode { const CallNode* pad = nullptr; const CallNode* conv = nullptr; const CallNode* bias = nullptr; @@ -95,8 +95,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { * \param cn The call node of the composite function. * \return Extracted composite convolution nodes. */ - static CompositeConvNodes UnpackCompositeConvolution(const CallNode* cn) { - CompositeConvNodes nodes{}; + static CompositeConvNode UnpackCompositeConvolution(const CallNode* cn) { + CompositeConvNode nodes{}; const auto* fn = cn->op.as(); CHECK(fn); const auto* current_call = fn->body.as(); @@ -130,11 +130,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { /*! * \brief Create a JSON representation of a composite convolution. * - * \param call The call to be represented. + * \param cn The call to be represented. * \return A JSON representation of a specific operator. */ std::shared_ptr CreateCompositeConvJSONNode(const CallNode* cn) { - CompositeConvNodes nodes = UnpackCompositeConvolution(cn); + CompositeConvNode nodes = UnpackCompositeConvolution(cn); std::string name = "nn.conv2d"; const auto* conv_attr = nodes.conv->attrs.as(); diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 7b65692ea160..d30c683d9632 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -113,7 +113,7 @@ class ACLRuntime : public JSONRuntimeBase { * per engine. */ void BuildEngine() { - std::shared_ptr mm = MakeMemoryManager(); + std::shared_ptr mm = MakeACLMemoryManager(); int num_pools = 0; bool found_kernel_node = false; for (size_t nid = 0; nid < nodes_.size(); ++nid) { @@ -153,26 +153,28 @@ class ACLRuntime : public JSONRuntimeBase { }; /*! - * \brief Create an ACL tensor given the JSON representation. + * \brief Create an ACL tensor given the JSON representation. If scale + * and offset are given, then create a quantized ACL tensor. * * \param tensor The tensor to represent. * \param scale (optional) The scale of the tensor as an input. * \param offset (optional) The offset of the tensor as an input. * \return ACL Tensor. 
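+   * \note The scale and offset entries are expected to come from constant
+   * nodes captured by the codegen pattern.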
*/ - arm_compute::Tensor GetACLTensor(const JSONGraphNodeEntry& tensor, - JSONGraphNodeEntry* scale = nullptr, - JSONGraphNodeEntry* offset = nullptr) { + arm_compute::Tensor MakeACLTensorFromJSONEntry(const JSONGraphNodeEntry& tensor, + JSONGraphNodeEntry* scale = nullptr, + JSONGraphNodeEntry* offset = nullptr) { JSONGraphNode node = nodes_[tensor.id_]; void* node_data = nullptr; if (node.GetOpType() == "const") { node_data = data_entry_[EntryID(tensor)]->data; } - return GetACLTensor(node, scale, offset, node_data); + return MakeACLTensorFromJSONNode(node, scale, offset, node_data); } /*! - * \brief Create an ACL tensor given the JSON representation. + * \brief Create an ACL tensor given the JSON representation. If scale + * and offset are given, then create a quantized ACL tensor. * * \param node The tensor to represent. * \param scale (optional) The scale of the tensor as an input. @@ -180,15 +182,17 @@ class ACLRuntime : public JSONRuntimeBase { * \param data (optional) Constant data of input node. * \return ACL Tensor. */ - arm_compute::Tensor GetACLTensor(const JSONGraphNode& node, JSONGraphNodeEntry* scale = nullptr, - JSONGraphNodeEntry* offset = nullptr, void* data = nullptr) { + arm_compute::Tensor MakeACLTensorFromJSONNode(const JSONGraphNode& node, + JSONGraphNodeEntry* scale = nullptr, + JSONGraphNodeEntry* offset = nullptr, + void* data = nullptr) { const DLTensor* scale_data = nullptr; const DLTensor* offset_data = nullptr; if (scale && offset) { scale_data = data_entry_[EntryID(*scale)]; offset_data = data_entry_[EntryID(*offset)]; } - return MakeTensor(node, data, scale_data, offset_data); + return MakeACLTensor(node, data, scale_data, offset_data); } /*! @@ -203,7 +207,7 @@ class ACLRuntime : public JSONRuntimeBase { std::vector padding = node.GetAttr>("padding"); std::vector strides = node.GetAttr>("strides"); std::vector dilation = node.GetAttr>("dilation"); - arm_compute::PadStrideInfo pad_stride_info = MakePadStride(padding, strides); + arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides); int groups = std::stoi(node.GetAttr>("groups")[0]); CHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1."; @@ -227,18 +231,19 @@ class ACLRuntime : public JSONRuntimeBase { bool has_bias; if (node.GetOpName() == "qnn.conv2d") { has_bias = num_inputs == 9; - layer->inputs.push_back(GetACLTensor(inputs[0], &inputs[4], &inputs[2])); - layer->inputs.push_back(GetACLTensor(inputs[1], &inputs[5], &inputs[3])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3])); if (has_bias) { - layer->inputs.push_back(GetACLTensor(inputs[6])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6])); } - layer->outputs.push_back(GetACLTensor(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); + layer->outputs.push_back( + MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); } else { has_bias = num_inputs == 3; for (const auto& i : inputs) { - layer->inputs.push_back(GetACLTensor(i)); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(i)); } - layer->outputs.push_back(GetACLTensor(node)); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); } auto function = std::make_shared(mm); @@ -259,7 +264,7 @@ class ACLRuntime : public JSONRuntimeBase { void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) { std::vector padding = node.GetAttr>("padding"); 
std::vector strides = node.GetAttr>("strides"); - arm_compute::PadStrideInfo pad_stride_info = MakePadStride(padding, strides); + arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides); auto attr_pool_size = node.GetAttr>("pool_size"); int pool_size_h = std::stoi(attr_pool_size[0]); @@ -276,8 +281,8 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::PoolingLayerInfo(pool_type, arm_compute::Size2D(pool_size_h, pool_size_w), arm_compute::DataLayout::NHWC, pad_stride_info); - layer->inputs.push_back(GetACLTensor(node.GetInputs()[0])); - layer->outputs.push_back(GetACLTensor(node)); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); auto function = std::make_shared(); function->configure(&layer->inputs[0], &layer->outputs[0], pool_info); @@ -291,8 +296,8 @@ class ACLRuntime : public JSONRuntimeBase { * \param node The JSON representation of the operator. */ void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node) { - layer->inputs.push_back(GetACLTensor(node.GetInputs()[0])); - layer->outputs.push_back(GetACLTensor(node)); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); auto function = std::make_shared(); function->configure(&layer->inputs[0], &layer->outputs[0]); layer->function = function; diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index 2da7ffde7428..98c9cda9fae7 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -38,12 +38,12 @@ void CheckACLError(const arm_compute::Status& status) { CHECK(status.error_code() == arm_compute::ErrorCode::OK) << "ACL: " << status.error_description(); } -arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data, const DLTensor* scale, - const DLTensor* offset) { +arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data, + const DLTensor* scale, const DLTensor* offset) { arm_compute::Tensor tensor; std::vector shape = tensor_rep.GetOpShape()[0]; DLDataType dtype = tensor_rep.GetOpDataType()[0]; - arm_compute::TensorInfo info = MakeTensorInfo(shape, dtype, scale, offset); + arm_compute::TensorInfo info = MakeACLTensorInfo(shape, dtype, scale, offset); tensor.allocator()->init(info); if (data != nullptr) { CheckACLError(tensor.allocator()->import_memory(data)); @@ -51,15 +51,17 @@ arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data, cons return tensor; } -arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const DLDataType& dtype, - const DLTensor* scale, const DLTensor* offset) { +arm_compute::TensorInfo MakeACLTensorInfo(const std::vector& shape, + const DLDataType& dtype, const DLTensor* scale, + const DLTensor* offset) { arm_compute::TensorShape acl_shape; for (unsigned int i = shape.size(); i > 0; --i) { acl_shape.set(shape.size() - i, shape[i - 1]); } - arm_compute::DataType acl_dtype = MakeDataType(dtype); + arm_compute::DataType acl_dtype = MakeACLDataType(dtype); arm_compute::TensorInfo info(acl_shape, 1, acl_dtype, arm_compute::DataLayout::NHWC); + // If scale and offset provided create quantized ACL tensor. 
if (scale != nullptr && offset != nullptr) { std::vector scale_data = GetVectorFromDLTensor(scale); std::vector offset_data = GetVectorFromDLTensor(offset); @@ -72,14 +74,14 @@ arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const return info; } -std::shared_ptr MakeMemoryManager() { +std::shared_ptr MakeACLMemoryManager() { auto lifetime_mgr = std::make_shared(); auto pool_mgr = std::make_shared(); return std::make_shared(lifetime_mgr, pool_mgr); } -arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, - const std::vector& stride) { +arm_compute::PadStrideInfo MakeACLPadStride(const std::vector& pad, + const std::vector& stride) { int pad_0 = 0, pad_1 = 0, pad_2 = 0, pad_3 = 0; int stride_0 = std::stoi(stride[0]), stride_1 = std::stoi(stride[1]); size_t size = pad.size(); @@ -111,7 +113,7 @@ arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, arm_compute::DimensionRoundingType::FLOOR); } -arm_compute::DataType MakeDataType(const DLDataType& data_type) { +arm_compute::DataType MakeACLDataType(const DLDataType& data_type) { if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) { return arm_compute::DataType::F32; } else if (data_type.code == DLDataTypeCode::kDLUInt && data_type.bits == 8) { diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h index 67eebada4ea5..80c6f0bcd958 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.h +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h @@ -62,8 +62,9 @@ void CheckACLError(const arm_compute::Status& status); * \param offset (optional) The quantization offset. * \return arm_compute::Tensor. */ -arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data = nullptr, - const DLTensor* scale = nullptr, const DLTensor* offset = nullptr); +arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data = nullptr, + const DLTensor* scale = nullptr, + const DLTensor* offset = nullptr); /*! * \brief Make an acl tensor info object from JSON tensor @@ -75,9 +76,9 @@ arm_compute::Tensor MakeTensor(const JSONGraphNode& tensor_rep, void* data = nul * \param offset (optional) The quantization offset. * \return arm_compute::TensorInfo. */ -arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const DLDataType& dtype, - const DLTensor* scale = nullptr, - const DLTensor* offset = nullptr); +arm_compute::TensorInfo MakeACLTensorInfo(const std::vector& shape, + const DLDataType& dtype, const DLTensor* scale = nullptr, + const DLTensor* offset = nullptr); /*! * \brief Create a memory manager for use with a layer that @@ -85,7 +86,7 @@ arm_compute::TensorInfo MakeTensorInfo(const std::vector& shape, const * * \return reference counted memory manager. */ -std::shared_ptr MakeMemoryManager(); +std::shared_ptr MakeACLMemoryManager(); /*! * \brief Convert TVM padding and stride format to acl PadStrideInfo. @@ -94,8 +95,8 @@ std::shared_ptr MakeMemoryManager(); * \param stride The stride vector. * \return arm_compute::PadStrideInfo */ -arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, - const std::vector& stride); +arm_compute::PadStrideInfo MakeACLPadStride(const std::vector& pad, + const std::vector& stride); /*! * \brief Convert DLDataType to arm_compute::DataType. @@ -103,7 +104,7 @@ arm_compute::PadStrideInfo MakePadStride(const std::vector& pad, * \param data_type The data type to convert. * \return arm_compute::DataType. 
*/ -arm_compute::DataType MakeDataType(const DLDataType& data_type); +arm_compute::DataType MakeACLDataType(const DLDataType& data_type); /*! * \brief Get a vector from DLTensor data. From 05016ad7c427674417e825fe8946822af5434044 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 23 Jul 2020 19:24:34 +0100 Subject: [PATCH 3/4] Fix tutorial Change-Id: I4371e9d97a120fb7776db40ffcde60f46927af4d --- docs/deploy/arm_compute_lib.rst | 2 + .../contrib/arm_compute_lib/codegen.cc | 3 + .../contrib/arm_compute_lib/acl_runtime.cc | 4 ++ .../test_arm_compute_lib/test_conv2d.py | 60 ++++++++++--------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 6d9e2d8b5959..6dc8df082a28 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -129,10 +129,12 @@ Operator support | nn.conv2d | fp32: | | | Simple: nn.conv2d | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? | +| | | | | (only groups = 1 supported) | +--------------+-------------------------------------------------------------------------+ | qnn.conv2d | uint8: | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | +| | | | | (only groups = 1 supported) | +--------------+-------------------------------------------------------------------------+ | nn.maxpool2d | fp32, uint8 | diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index 08004a85880c..88de3edd1124 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -99,6 +99,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { CompositeConvNode nodes{}; const auto* fn = cn->op.as(); CHECK(fn); + + // Traverse composite convolution function from child to parent const auto* current_call = fn->body.as(); if (backend::IsOp(current_call, "qnn.requantize")) { nodes.requantize = current_call; @@ -112,6 +114,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { nodes.bias = current_call; current_call = current_call->args[0].as(); } + // Enforce a convolution node exists at this point during traversal if (nodes.requantize) { CHECK(backend::IsOp(current_call, "qnn.conv2d")); } else { diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index d30c683d9632..2498dcf2ae6d 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -230,6 +230,8 @@ class ACLRuntime : public JSONRuntimeBase { size_t num_inputs = inputs.size(); bool has_bias; if (node.GetOpName() == "qnn.conv2d") { + CHECK(num_inputs >= 8U && num_inputs <= 9U) + << "Quantized convolution requires 9 inputs with a bias, 8 inputs without."; has_bias = num_inputs == 9; layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2])); layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3])); @@ -239,6 +241,8 @@ class ACLRuntime : public JSONRuntimeBase { layer->outputs.push_back( MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); } else { + CHECK(num_inputs >= 2U && num_inputs <= 3U) + << "Convolution requires 3 inputs with a bias, 2 inputs without."; has_bias = num_inputs == 3; for (const auto& i : inputs) { layer->inputs.push_back(MakeACLTensorFromJSONEntry(i)); diff --git 
a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py index 96206ffb9631..e2367b431bd1 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py +++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py @@ -241,23 +241,24 @@ def test_conv2d(): device = Device() np.random.seed(0) - r = random.Random(0) kernel_hs = [1, 2, 3, 5] kernel_ws = [1, 2, 3, 5] pad = [(1, 1), (2, 2), (2, 1)] strides = [(1, 1), (2, 2)] dilation = [(1, 1)] + out_channels = [4, 7, 16] + input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] # composite operator (pad, bias, activation) composite = [(False, False, False), (False, True, False), (False, False, True), (False, True, True), (True, False, False)] dtype = "float32" - trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, out_channels, + input_shapes, composite], 3) - for kernel_h, kernel_w, pad, stride, dilation, composite in trials: - out_channels = r.randint(4, 16) + for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: groups = 1 - shape = (1,) + tuple(np.random.randint(low=max(kernel_hs + kernel_ws), high=32, size=(3,))) + shape = (1, *input_shapes) outputs = [] inputs = { "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), @@ -281,23 +282,24 @@ def test_codegen_conv2d(): return np.random.seed(0) - r = random.Random(0) kernel_hs = [1, 2, 3, 5] kernel_ws = [1, 2, 3, 5] pad = [(1, 1), (2, 2), (2, 1)] strides = [(1, 1), (2, 2)] dilation = [(1, 1)] + out_channels = [4, 7, 16] + input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] # composite operator (pad, bias, activation) composite = [(False, False, False), (False, True, False), (False, False, True), (False, True, True), (True, False, False)] dtype = "float32" - trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, out_channels, + input_shapes, composite], 3) - for kernel_h, kernel_w, pad, stride, dilation, composite in trials: - out_channels = r.randint(4, 16) + for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: groups = 1 - shape = (1,) + tuple(r.randint(a=max(kernel_hs + kernel_ws), b=32) for _ in range(3)) + shape = (1, *input_shapes) inputs = {"a"} args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) @@ -318,32 +320,33 @@ def test_qnn_conv2d(): device = Device() np.random.seed(0) - r = random.Random(0) kernel_hs = [1, 2, 3, 5] kernel_ws = [1, 2, 3, 5] pad = [(1, 1), (2, 2)] strides = [(1, 1), (2, 2)] dilation = [(1, 1)] + out_channels = [4, 7, 16] + input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] # composite operator (pad, bias, activation) composite = [(False, False, False), (False, True, False), (False, False, True), (False, True, True), (True, False, False)] dtype = "uint8" - trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, out_channels, + input_shapes, composite], 3) - for kernel_h, kernel_w, pad, stride, dilation, composite in trials: - out_channels = r.randint(4, 16) + for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: groups = 1 - shape = (1,) + tuple(np.random.randint(low=max(kernel_hs + kernel_ws), 
high=32, size=(3,))) + shape = (1, *input_shapes) outputs = [] inputs = { "a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype)) } - input_zp = r.randint(0, 255) - input_sc = r.random() * 2 - kernel_zp = r.randint(0, 255) - kernel_sc = r.random() * 2 + input_zp = 100 + input_sc = 0.5 + kernel_zp = 25 + kernel_sc = 0.03 output_zp, output_sc = _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3]) @@ -370,29 +373,30 @@ def test_codegen_qnn_conv2d(): return np.random.seed(0) - r = random.Random(0) kernel_hs = [1, 2, 3, 5] kernel_ws = [1, 2, 3, 5] pad = [(1, 1), (2, 2), (2, 1)] strides = [(1, 1), (2, 2)] dilation = [(1, 1)] + out_channels = [4, 7, 16] + input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] # composite operator (pad, bias, activation) composite = [(False, False, False), (False, True, False), (False, False, True), (False, True, True), (True, False, False)] dtype = "uint8" - trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, composite], 3) + trials = generate_trials([kernel_hs, kernel_ws, pad, strides, dilation, out_channels, + input_shapes, composite], 3) - for kernel_h, kernel_w, pad, stride, dilation, composite in trials: - out_channels = r.randint(4, 16) + for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: groups = 1 - shape = (1,) + tuple(r.randint(a=max(kernel_hs + kernel_ws), b=32) for _ in range(3)) + shape = (1, *input_shapes) inputs = {"a"} - input_zp = r.randint(0, 255) - input_sc = r.random() * 2 - kernel_zp = r.randint(0, 255) - kernel_sc = r.random() * 2 + input_zp = 100 + input_sc = 0.5 + kernel_zp = 25 + kernel_sc = 0.03 output_zp, output_sc = _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3]) From d3c08f24a5a4e4dc91540cca76c8ec6eaed143f2 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 28 Jul 2020 12:53:36 +0100 Subject: [PATCH 4/4] Improve test infrastructure * Doc-string for generate trials * Output params on error Change-Id: Ib2e2b1fcdf05cdc77f7f4fb4b46395f28c129957 --- .../test_arm_compute_lib/infrastructure.py | 39 +++++++++++++++++-- .../test_arm_compute_lib/test_conv2d.py | 35 +++++++++++++++-- .../test_arm_compute_lib/test_pooling.py | 14 ++++++- .../test_arm_compute_lib/test_reshape.py | 12 ++++-- 4 files changed, 87 insertions(+), 13 deletions(-) diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index dfded94b7b11..5ed276318aef 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -156,8 +156,11 @@ def update_lib(lib, device, cross_compile): return lib -def verify(answers, atol, rtol, verify_saturation=False): +def verify(answers, atol, rtol, verify_saturation=False, params=None): """Compare the array of answers. 
Each entry is a list of outputs.""" + if params is None: + params = {} + if len(answers) < 2: raise RuntimeError( f"No results to compare: expected at least two, found {len(answers)}") @@ -168,8 +171,15 @@ def verify(answers, atol, rtol, verify_saturation=False): "Output is saturated: {}".format(outs[0]) assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \ "Output is saturated: {}".format(outs[0]) - tvm.testing.assert_allclose( - outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol) + try: + tvm.testing.assert_allclose( + outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol) + except AssertionError as e: + err_msg = "Results not within the acceptable tolerance.\n" + if params: + err_msg += f"The test failed with the following parameters: {params}\n" + err_msg += str(e) + raise AssertionError(err_msg) def extract_acl_modules(module): @@ -205,7 +215,28 @@ def verify_codegen(module, known_good_codegen, num_acl_modules, def generate_trials(space, r_factor=3): - """Generate a list of trials given series of parameters.""" + """Generates a series of trials. + + This algorithm generates a series of non-deterministic trials given a + space of options to test. A trial is generated by pulling a value from + each option in the space. On some occasions the values are shuffled to + ensure a different trial on each r_factor iteration. The algorithm ensures + that each value from an option is used at least once. The total number of + trials is determined by the r_factor * the option with the largest number + of values. + + Parameters + ---------- + space: List[List[Any]] + A list of different options with varying values to test. + r_factor: (optional) int + The repeat factor. + + Returns + ------- + A list of trials specifying values for each option. + + """ np.random.seed(0) max_len = 1 for option in space: diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py index e2367b431bd1..c40746674116 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py +++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py @@ -16,7 +16,6 @@ # under the License. 
"""Arm Compute Library integration conv2d tests.""" -import random import numpy as np import tvm @@ -69,7 +68,7 @@ def _get_model(shape, kernel_h, kernel_w, padding, strides, def _get_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels): - """Get output qnn parameters given input and kernel paramters.""" + """Get output qnn parameters given input and kernel parameters.""" input_max = input_sc * (255 - input_zp) input_min = - input_sc * input_zp kernel_max = kernel_sc * (255 - kernel_zp) @@ -274,7 +273,18 @@ def test_conv2d(): outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) - verify(outputs, atol=0.002, rtol=0.01) + + params = { + "shape": shape, + "groups": groups, + "kernel size": (kernel_h, kernel_w), + "padding": pad, + "stride": stride, + "dilation": dilation, + "out channels": out_channels, + "composite operators (pad, bias, activation)": composite + } + verify(outputs, atol=0.002, rtol=0.01, params=params) def test_codegen_conv2d(): @@ -365,7 +375,24 @@ def test_qnn_conv2d(): outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) - verify(outputs, atol=1, rtol=0) + + params = { + "shape": shape, + "groups": groups, + "kernel size": (kernel_h, kernel_w), + "padding": pad, + "stride": stride, + "dilation": dilation, + "out channels": out_channels, + "composite operators (pad, bias, activation)": composite, + "input scale": input_sc, + "input zero point": input_zp, + "kernel scale": kernel_sc, + "kernel zero point": kernel_zp, + "output scale": output_sc, + "output zero point": output_zp + } + verify(outputs, atol=1, rtol=0, params=params) def test_codegen_qnn_conv2d(): diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index 792483b45b16..4d48f793a636 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -79,6 +79,7 @@ def test_pooling(): for stride in [(2, 2)]: shape = (1, size[0] + stride[0] * 5, size[1] + stride[1] * 5, 16) + pad = (0, 0) inputs = { "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), @@ -86,11 +87,20 @@ def test_pooling(): outputs = [] func = _get_model(shape, dtype, relay.nn.max_pool2d, size, - stride, (0, 0), True, iter(inputs)) + stride, pad, True, iter(inputs)) for acl in [False, True]: outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) - verify(outputs, atol=atol, rtol=rtol) + + params = { + "size": size, + "stride": stride, + "shape": shape, + "pooling type": "max", + "dtype": dtype, + "padding": pad + } + verify(outputs, atol=atol, rtol=rtol, params=params) def test_codegen_pooling(): diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index 98e5ae6f2f43..8ab94377a256 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -69,13 +69,19 @@ def test_reshape(): np.random.uniform(low, high, (1, 1, 1, 1000)).astype(dtype)) } - for shape in [(1, 1000), (10, 10, 10)]: + for new_shape in [(1, 1000), (10, 10, 10)]: outputs = [] - func = _get_model(inputs["a"].shape, shape, dtype, iter(inputs)) + func = _get_model(inputs["a"].shape, new_shape, dtype, iter(inputs)) for acl in [False, True]: outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) - verify(outputs, atol=1e-7, rtol=1e-7) + + 
params = { + "new shape": inputs["a"].shape, + "shape": new_shape, + "dtype": dtype, + } + verify(outputs, atol=1e-7, rtol=1e-7, params=params) def test_codegen_reshape():