From 7a4433fc4d0fd43d90903faf2a717d80827d6871 Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Fri, 31 Jul 2020 09:27:01 +0100
Subject: [PATCH 1/5] [BYOC][ACL] Improved pooling support

Adds support in ACL for the following Relay pooling operators and
composite functions:

* nn.avg_pool2d (fp32), cast + nn.avg_pool2d (uint8) + cast => AVG pool
* nn.global_max_pool2d => Global MAX pool
* nn.global_avg_pool2d (fp32), cast + nn.global_avg_pool2d (uint8) + cast => Global AVG pool
* power(2) + nn.avg_pool2d + sqrt => L2 pooling (fp32 only)

Tests updated to reflect these changes.

Change-Id: I1644b67b60ebb252344eb9695a521d2d958c724e
---
 docs/deploy/arm_compute_lib.rst               |  69 +++--
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 113 +++++++-
 .../contrib/arm_compute_lib/codegen.cc        |  61 +++++
 .../contrib/arm_compute_lib/acl_runtime.cc    |  54 +++-
 .../contrib/arm_compute_lib/acl_utils.cc      |  10 +-
 .../contrib/arm_compute_lib/acl_utils.h       |   4 +-
 .../test_arm_compute_lib/infrastructure.py    |  25 +-
 .../test_arm_compute_lib/test_conv2d.py       |  10 +-
 .../test_arm_compute_lib/test_network.py      |   4 +-
 .../test_arm_compute_lib/test_pooling.py      | 252 ++++++++++++++----
 .../test_arm_compute_lib/test_reshape.py      |   4 +-
 11 files changed, 503 insertions(+), 103 deletions(-)

diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index 26b42ae4a9c3..e3399c57db26 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -188,31 +188,50 @@ An example configuration for `test_config.json`:
 
 Operator support
 ----------------
-+--------------+-------------------------------------------------------------------------+
-| Relay Node   | Remarks                                                                 |
-+==============+=========================================================================+
-| nn.conv2d    | fp32:                                                                   |
-|              |   Simple: nn.conv2d                                                     |
-|              |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?                 |
-|              |                                                                         |
-|              | (only groups = 1 supported)                                             |
-+--------------+-------------------------------------------------------------------------+
-| qnn.conv2d   | uint8:                                                                  |
-|              |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize |
-|              |                                                                         |
-|              | (only groups = 1 supported)                                             |
-+--------------+-------------------------------------------------------------------------+
-| nn.dense     | fp32:                                                                   |
-|              |   Simple: nn.dense                                                      |
-|              |   Composite: nn.dense, nn.bias_add?                                     |
-+--------------+-------------------------------------------------------------------------+
-| qnn.dense    | uint8:                                                                  |
-|              |   Composite: qnn.dense, nn.bias_add?, qnn.requantize                    |
-+--------------+-------------------------------------------------------------------------+
-| nn.maxpool2d | fp32, uint8                                                             |
-+--------------+-------------------------------------------------------------------------+
-| reshape      | fp32, uint8                                                             |
-+--------------+-------------------------------------------------------------------------+
++----------------------+-------------------------------------------------------------------------+
+| Relay Node           | Remarks                                                                 |
++======================+=========================================================================+
+| nn.conv2d            | fp32:                                                                   |
+|                      |   Simple: nn.conv2d                                                     |
+|                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?                 |
+|                      |                                                                         |
+|                      | (only groups = 1 supported)                                             |
++----------------------+-------------------------------------------------------------------------+
+| qnn.conv2d           | uint8:                                                                  |
+|                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize |
+|                      |                                                                         |
+|                      | (only groups = 1 supported)                                             |
++----------------------+-------------------------------------------------------------------------+
+| nn.dense             | fp32:                                                                   |
+|                      |   Simple: nn.dense                                                      |
+|                      |   Composite: nn.dense, nn.bias_add?                                     |
++----------------------+-------------------------------------------------------------------------+
+| qnn.dense            | uint8:                                                                  |
+|                      |   Composite: qnn.dense, nn.bias_add?, qnn.requantize                    |
++----------------------+-------------------------------------------------------------------------+
+| nn.max_pool2d        | fp32, uint8                                                             |
++----------------------+-------------------------------------------------------------------------+
+| nn.global_max_pool2d | fp32, uint8                                                             |
++----------------------+-------------------------------------------------------------------------+
+| nn.avg_pool2d        | fp32:                                                                   |
+|                      |   Simple: nn.avg_pool2d                                                 |
+|                      |                                                                         |
+|                      | uint8:                                                                  |
+|                      |   Composite: cast(int32), nn.avg_pool2d, cast(uint8)                    |
++----------------------+-------------------------------------------------------------------------+
+| nn.global_avg_pool2d | fp32:                                                                   |
+|                      |   Simple: nn.global_avg_pool2d                                          |
+|                      |                                                                         |
+|                      | uint8:                                                                  |
+|                      |   Composite: cast(int32), nn.global_avg_pool2d, cast(uint8)             |
++----------------------+-------------------------------------------------------------------------+
+| power(of 2) +        | A special case for L2 pooling.                                          |
+| nn.avg_pool2d +      |                                                                         |
+| sqrt                 | fp32:                                                                   |
+|                      |   Composite: power(of 2), nn.avg_pool2d, sqrt                           |
++----------------------+-------------------------------------------------------------------------+
+| reshape              | fp32, uint8                                                             |
++----------------------+-------------------------------------------------------------------------+
 
 .. note::
     A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this
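[Editor's note: end to end, offloading one of the operators documented above only requires partitioning the module before compilation. A minimal sketch using the partitioning helper that lives in the module this patch extends; the shape and pool parameters are arbitrary examples, not requirements:]

```python
import tvm
from tvm import relay
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

# fp32 NHWC average pooling, one of the newly supported configurations.
data = relay.var("data", shape=(1, 14, 14, 512), dtype="float32")
out = relay.nn.avg_pool2d(data, pool_size=(2, 2), strides=(2, 2), layout="NHWC")
mod = tvm.IRModule.from_expr(out)

# Supported operators are annotated and partitioned out for ACL;
# anything unsupported stays on the default TVM target.
mod = partition_for_arm_compute_lib(mod)
print(mod)
```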
| +| | | +| | (only groups = 1 supported) | ++----------------------+-------------------------------------------------------------------------+ +| qnn.conv2d | uint8: | +| | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | +| | | +| | (only groups = 1 supported) | ++----------------------+-------------------------------------------------------------------------+ +| nn.dense | fp32: | +| | Simple: nn.dense | +| | Composite: nn.dense, nn.bias_add? | ++----------------------+-------------------------------------------------------------------------+ +| qnn.dense | uint8: | +| | Composite: qnn.dense, nn.bias_add?, qnn.requantize | ++----------------------+-------------------------------------------------------------------------+ +| nn.max_pool2d | fp32, uint8 | ++----------------------+-------------------------------------------------------------------------+ +| nn.global_max_pool2d | fp32, uint8 | ++----------------------+-------------------------------------------------------------------------+ +| nn.avg_pool2d | fp32: | +| | Simple: nn.avg_pool2d | +| | | +| | uint8: | +| | Composite: cast(int32), nn.avg_pool2d, cast(uint8) | ++----------------------+-------------------------------------------------------------------------+ +| nn.global_avg_pool2d | fp32: | +| | Simple: nn.global_avg_pool2d | +| | | +| | uint8: | +| | Composite: cast(int32), nn.avg_pool2d, cast(uint8) | ++----------------------+-------------------------------------------------------------------------+ +| power(of 2) + | A special case for L2 pooling. | +| nn.avg_pool2d + | | +| sqrt | fp32: | +| | Composite: power(of 2), nn.avg_pool2d, sqrt | ++----------------------+-------------------------------------------------------------------------+ +| reshape | fp32, uint8 | ++----------------------+-------------------------------------------------------------------------+ .. note:: A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index e20f2d191d03..d78c0bf9a960 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -17,10 +17,11 @@ # pylint: disable=invalid-name, unused-argument """Arm Compute Library supported operators.""" import tvm +from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from ...dataflow_pattern import wildcard, is_op, is_constant +from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr from .register import register_pattern_table @@ -125,6 +126,45 @@ def qnn_dense_pattern(): pattern, wildcard(), wildcard(), is_constant(), is_constant()) return pattern + def avg_pool2d_pattern(): + """Create a uint8 avg_pool2d pattern. + + Returns + ------- + pattern : dataflow_pattern.AltPattern + Denotes the convolution pattern. + """ + pattern = is_op('cast')(wildcard()) + pattern = is_op('nn.avg_pool2d')(pattern) + pattern = is_op('cast')(pattern) + return pattern + + def global_avg_pool2d_pattern(): + """Create a uint8 global_avg_pool2d pattern. + + Returns + ------- + pattern : dataflow_pattern.AltPattern + Denotes the convolution pattern. + """ + pattern = is_op('cast')(wildcard()) + pattern = is_op('nn.global_avg_pool2d')(pattern) + pattern = is_op('cast')(pattern) + return pattern + + def l2_pool2d_pattern(): + """Create an l2 pooling pattern from equivalent relay operators. 
     def check_conv(extract):
         """Check conv pattern is supported by ACL."""
         call = extract
@@ -157,10 +197,38 @@ def check_qnn_dense(extract):
             call = call.args[0]
         return qnn_dense(call.attrs, call.args)
 
+    def check_avg_pool2d(extract):
+        """Check average pool2d pattern is supported by ACL."""
+        if extract.attrs.dtype != "uint8":
+            return False
+        pool = extract.args[0]
+        if pool.args[0].attrs.dtype != "int32":
+            return False
+        return avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True)
+
+    def check_global_avg_pool2d(extract):
+        """Check global average pool2d pattern is supported by ACL."""
+        if extract.attrs.dtype != "uint8":
+            return False
+        pool = extract.args[0]
+        if pool.args[0].attrs.dtype != "int32":
+            return False
+        return global_avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True)
+
+    def check_l2_pool2d(extract):
+        """Check l2 pool2d pattern is supported by ACL."""
+        pool = extract.args[0]
+        return avg_pool2d(pool.attrs, pool.args)
+
     return [('arm_compute_lib.conv2d', conv_pattern(), check_conv),
             ('arm_compute_lib.qnn_conv2d', qnn_conv_pattern(), check_qnn_conv),
             ('arm_compute_lib.dense', dense_pattern(), check_dense),
-            ('arm_compute_lib.qnn_dense', qnn_dense_pattern(), check_qnn_dense)]
+            ('arm_compute_lib.qnn_dense', qnn_dense_pattern(), check_qnn_dense),
+            ('arm_compute_lib.avg_pool2d', avg_pool2d_pattern(), check_avg_pool2d),
+            ('arm_compute_lib.global_avg_pool2d', global_avg_pool2d_pattern(),
+             check_global_avg_pool2d),
+            ('arm_compute_lib.l2_pool2d', l2_pool2d_pattern(), check_l2_pool2d)]
 
 
 def _register_external_op_helper(op_name, supported=True):
@@ -245,3 +313,44 @@ def max_pool2d(attrs, args):
     if typ.dtype not in ["float32", "uint8"]:
         return False
     return True
+
+
+@tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib")
+def avg_pool2d(attrs, args, from_quantized_composite=False):
+    """Check if the external ACL codegen for avg_pool2d should be used."""
+    typ = args[0].checked_type
+    if from_quantized_composite:
+        if typ.dtype != "int32":
+            return False
+    else:
+        if typ.dtype not in ["float32"]:
+            return False
+    if attrs.layout != "NHWC":
+        return False
+    return True
+
+
+@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib")
+def global_max_pool2d(attrs, args):
+    """Check if the external ACL codegen for global_max_pool2d should be used."""
+    typ = args[0].checked_type
+    if typ.dtype not in ["float32", "uint8"]:
+        return False
+    if attrs.layout != "NHWC":
+        return False
+    return True
+
+
+@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
+def global_avg_pool2d(attrs, args, from_quantized_composite=False):
+    """Check if the external ACL codegen for global_avg_pool2d should be used."""
+    typ = args[0].checked_type
+    if from_quantized_composite:
+        if typ.dtype != "int32":
+            return False
+    else:
+        if typ.dtype not in ["float32"]:
+            return False
+    if attrs.layout != "NHWC":
+        return False
+    return True
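[Editor's note: to make the registration above concrete, here is how the quantized average-pool pattern lines up against a graph fragment. A small self-contained check; shapes and pool parameters are arbitrary:]

```python
from tvm import relay
from tvm.relay.dataflow_pattern import is_op, wildcard

# Same shape as the registered composite: cast(int32) -> nn.avg_pool2d -> cast(uint8).
pattern = is_op('cast')(wildcard())
pattern = is_op('nn.avg_pool2d')(pattern)
pattern = is_op('cast')(pattern)

# A graph fragment the pattern should match.
x = relay.var("x", shape=(1, 8, 8, 16), dtype="uint8")
expr = relay.cast(x, "int32")
expr = relay.nn.avg_pool2d(expr, pool_size=(2, 2), strides=(2, 2), layout="NHWC")
expr = relay.cast(expr, "uint8")

assert pattern.match(expr)  # the composite function is built from this match
```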
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
index 1132b1c56cbc..1184362c96c4 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -94,6 +94,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       json_node = CreateCompositeConvJSONNode(cn);
     } else if (name == "arm_compute_lib.dense" || name == "arm_compute_lib.qnn_dense") {
       json_node = CreateCompositeDenseJSONNode(cn);
+    } else if (name == "arm_compute_lib.avg_pool2d" ||
+               name == "arm_compute_lib.global_avg_pool2d") {
+      json_node = CreateCompositeAvgPool2DJSONNode(cn);
+    } else if (name == "arm_compute_lib.l2_pool2d") {
+      json_node = CreateCompositeL2Pool2DJSONNode(cn);
     } else {
       LOG(FATAL) << "Unrecognized Arm Compute Library pattern: " << name;
     }
@@ -267,6 +272,62 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
     SetCallNodeAttribute(json_node, nodes.dense);
     return json_node;
   }
+
+  /*!
+   * \brief Create a JSON representation of a composite (global) average pooling operator.
+   *
+   * A composite function is only created when using the uint8 datatype for these operators.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateCompositeAvgPool2DJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    CHECK(fn);
+    const auto* cast = fn->body.as<CallNode>();
+    CHECK(cast);
+    const auto* avg_pool = cast->args[0].as<CallNode>();
+    CHECK(avg_pool);
+    const auto* avg_pool_op = avg_pool->op.as<OpNode>();
+    CHECK(avg_pool_op);
+    const std::string name = avg_pool_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, avg_pool);
+    return json_node;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a composite L2 pooling operator.
+   *
+   * \note Relay does not have an operator for L2 pooling, instead we can create
+   * an equivalent from power(2) + nn.avg_pool2d + sqrt.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateCompositeL2Pool2DJSONNode(const CallNode* cn) {
+    const std::string name = "nn.l2_pool2d";
+    const auto* fn = cn->op.as<FunctionNode>();
+    CHECK(fn);
+    const auto* sqrt = fn->body.as<CallNode>();
+    CHECK(sqrt);
+    const auto* avg_pool = sqrt->args[0].as<CallNode>();
+    CHECK(avg_pool);
+    const auto* pow = avg_pool->args[0].as<CallNode>();
+    CHECK(pow);
+    const auto* exponent = pow->args[1].as<ConstantNode>();
+    CHECK(exponent);
+    CHECK_EQ(*static_cast<float*>(exponent->data->data), 2) << "Exponent must be 2 for L2 pooling";
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, avg_pool);
+    return json_node;
+  }
 };
 
 /*!
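[Editor's note: the L2 serializer above walks the composite body from the outside in (sqrt, then avg_pool2d, then power) and checks the exponent constant. The Relay fragment it expects looks like this sketch; shapes are arbitrary:]

```python
from tvm import relay

# sqrt(avg_pool2d(power(x, 2))): a per-window L2 norm built from stock Relay ops.
x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
expr = relay.power(x, relay.const(2.0))
expr = relay.nn.avg_pool2d(expr, pool_size=(2, 2), strides=(2, 2), layout="NHWC")
expr = relay.sqrt(expr)
print(relay.Function([x], expr))
```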
diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
index f62420a3684f..f2d2fca64055 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -132,8 +132,11 @@ class ACLRuntime : public JSONRuntimeBase {
       } else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
         CreateFullyConnectedLayer(&layer_, node, mm);
         num_pools++;
-      } else if ("nn.max_pool2d" == op_name) {
+      } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
+                 "nn.l2_pool2d" == op_name) {
         CreatePoolingLayer(&layer_, node);
+      } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
+        CreateGlobalPoolingLayer(&layer_, node);
       } else if ("reshape" == op_name) {
         CreateReshapeLayer(&layer_, node);
       } else {
@@ -308,7 +311,7 @@ class ACLRuntime : public JSONRuntimeBase {
   /*!
    * \brief Create a pooling layer.
    *
-   * \note Currently only maxpool is supported.
+   * \note Currently max_pool2d, avg_pool2d and L2 pooling are supported.
    *
    * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
    * \param node The JSON representation of the operator.
@@ -316,22 +319,65 @@ class ACLRuntime : public JSONRuntimeBase {
   void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
     std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
     std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
-    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);
+    bool ceil_mode = std::stoi(node.GetAttr<std::vector<std::string>>("ceil_mode")[0]);
+    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides, ceil_mode);
 
     auto attr_pool_size = node.GetAttr<std::vector<std::string>>("pool_size");
     int pool_size_h = std::stoi(attr_pool_size[0]);
     int pool_size_w = std::stoi(attr_pool_size[1]);
 
+    // Only applies to average pool and l2 pool.
+    // The ACL exclude_pad option is the inverse of Relay's count_include_pad option.
+    bool exclude_pad = false;
+    if (node.HasAttr("count_include_pad")) {
+      int count_include_pad =
+          std::stoi(node.GetAttr<std::vector<std::string>>("count_include_pad")[0]);
+      exclude_pad = !count_include_pad;
+    }
+
     arm_compute::PoolingType pool_type;
     if (node.GetOpName() == "nn.max_pool2d") {
       pool_type = arm_compute::PoolingType::MAX;
+    } else if (node.GetOpName() == "nn.avg_pool2d") {
+      pool_type = arm_compute::PoolingType::AVG;
+    } else if (node.GetOpName() == "nn.l2_pool2d") {
+      pool_type = arm_compute::PoolingType::L2;
     } else {
       LOG(FATAL) << "Pooling type not supported";
     }
 
     arm_compute::PoolingLayerInfo pool_info =
         arm_compute::PoolingLayerInfo(pool_type, arm_compute::Size2D(pool_size_h, pool_size_w),
-                                      arm_compute::DataLayout::NHWC, pad_stride_info);
+                                      arm_compute::DataLayout::NHWC, pad_stride_info, exclude_pad);
+
+    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
+    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
+
+    auto function = std::make_shared<arm_compute::NEPoolingLayer>();
+    function->configure(&layer->inputs[0], &layer->outputs[0], pool_info);
+    layer->function = function;
+  }
+
+  /*!
+   * \brief Create a global pooling layer.
+   *
+   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
+   *
+   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
+   * \param node The JSON representation of the operator.
+   */
+  void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
+    arm_compute::PoolingType pool_type;
+    if (node.GetOpName() == "nn.global_max_pool2d") {
+      pool_type = arm_compute::PoolingType::MAX;
+    } else if (node.GetOpName() == "nn.global_avg_pool2d") {
+      pool_type = arm_compute::PoolingType::AVG;
+    } else {
+      LOG(FATAL) << "Pooling type not supported";
+    }
+
+    arm_compute::PoolingLayerInfo pool_info =
+        arm_compute::PoolingLayerInfo(pool_type, arm_compute::DataLayout::NHWC);
 
     layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
     layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
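[Editor's note: the exclude_pad derivation above simply inverts Relay's count_include_pad. In plain Python terms, for a window covering one real element and three padding zeros (values chosen only for illustration):]

```python
# One real element (5.0) plus three zero-padding cells covered by a 2x2 window.
window_values = [5.0]   # real elements inside the window
pad_cells = 3           # padding cells inside the window

include_pad = sum(window_values) / (len(window_values) + pad_cells)  # 1.25
exclude_pad = sum(window_values) / len(window_values)                # 5.0
assert (include_pad, exclude_pad) == (1.25, 5.0)
```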
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 98c9cda9fae7..59c941df5195 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -81,9 +81,11 @@ std::shared_ptr<arm_compute::MemoryManagerOnDemand> MakeACLMemoryManager() {
 }
 
 arm_compute::PadStrideInfo MakeACLPadStride(const std::vector<std::string>& pad,
-                                            const std::vector<std::string>& stride) {
+                                            const std::vector<std::string>& stride,
+                                            bool ceil_mode) {
   int pad_0 = 0, pad_1 = 0, pad_2 = 0, pad_3 = 0;
   int stride_0 = std::stoi(stride[0]), stride_1 = std::stoi(stride[1]);
+  auto dimensions_rounding = arm_compute::DimensionRoundingType::FLOOR;
   size_t size = pad.size();
   if (size == 1) {
     int pad_v = std::stoi(pad[0]);
@@ -109,8 +111,12 @@ arm_compute::PadStrideInfo MakeACLPadStride(const std::vector<std::string>& pad,
     LOG(FATAL) << "Unsupported padding dimensions";
   }
 
+  if (ceil_mode) {
+    dimensions_rounding = arm_compute::DimensionRoundingType::CEIL;
+  }
+
   return arm_compute::PadStrideInfo(stride_0, stride_1, pad_0, pad_1, pad_2, pad_3,
-                                    arm_compute::DimensionRoundingType::FLOOR);
+                                    dimensions_rounding);
 }
 
 arm_compute::DataType MakeACLDataType(const DLDataType& data_type) {
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h
index 80c6f0bcd958..576ed916ff60 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.h
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h
@@ -93,10 +93,12 @@ std::shared_ptr<arm_compute::MemoryManagerOnDemand> MakeACLMemoryManager();
  *
  * \param pad The pad vector.
  * \param stride The stride vector.
+ * \param ceil_mode Use CEIL dimension rounding when computing the output shape.
 * \return arm_compute::PadStrideInfo
  */
 arm_compute::PadStrideInfo MakeACLPadStride(const std::vector<std::string>& pad,
-                                            const std::vector<std::string>& stride);
+                                            const std::vector<std::string>& stride,
+                                            bool ceil_mode = false);
 
 /*!
  * \brief Convert DLDataType to arm_compute::DataType.
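[Editor's note: ceil_mode only changes how the output extent is rounded. A small model of that choice, using the usual pooling shape formula; this is a sketch, not the runtime's code:]

```python
import math

def pooled_extent(in_size, pool, pad_before, pad_after, stride, ceil_mode=False):
    """Output extent of one spatial dimension under FLOOR or CEIL rounding."""
    rounding = math.ceil if ceil_mode else math.floor
    return int(rounding((in_size + pad_before + pad_after - pool) / stride)) + 1

# A 15-wide input with a 2-wide pool and stride 2: FLOOR drops the final
# partial window, CEIL keeps it.
assert pooled_extent(15, 2, 0, 0, 2) == 7
assert pooled_extent(15, 2, 0, 0, 2, ceil_mode=True) == 8
```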
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index 4e930e2276ee..c292dc3b885e 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -181,9 +181,20 @@ def build_module(mod, target, params=None, enable_acl=True, tvm_ops=0, acl_parti
 
 def build_and_run(mod, inputs, outputs, params, device, enable_acl=True, no_runs=1,
-                  tvm_ops=0, acl_partitions=1):
+                  tvm_ops=0, acl_partitions=1, config=None):
     """Build and run the relay module."""
-    lib = build_module(mod, device.target, params, enable_acl, tvm_ops, acl_partitions)
+    if not config:
+        config = {}
+
+    try:
+        lib = build_module(mod, device.target, params, enable_acl, tvm_ops, acl_partitions)
+    except Exception as e:
+        err_msg = "The module could not be built.\n"
+        if config:
+            err_msg += f"The test failed with the following parameters: {config}\n"
+        err_msg += str(e)
+        raise Exception(err_msg)
+
     lib = update_lib(lib, device.device, device.cross_compile)
     gen_module = graph_runtime.GraphModule(lib['default'](device.device.cpu(0)))
     gen_module.set_input(**inputs)
@@ -208,10 +219,10 @@ def update_lib(lib, device, cross_compile):
     return lib
 
 
-def verify(answers, atol, rtol, verify_saturation=False, params=None):
+def verify(answers, atol, rtol, verify_saturation=False, config=None):
     """Compare the array of answers. Each entry is a list of outputs."""
-    if params is None:
-        params = {}
+    if config is None:
+        config = {}
 
     if len(answers) < 2:
         raise RuntimeError(
@@ -228,8 +239,8 @@ def verify(answers, atol, rtol, verify_saturation=False, config=None):
                 outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol)
         except AssertionError as e:
             err_msg = "Results not within the acceptable tolerance.\n"
-            if params:
-                err_msg += f"The test failed with the following parameters: {params}\n"
+            if config:
+                err_msg += f"The test failed with the following parameters: {config}\n"
             err_msg += str(e)
             raise AssertionError(err_msg)
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
index 555cbe193408..37575cccf9eb 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -276,7 +276,7 @@ def test_conv2d():
                                          params, device,
                                          enable_acl=acl)[0])
 
-        params = {
+        config = {
             "shape": shape,
             "groups": groups,
             "kernel size": (kernel_h, kernel_w),
@@ -286,7 +286,7 @@ def test_conv2d():
             "out channels": out_channels,
             "composite operators (pad, bias, activation)": composite
         }
-        verify(outputs, atol=0.002, rtol=0.01, params=params)
+        verify(outputs, atol=0.002, rtol=0.01, config=config)
 
 
 def test_codegen_conv2d():
@@ -380,7 +380,7 @@ def test_qnn_conv2d():
                                          params, device,
                                          enable_acl=acl)[0])
 
-        params = {
+        config = {
             "shape": shape,
             "groups": groups,
             "kernel size": (kernel_h, kernel_w),
@@ -396,15 +396,13 @@ def test_qnn_conv2d():
             "output scale": output_sc,
             "output zero point": output_zp
         }
-        verify(outputs, atol=1, rtol=0, params=params, verify_saturation=True)
+        verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True)
 
 
 def test_codegen_qnn_conv2d():
     if skip_codegen_test():
         return
 
-    np.random.seed(0)
-
     kernel_hs = [1, 2, 3, 5]
     kernel_ws = [1, 2, 3, 5]
     pad = [(1, 1), (2, 2), (2, 1)]
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 18cac3380315..e1bb83b52079 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -116,7 +116,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(*get_model(), device=device,
-                           tvm_ops=74, acl_partitions=17,
+                           tvm_ops=73, acl_partitions=18,
                            atol=0.002, rtol=0.01)
 
 
@@ -144,7 +144,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(*get_model(), device=device,
-                           tvm_ops=45, acl_partitions=16,
+                           tvm_ops=42, acl_partitions=17,
                            atol=8, rtol=0)
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index 32176afd1346..aca4a3336dac 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -22,30 +22,74 @@
 from tvm import relay
 
 from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \
-    verify, verify_codegen
+    verify, verify_codegen, generate_trials
 from .infrastructure import Device
 
 
-def _get_model(shape, dtype, typef, sizes, strides, padding,
-               ceil_mode, var_names):
+def _calculate_output_shape(shape, sizes, padding, strides):
+    """Calculate pooling output shape."""
+    output_height = ((shape[1] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1
+    output_width = ((shape[2] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1
+    return 1, int(output_height), int(output_width), shape[3]
+
+
+def _get_pooling_model(shape, dtype, typef, sizes, strides, padding,
+                       ceil_mode, count_include_pad, var_names):
     """Return a model and any parameters it may have."""
-    var = relay.var(next(var_names), shape=shape, dtype=dtype)
-    pool = typef(var, pool_size=sizes, strides=strides, padding=padding,
-                 ceil_mode=ceil_mode, layout="NHWC")
-    return pool
+    if len(padding) == 2:
+        padding = (padding[0], padding[1], padding[0], padding[1])
+    out = relay.var(next(var_names), shape=shape, dtype=dtype)
+
+    if typef == "nn.max_pool2d":
+        out = relay.nn.max_pool2d(out, pool_size=sizes, strides=strides, padding=padding,
+                                  ceil_mode=ceil_mode, layout="NHWC")
+    elif typef == "nn.avg_pool2d":
+        if dtype == "uint8":
+            out = relay.cast(out, 'int32')
+        out = relay.nn.avg_pool2d(out, pool_size=sizes, strides=strides, padding=padding,
+                                  ceil_mode=ceil_mode, count_include_pad=count_include_pad,
+                                  layout="NHWC")
+        if dtype == "uint8":
+            out = relay.cast(out, 'uint8')
+    elif typef == "nn.l2_pool2d":
+        out = relay.power(out, relay.const(2.0))
+        out = relay.nn.avg_pool2d(out, pool_size=sizes, strides=strides, padding=padding,
+                                  ceil_mode=ceil_mode, count_include_pad=count_include_pad,
+                                  layout="NHWC")
+        out = relay.sqrt(out)
+    else:
+        raise ValueError("Function not supported")
+
+    return out
 
 
-def _get_expected_codegen(shape, dtype, typef, sizes, strides,
-                          padding, ceil_mode):
+def _get_global_pooling_model(shape, dtype, typef, var_names):
+    """Return a model and any parameters it may have."""
+    out = relay.var(next(var_names), shape=shape, dtype=dtype)
+
+    if typef == "nn.global_max_pool2d":
+        out = relay.nn.global_max_pool2d(out, layout="NHWC")
+    elif typef == "nn.global_avg_pool2d":
+        if dtype == "uint8":
+            out = relay.cast(out, 'int32')
+        out = relay.nn.global_avg_pool2d(out, layout="NHWC")
+        if dtype == "uint8":
+            out = relay.cast(out, 'uint8')
+    else:
+        raise ValueError("Function not supported")
+
+    return out
+
+
+def _get_expected_pooling_codegen(shape, dtype, typef, sizes, strides,
+                                  padding, ceil_mode, count_include_pad):
     if len(padding) == 2:
-        padding = (padding[1], padding[1], padding[0], padding[0])
-    output_height = ((shape[1] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1
-    output_width = ((shape[2] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1
-    output_shape = (1, int(output_height), int(output_width), shape[3])
+        padding = (padding[0], padding[1], padding[0], padding[1])
+    output_shape = _calculate_output_shape(shape, sizes, padding, strides)
 
     node = {
         "op": "kernel",
-        "name": "nn.max_pool2d",
+        "name": typef,
         "inputs": [[0, 0, 0]],
         "attrs": {
             "num_inputs": "1",
@@ -60,6 +104,30 @@ def _get_expected_codegen(shape, dtype, typef, sizes, strides,
         },
     }
 
+    if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d":
+        node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]]
+
+    input = {
+        "op": "input",
+        "name": "",
+        "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    return [input, node]
+
+
+def _get_expected_global_pooling_codegen(shape, dtype, typef):
+    node = {
+        "op": "kernel",
+        "name": typef,
+        "inputs": [[0, 0, 0]],
+        "attrs": {
+            "num_inputs": "1",
+            "num_outputs": "1",
+            "layout": [["NHWC"]],
+            "shape": [[[1, 1, 1, shape[3]]]],
+            "dtype": [[dtype]]
+        }
+    }
+
     input = {
         "op": "input",
         "name": "",
@@ -76,53 +144,133 @@ def _get_expected_codegen(shape, dtype, typef, sizes, strides,
 
     device = Device()
     np.random.seed(0)
 
-    for dtype, low, high, atol, rtol in [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 0, 0)]:
-        for size in [(2, 2), (3, 3)]:
-            for stride in [(2, 2)]:
-                shape = (1, size[0] + stride[0] * 5,
-                         size[1] + stride[1] * 5, 16)
-                pad = (0, 0)
-
-                inputs = {
-                    "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
-                }
-
-                outputs = []
-                func = _get_model(shape, dtype, relay.nn.max_pool2d, size,
-                                  stride, pad, True, iter(inputs))
-                for acl in [False, True]:
-                    outputs.append(build_and_run(func, inputs, 1, None, device,
-                                                 enable_acl=acl)[0])
-
-                params = {
-                    "size": size,
-                    "stride": stride,
-                    "shape": shape,
-                    "pooling type": "max",
-                    "dtype": dtype,
-                    "padding": pad
-                }
-                verify(outputs, atol=atol, rtol=rtol, params=params, verify_saturation=True)
+    typef = ["nn.max_pool2d", "nn.avg_pool2d", "nn.l2_pool2d"]
+    dtype = [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 1, 0)]
+    size = [(2, 2), (3, 3)]
+    stride = [(2, 2)]
+    pad = [(0, 0), (1, 1), (0, 1)]
+    ceil_mode = [False, True]
+    count_include_pad = [False, True]
+    input_shapes = [(8, 8, 16), (9, 9, 16)]
+    trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3)
+
+    for typef, (dtype, low, high, atol, rtol), size, stride, pad, ceil_mode, count_include_pad, \
+            input_shape in trials:
+
+        # L2 pooling not currently supported for uint8
+        if typef == "nn.l2_pool2d" and dtype == "uint8":
+            continue
+
+        shape = (1, *input_shape)
+        outputs = []
+        inputs = {
+            "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)),
+        }
+
+        func = _get_pooling_model(shape, dtype, typef, size,
+                                  stride, pad, ceil_mode, count_include_pad, iter(inputs))
+
+        config = {
+            "size": size,
+            "stride": stride,
+            "shape": shape,
+            "pooling type": typef,
+            "dtype": dtype,
+            "padding": pad,
+            "ceil_mode": ceil_mode,
+            "count_include_pad": count_include_pad
+        }
+
+        for acl in [False, True]:
+            outputs.append(build_and_run(func, inputs, 1, None, device,
+                                         enable_acl=acl, config=config)[0])
+
+        verify(outputs, atol=atol, rtol=rtol, config=config)
+
+
+def test_global_pooling():
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    np.random.seed(0)
+
+    typef = ["nn.global_max_pool2d", "nn.global_avg_pool2d"]
+    dtype = [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 1, 0)]
+    input_shapes = [(8, 8, 16), (9, 9, 16)]
+    trials = generate_trials([typef, dtype, input_shapes], 3)
[("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 1, 0)] + input_shapes = [(8, 8, 16), (9, 9, 16)] + trials = generate_trials([typef, dtype, input_shapes], 3) + + for typef, (dtype, low, high, atol, rtol), input_shape in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + } + + func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) + + config = { + "shape": shape, + "pooling type": typef, + "dtype": dtype, + } + + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, + enable_acl=acl, config=config)[0]) + verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=True) def test_codegen_pooling(): if skip_codegen_test(): return - inputs = {"a"} + typef = ["nn.max_pool2d", "nn.avg_pool2d", "nn.l2_pool2d"] + dtype = [("float32", -127, 128), ("uint8", 0, 255)] + size = [(2, 2), (3, 3)] + stride = [(2, 2)] + pad = [(0, 0), (1, 1), (2, 1)] + ceil_mode = [False, True] + count_include_pad = [False, True] + input_shapes = [(8, 8, 16), (9, 9, 16)] + trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3) + + for typef, (dtype, low, high), size, stride, pad, ceil_mode, count_include_pad, \ + input_shape in trials: + + # L2 pooling not currently supported for uint8 + if typef == "nn.l2_pool2d" and dtype == "uint8": + continue + + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef, size, + stride, pad, False, False) + func = _get_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +def test_codegen_global_pooling(): + if skip_codegen_test(): + return + + typef = ["nn.global_max_pool2d", "nn.global_avg_pool2d"] + dtype = [("float32", -127, 128), ("uint8", 0, 255)] + input_shapes = [(8, 8, 16), (9, 9, 16)] + trials = generate_trials([typef, dtype, input_shapes], 3) - for dtype in ["float32", "uint8"]: - for size in [(2, 2), (3, 3)]: - for stride in [(2, 2)]: - shape = (1, size[0] + stride[0] * 5, - size[1] + stride[1] * 5, 16) - args = (shape, dtype, relay.nn.max_pool2d, size, - stride, (0, 0), True) - func = _get_model(*args, iter(inputs)) - exp_codegen = _get_expected_codegen(*args) - verify_codegen(func, exp_codegen, 1) + for typef, (dtype, low, high), input_shape in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef) + func = _get_global_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_global_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) if __name__ == "__main__": test_pooling() + test_global_pooling() test_codegen_pooling() + test_codegen_global_pooling() diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index 38694e8ccaaa..b6a87542062a 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -78,12 +78,12 @@ def test_reshape(): outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) - params = { + config = { "new shape": inputs["a"].shape, "shape": new_shape, "dtype": dtype, } - verify(outputs, atol=1e-7, rtol=1e-7, params=params) + verify(outputs, atol=1e-7, rtol=1e-7, config=config) def test_codegen_reshape(): From 5f4209d152ed46f383354032ab28dbbaed80a74d Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 14 Aug 2020 14:52:26 +0100 Subject: [PATCH 
From 5f4209d152ed46f383354032ab28dbbaed80a74d Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Fri, 14 Aug 2020 14:52:26 +0100
Subject: [PATCH 2/5] Address comments

Change-Id: Ibe8a61b4c42da246ce54701c89ea985b423c8f83
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 39 +++----------------
 .../contrib/arm_compute_lib/codegen.cc        |  3 +-
 2 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index d78c0bf9a960..adeeeb1edebb 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -127,7 +127,8 @@ def qnn_dense_pattern():
         return pattern
 
     def avg_pool2d_pattern():
-        """Create a uint8 avg_pool2d pattern.
+        """Creates a pattern that matches either quantized
+        avg_pool2d or quantized global_avg_pool2d.
 
         Returns
         -------
@@ -135,20 +136,7 @@ def avg_pool2d_pattern():
             Denotes the average pooling pattern.
         """
         pattern = is_op('cast')(wildcard())
-        pattern = is_op('nn.avg_pool2d')(pattern)
-        pattern = is_op('cast')(pattern)
-        return pattern
-
-    def global_avg_pool2d_pattern():
-        """Create a uint8 global_avg_pool2d pattern.
-
-        Returns
-        -------
-        pattern : dataflow_pattern.AltPattern
-            Denotes the global average pooling pattern.
-        """
-        pattern = is_op('cast')(wildcard())
-        pattern = is_op('nn.global_avg_pool2d')(pattern)
+        pattern = is_op('nn.avg_pool2d')(pattern) | is_op('nn.global_avg_pool2d')(pattern)
         pattern = is_op('cast')(pattern)
         return pattern
 
@@ -206,15 +194,6 @@ def check_avg_pool2d(extract):
             return False
         return avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True)
 
-    def check_global_avg_pool2d(extract):
-        """Check global average pool2d pattern is supported by ACL."""
-        if extract.attrs.dtype != "uint8":
-            return False
-        pool = extract.args[0]
-        if pool.args[0].attrs.dtype != "int32":
-            return False
-        return global_avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True)
-
     def check_l2_pool2d(extract):
         """Check l2 pool2d pattern is supported by ACL."""
         pool = extract.args[0]
@@ -226,8 +205,6 @@ def check_l2_pool2d(extract):
             ('arm_compute_lib.qnn_dense', qnn_dense_pattern(), check_qnn_dense),
             ('arm_compute_lib.avg_pool2d', avg_pool2d_pattern(), check_avg_pool2d),
-            ('arm_compute_lib.global_avg_pool2d', global_avg_pool2d_pattern(),
-             check_global_avg_pool2d),
             ('arm_compute_lib.l2_pool2d', l2_pool2d_pattern(), check_l2_pool2d)]
 
 
@@ -342,15 +319,11 @@ def global_max_pool2d(attrs, args):
 
 @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
-def global_avg_pool2d(attrs, args, from_quantized_composite=False):
+def global_avg_pool2d(attrs, args):
     """Check if the external ACL codegen for global_avg_pool2d should be used."""
     typ = args[0].checked_type
-    if from_quantized_composite:
-        if typ.dtype != "int32":
-            return False
-    else:
-        if typ.dtype not in ["float32"]:
-            return False
+    if typ.dtype not in ["float32"]:
+        return False
     if attrs.layout != "NHWC":
         return False
     return True
"arm_compute_lib.global_avg_pool2d") { + } else if (name == "arm_compute_lib.avg_pool2d") { json_node = CreateCompositeAvgPool2DJSONNode(cn); } else if (name == "arm_compute_lib.l2_pool2d") { json_node = CreateCompositeL2Pool2DJSONNode(cn); From 3c710acc62083d518971c2f793b1c063ae3d917c Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 14 Aug 2020 14:53:09 +0100 Subject: [PATCH 3/5] Fix not checking output saturation Change-Id: Ia6f3d9db31cfb8c417d8556d29961210fea418b2 --- .../contrib/test_arm_compute_lib/infrastructure.py | 12 ++++++------ .../contrib/test_arm_compute_lib/test_pooling.py | 9 ++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index c292dc3b885e..cc4818e96625 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -183,7 +183,7 @@ def build_module(mod, target, params=None, enable_acl=True, tvm_ops=0, acl_parti def build_and_run(mod, inputs, outputs, params, device, enable_acl=True, no_runs=1, tvm_ops=0, acl_partitions=1, config=None): """Build and run the relay module.""" - if not config: + if config is None: config = {} try: @@ -229,12 +229,12 @@ def verify(answers, atol, rtol, verify_saturation=False, config=None): f"No results to compare: expected at least two, found {len(answers)}") for answer in zip_longest(*answers): for outs in combinations(answer, 2): - if verify_saturation: - assert np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size, \ - "Output is saturated: {}".format(outs[0]) - assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \ - "Output is saturated: {}".format(outs[0]) try: + if verify_saturation: + assert np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size, \ + "Output is saturated: {}".format(outs[0]) + assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \ + "Output is saturated: {}".format(outs[0]) tvm.testing.assert_allclose( outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol) except AssertionError as e: diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index aca4a3336dac..09c2228ed793 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -151,7 +151,7 @@ def test_pooling(): pad = [(0, 0), (1, 1), (0, 1)] ceil_mode = [False, True] count_include_pad = [False, True] - input_shapes = [(8, 8, 16), (9, 9, 16)] + input_shapes = [(16, 16, 16), (15, 15, 16)] trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3) for typef, (dtype, low, high, atol, rtol), size, stride, pad, ceil_mode, count_include_pad, \ @@ -180,12 +180,13 @@ def test_pooling(): "ceil_mode": ceil_mode, "count_include_pad": count_include_pad } + verify_saturation = True if dtype == "uint8" else False for acl in [False, True]: outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl, config=config)[0]) - verify(outputs, atol=atol, rtol=rtol, config=config) + verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation) def test_global_pooling(): @@ -214,11 +215,13 @@ def test_global_pooling(): "pooling type": typef, "dtype": dtype, } + verify_saturation = True if dtype == "uint8" else False for acl 
From 3c710acc62083d518971c2f793b1c063ae3d917c Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Fri, 14 Aug 2020 14:53:09 +0100
Subject: [PATCH 3/5] Fix not checking output saturation

Change-Id: Ia6f3d9db31cfb8c417d8556d29961210fea418b2
---
 .../contrib/test_arm_compute_lib/infrastructure.py | 12 ++++++------
 .../contrib/test_arm_compute_lib/test_pooling.py   |  9 ++++++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index c292dc3b885e..cc4818e96625 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -183,7 +183,7 @@ def build_module(mod, target, params=None, enable_acl=True, tvm_ops=0, acl_parti
 def build_and_run(mod, inputs, outputs, params, device, enable_acl=True, no_runs=1,
                   tvm_ops=0, acl_partitions=1, config=None):
     """Build and run the relay module."""
-    if not config:
+    if config is None:
         config = {}
 
     try:
@@ -229,12 +229,12 @@ def verify(answers, atol, rtol, verify_saturation=False, config=None):
             f"No results to compare: expected at least two, found {len(answers)}")
     for answer in zip_longest(*answers):
         for outs in combinations(answer, 2):
-            if verify_saturation:
-                assert np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size, \
-                    "Output is saturated: {}".format(outs[0])
-                assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \
-                    "Output is saturated: {}".format(outs[0])
             try:
+                if verify_saturation:
+                    assert np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size, \
+                        "Output is saturated: {}".format(outs[0])
+                    assert np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size, \
+                        "Output is saturated: {}".format(outs[0])
                 tvm.testing.assert_allclose(
                     outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol)
             except AssertionError as e:
diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index aca4a3336dac..09c2228ed793 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -151,7 +151,7 @@ def test_pooling():
     pad = [(0, 0), (1, 1), (0, 1)]
     ceil_mode = [False, True]
     count_include_pad = [False, True]
-    input_shapes = [(8, 8, 16), (9, 9, 16)]
+    input_shapes = [(16, 16, 16), (15, 15, 16)]
     trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3)
 
     for typef, (dtype, low, high, atol, rtol), size, stride, pad, ceil_mode, count_include_pad, \
@@ -180,12 +180,13 @@ def test_pooling():
             "ceil_mode": ceil_mode,
             "count_include_pad": count_include_pad
         }
+        verify_saturation = True if dtype == "uint8" else False
 
         for acl in [False, True]:
             outputs.append(build_and_run(func, inputs, 1, None, device,
                                          enable_acl=acl, config=config)[0])
 
-        verify(outputs, atol=atol, rtol=rtol, config=config)
+        verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)
 
 
 def test_global_pooling():
@@ -214,11 +215,13 @@ def test_global_pooling():
             "pooling type": typef,
             "dtype": dtype,
         }
+        verify_saturation = True if dtype == "uint8" else False
 
         for acl in [False, True]:
             outputs.append(build_and_run(func, inputs, 1, None, device,
                                          enable_acl=acl, config=config)[0])
-        verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=True)
+
+        verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=verify_saturation)

From 3d6f42f5a8431c47901c02b63b887ea9f8c5655b Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Fri, 21 Aug 2020 16:13:53 +0100
Subject: [PATCH 4/5] Use defined set of trials

Change-Id: Ib180e3a0cbb84d6fa00c7e1994f58cb62662db15
---
 .../test_arm_compute_lib/test_pooling.py | 96 ++++++++++++-------
 1 file changed, 59 insertions(+), 37 deletions(-)

diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index 09c2228ed793..934315ceb9ab 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -22,7 +22,7 @@
 from tvm import relay
 
 from .infrastructure import skip_runtime_test, skip_codegen_test, build_and_run, \
-    verify, verify_codegen, generate_trials
+    verify, verify_codegen
 from .infrastructure import Device
 
 
@@ -144,23 +144,25 @@ def test_pooling():
     device = Device()
     np.random.seed(0)
 
-    typef = ["nn.max_pool2d", "nn.avg_pool2d", "nn.l2_pool2d"]
-    dtype = [("float32", -127, 128, 0.001, 0.001), ("uint8", 0, 255, 1, 0)]
-    size = [(2, 2), (3, 3)]
-    stride = [(2, 2)]
-    pad = [(0, 0), (1, 1), (0, 1)]
-    ceil_mode = [False, True]
-    count_include_pad = [False, True]
-    input_shapes = [(16, 16, 16), (15, 15, 16)]
-    trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3)
+    fp32_dtype = ("float32", -127, 128, 0.001, 0.001)
+    uint8_dtype = ("uint8", 0, 255, 1, 0)
+
+    trials = [["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+              ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)],
+              ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)],
+              ["nn.max_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)],
+              ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), True, True, (15, 15, 16)],
+              ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)],
+              ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+              ["nn.avg_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)],
+              ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)],
+              ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)],
+              ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), True, False, (16, 16, 16)],
+              ["nn.l2_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (16, 16, 16)],
+              ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, True, (15, 15, 16)]]
 
     for typef, (dtype, low, high, atol, rtol), size, stride, pad, ceil_mode, count_include_pad, \
             input_shape in trials:
-
-        # L2 pooling not currently supported for uint8
-        if typef == "nn.l2_pool2d" and dtype == "uint8":
-            continue
-
         shape = (1, *input_shape)
         outputs = []
uint8_dtype = ("uint8", 0, 255, 1, 0) + + trials = [["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", fp32_dtype, (9, 9, 16)], + ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", uint8_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", uint8_dtype, (9, 9, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (9, 9, 16)], + ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)]] for typef, (dtype, low, high, atol, rtol), input_shape in trials: shape = (1, *input_shape) @@ -228,23 +239,25 @@ def test_codegen_pooling(): if skip_codegen_test(): return - typef = ["nn.max_pool2d", "nn.avg_pool2d", "nn.l2_pool2d"] - dtype = [("float32", -127, 128), ("uint8", 0, 255)] - size = [(2, 2), (3, 3)] - stride = [(2, 2)] - pad = [(0, 0), (1, 1), (2, 1)] - ceil_mode = [False, True] - count_include_pad = [False, True] - input_shapes = [(8, 8, 16), (9, 9, 16)] - trials = generate_trials([typef, dtype, size, stride, pad, ceil_mode, count_include_pad, input_shapes], 3) + fp32_dtype = ("float32", -127, 128) + uint8_dtype = ("uint8", 0, 255) + + trials = [["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.max_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.max_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)], + ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), True, False, (15, 15, 16)], + ["nn.l2_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (16, 16, 16)], + ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, True, (15, 15, 16)]] for typef, (dtype, low, high), size, stride, pad, ceil_mode, count_include_pad, \ input_shape in trials: - - # L2 pooling not currently supported for uint8 - if typef == "nn.l2_pool2d" and dtype == "uint8": - continue - shape = (1, *input_shape) inputs = {"a"} args = (shape, dtype, typef, size, @@ -258,10 +271,19 @@ def test_codegen_global_pooling(): if skip_codegen_test(): return - typef = ["nn.global_max_pool2d", "nn.global_avg_pool2d"] - dtype = [("float32", -127, 128), ("uint8", 0, 255)] - input_shapes = [(8, 8, 16), (9, 9, 16)] - trials = generate_trials([typef, dtype, input_shapes], 3) + fp32_dtype = ("float32", -127, 128) + uint8_dtype = ("uint8", 0, 255) + + trials = [["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", fp32_dtype, (9, 9, 16)], + ["nn.global_max_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", uint8_dtype, (8, 8, 16)], + ["nn.global_max_pool2d", uint8_dtype, (9, 9, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", fp32_dtype, (9, 9, 16)], + ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)], + ["nn.global_avg_pool2d", uint8_dtype, (8, 8, 16)]] for typef, 
     for typef, (dtype, low, high), input_shape in trials:
         shape = (1, *input_shape)

From a19721c9273c95fa5dcbeeba169f31889f36c33b Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Wed, 26 Aug 2020 09:54:24 +0100
Subject: [PATCH 5/5] Rebase master

Change-Id: I5c932751cd38da06d6f2b397be5d8ab7fdeb169f
---
 tests/python/contrib/test_arm_compute_lib/test_pooling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index 934315ceb9ab..c104a0659b7f 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -192,6 +192,8 @@ def test_pooling():
 
 
 def test_global_pooling():
+    Device.load("test_config.json")
+
     if skip_runtime_test():
         return