From 81da33f81b7feed52aa7d8b04d0faca0ce09a416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Lorenzo=20D=C3=ADaz?= <6094231+javierlorenzod@users.noreply.github.com> Date: Mon, 19 Nov 2018 18:00:55 +0100 Subject: [PATCH 01/30] Update README.md typo (#2132) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e2fc7b8c45d28..828b0f7e880ba 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,6 @@ Acknowledgement --------------- We learnt a lot from the following projects when building TVM. - [Halide](https://github.com/halide/Halide): TVM uses [HalideIR](https://github.com/dmlc/HalideIR) as data structure for - arithematic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide. + arithmetic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide. - [Loopy](https://github.com/inducer/loopy): use of integer set analysis and its loop transformation primitives. - [Theano](https://github.com/Theano/Theano): the design inspiration of symbolic scan operator for recurrence. From b71edd76bdb73cc188e4e985e987b8e083b2bcf2 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 19 Nov 2018 14:23:37 -0500 Subject: [PATCH 02/30] Relay Op sprint (part 2) - Level 1 - log_softmax (#2128) --- python/tvm/relay/op/nn/_nn.py | 10 +++++++++- src/relay/op/nn/nn.cc | 13 ++++++++++++- tests/python/relay/test_op_level1.py | 15 +++++++++++---- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 8d53e27892bc8..e30cf8ba2ccf6 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -9,7 +9,6 @@ reg.register_schedule("nn.relu", schedule_injective) reg.register_pattern("nn.relu", OpPattern.ELEMWISE) - @reg.register_schedule("nn.softmax") def schedule_softmax(_, outputs, target): """Schedule definition of softmax""" @@ -19,6 +18,15 @@ def schedule_softmax(_, outputs, target): reg.register_pattern("nn.softmax", OpPattern.OPAQUE) +@reg.register_schedule("nn.log_softmax") +def schedule_log_softmax(_, outputs, target): + """Schedule definition of log_softmax""" + with target: + return topi.generic.schedule_softmax(outputs) + +reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) + + # dense @reg.register_compute("nn.dense") def compute_dense(attrs, inputs, out_type, target): diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 16b65aeeab7f3..dfa68197819b9 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -291,7 +291,18 @@ RELAY_REGISTER_OP("nn.log_softmax") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) -.add_type_rel("Identity", IdentityRel); +.add_type_rel("Identity", IdentityRel) +.set_attr("FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const auto* param = attrs.as(); + CHECK(param != nullptr); + CHECK(param->axis == -1 || param->axis == static_cast(inputs[0].ndim()) - 1) + << "log_softmax currently only works on last dimension"; + return Array{ topi::nn::log_softmax(inputs[0]) }; +}); + // BatchFlatten diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 53de7aa262797..35844ddd4a3fb 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -137,12 +137,19 @@ def test_softmax(): def test_log_softmax(): - n, d = tvm.var("n"), tvm.var("d") - x = 
relay.var("x", shape=(n, d))
-    y = relay.nn.log_softmax(x, axis=0)
+    shape = (10, 4)
+    x = relay.var("x", shape=shape)
+    y = relay.nn.log_softmax(x, axis=1)
     assert "nn.log_softmax" in y.astext()
     yy = relay.ir_pass.infer_type(y)
-    assert yy.checked_type == relay.TensorType((n, d))
+    assert yy.checked_type == relay.TensorType(shape)
+    func = relay.Function([x], y)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_res = topi.testing.log_softmax_python(x_data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
 
 def test_concatenate():

From 7858a1ea7f17ec78be60135417f1b59c0c7f028d Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 19 Nov 2018 12:26:35 -0800
Subject: [PATCH 03/30] [COMMUNITY] new community guideline (#2077)

---
 docs/contribute/community.rst | 53 +++++++++++------------------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
index 1023cf0ddcccf..3a3e5ec3d0fde 100644
--- a/docs/contribute/community.rst
+++ b/docs/contribute/community.rst
@@ -1,51 +1,30 @@
-TVM Community Structure
+TVM Community Guideline
 =======================

-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. There are several roles in the community:
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md `_ for the current list of contributors.

-- Project Management Committee(PMC) Small group of active committers that moderate the discussion, RFC, manage project releases.
-- Committer Individual who has made substantial contributions to the project and is granted write access to the project and oversees the general direction of the projects.
-- Code Owner Individual who is responsible for a specific area of the codebase.
-- Reviewer Individual who is qualified to review for a specific area of the codebase.
-- Contributor Anyone who contributes to the project.
-This document explains responsibility and criteria for each role.
-See `CONTRIBUTORS.md `_ for the current list of contributors and their roles.

+General Development Process
+---------------------------
+Everyone in the community is welcome to send patches and documents, and to propose new directions for the project. The key guideline here is to enable everyone in the community to get involved and participate in the decisions and development. When major changes are proposed, an RFC should be sent to allow discussion by the community. We encourage public discussion in archivable channels, such as issues, the discuss forum, and the mailing-list, so that everyone in the community can participate and review the process later.

+Code reviews are one of the key ways to ensure the quality of the code. High-quality code reviews prevent long-term technical debt and are crucial to the success of the project. A pull request needs to be reviewed before it gets merged. A committer who has expertise in the corresponding area moderates the pull request and merges the code when it is ready.
The corresponding committer could request multiple reviewers who are familiar with the area of the code. We encourage contributors to request code reviews themselves and help review each other's code -- remember, everyone is volunteering their time to the community; a high-quality code review costs as much effort as the actual code contribution, and you are likely to get your own code reviewed quickly if you do others the same favor.

The community should strive to reach a consensus on technical decisions through discussion. We expect committers and PMCs to moderate technical discussions in a diplomatic way, and to provide suggestions with clear technical reasoning when necessary.


Committers
----------
Committers are individuals who are granted write access to the project. A committer is usually responsible for a certain area or several areas of the code where they oversee the code review process. The area of contribution can take all forms, including code contributions and code reviews, documents, education, and outreach. Committers are essential for a high-quality and healthy project. The community actively looks for new committers from contributors. Here is a list of useful traits that help the community to recognize potential committers:

- Sustained contribution to the project, demonstrated by discussion over RFCs, code reviews, proposals of new features, and other development activities. Being familiar with, and being able to take ownership of, one or several areas of the project.
- Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review. A history of creating clean, maintainable code and including good test cases. Informative code reviews that help other contributors adhere to a good standard.
- Community involvement: active participation in the discussion forum, promoting the project via tutorials, talks, and outreach. We encourage committers to collaborate broadly, e.g. by doing code reviews and discussing designs with community members they do not physically interact with.

The Project Management Committee (PMC) consists of a group of active committers who moderate the discussion, manage project releases, and propose new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. at least 3 +1 votes and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines to make TVM a better community for everyone. PMCs should strive to identify new candidates outside of their own organization.


Reviewers
---------
Reviewers are individuals who have actively contributed to the project and are willing to participate in the code review of new contributions. We identify reviewers from active contributors. The committers should explicitly solicit reviews from reviewers. High-quality code reviews prevent long-term technical debt and are crucial to the success of the project. A pull request to the project has to be reviewed by at least one reviewer in order to be merged.
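Before moving on, here is a minimal, self-contained sketch of how the nn.log_softmax lowering registered in PATCH 02/30 above can be exercised end to end. It simply mirrors the updated test_log_softmax from that patch; the pre-1.0 APIs it relies on (relay.create_executor, tvm.relay.testing.ctx_list, topi.testing.log_softmax_python) are assumed to match the TVM snapshot these patches apply to.

import numpy as np
import tvm
import topi
from tvm import relay
from tvm.relay.testing import ctx_list

# Build a one-op Relay function around nn.log_softmax on the last axis.
shape = (10, 4)
x = relay.var("x", shape=shape)
func = relay.Function([x], relay.nn.log_softmax(x, axis=1))

# Evaluate on the graph executor and compare against the topi reference.
x_data = np.random.uniform(size=shape).astype("float32")
ref_res = topi.testing.log_softmax_python(x_data)
for target, ctx in ctx_list():
    intrp = relay.create_executor("graph", ctx=ctx, target=target)
    op_res = intrp.evaluate(func)(x_data)
    np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)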
From 83b24b5b779f79eabae4c962a3216af7dd8ce1b6 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 19 Nov 2018 15:32:30 -0500 Subject: [PATCH 04/30] [TOPI] Minor fix in the LSTM recipe (#2131) --- topi/recipe/rnn/lstm.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/topi/recipe/rnn/lstm.py b/topi/recipe/rnn/lstm.py index 53ccbe598c3da..f627d6ce8f8ed 100644 --- a/topi/recipe/rnn/lstm.py +++ b/topi/recipe/rnn/lstm.py @@ -1,8 +1,6 @@ """LSTM Example, still work in progress..""" import tvm -import time import os -import argparse from tvm.contrib import nvcc import numpy as np @@ -14,16 +12,19 @@ SKIP_CHECK = False UNROLL_WLOAD = True + @tvm.register_func def tvm_callback_cuda_compile(code): """Use nvcc compiler for better perf.""" ptx = nvcc.compile_cuda(code, target="ptx") return ptx + def write_code(code, fname): with open(fname, "w") as f: f.write(code) + @tvm.register_func def tvm_callback_cuda_postproc(code): if not os.path.exists("perf"): @@ -33,16 +34,16 @@ def tvm_callback_cuda_postproc(code): code = open("perf/%s_manual.cu" % TASK).read() return code + def lstm(): if not PERSIST_KERNEL: raise ValueError("Non persist LSTM not yet supported") - detect_global_barrier = DETECT_GLOBAL_BARRIER num_thread_y = 8 - num_thread_x = 16 * 3 / 2 + num_thread_x = 16 * 3 // 2 num_sm = 24 n_num_step = 128 num_step = tvm.var('num_step') - num_hidden = 1152 / 2 + num_hidden = 1152 // 2 batch_size = 1 # Global transition matrix # Input hidden channel can be pre-caculated by a gemm @@ -165,11 +166,9 @@ def check_device(target): flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) ctx.sync() # measure time cost of second step. - tstart = time.time() - flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) - ctx.sync() - tgap = time.time() - tstart - print("Time cost=%g" % tgap) + evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) + eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) + print("Time cost=%g" % eval_result.mean) # set unroll_explicit for more readable code. with tvm.build_config( From 7761416f00aed73e93bf919036625fc3f7941cbf Mon Sep 17 00:00:00 2001 From: eqy Date: Mon, 19 Nov 2018 12:35:15 -0800 Subject: [PATCH 05/30] [WIP] [RPC] clean up uploaded modules (#2121) [RPC] clean up uploaded modules --- python/tvm/autotvm/measure/measure_methods.py | 6 ++++++ python/tvm/rpc/client.py | 13 +++++++++++++ src/runtime/file_util.cc | 4 ++++ src/runtime/file_util.h | 6 ++++++ src/runtime/rpc/rpc_server_env.cc | 7 +++++++ 5 files changed, 36 insertions(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 802abe0190133..ff93704edb442 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -467,6 +467,12 @@ def run_through_rpc(measure_input, build_result, ctx.sync() costs = time_f(*args).results + + # clean up remote files + remote.remove(build_result.filename) + remote.remove(os.path.splitext(build_result.filename)[0] + '.so') + remote.remove('') + if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) costs.sort() diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index ae44e5a799339..c975ec64aa760 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -103,6 +103,19 @@ def download(self, path): "tvm.rpc.server.download") return self._remote_funcs["download"](path) + def remove(self, path): + """Remove file from remote temp folder. 
+ + Parameters + ---------- + path: str + The relative location to remote temp folder. + """ + if "remove" not in self._remote_funcs: + self._remote_funcs["remove"] = self.get_function( + "tvm.rpc.server.remove") + self._remote_funcs["remove"](path) + def load_module(self, path): """Load a remote module, the file need to be uploaded first. diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc index 4df335a54f25a..ff579d12112d1 100644 --- a/src/runtime/file_util.cc +++ b/src/runtime/file_util.cc @@ -142,5 +142,9 @@ void LoadMetaDataFromFile( fs.close(); } +void RemoveFile(const std::string& file_name) { + std::remove(file_name.c_str()); +} + } // namespace runtime } // namespace tvm diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h index de520fa3158c5..2b797614281be 100644 --- a/src/runtime/file_util.h +++ b/src/runtime/file_util.h @@ -71,6 +71,12 @@ void SaveMetaDataToFile( void LoadMetaDataFromFile( const std::string& file_name, std::unordered_map* fmap); + +/*! + * \brief Remove (unlink) a file. + * \param file_name The file name. + */ +void RemoveFile(const std::string& file_name); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_FILE_UTIL_H_ diff --git a/src/runtime/rpc/rpc_server_env.cc b/src/runtime/rpc/rpc_server_env.cc index ca91b88247e53..fb8d95d60b95f 100644 --- a/src/runtime/rpc/rpc_server_env.cc +++ b/src/runtime/rpc/rpc_server_env.cc @@ -35,5 +35,12 @@ TVM_REGISTER_GLOBAL("tvm.rpc.server.download") *rv = arr; }); +TVM_REGISTER_GLOBAL("tvm.rpc.server.remove") +.set_body([](TVMArgs args, TVMRetValue *rv) { + std::string file_name = RPCGetPath(args[0]); + LOG(INFO) << "Remove " << file_name; + RemoveFile(file_name); + }); + } // namespace runtime } // namespace tvm From 6edb3564a19bc2451cef2b61ae4a9affb8b70352 Mon Sep 17 00:00:00 2001 From: Siju Date: Tue, 20 Nov 2018 02:07:46 +0530 Subject: [PATCH 06/30] [RELAY]sch & comp for ops in nn.py (#2092) --- include/tvm/relay/attrs/nn.h | 2 +- python/tvm/relay/op/nn/_nn.py | 45 ++++++++++++++++++++ src/relay/op/nn/nn.cc | 32 ++++++++++++-- tests/python/relay/test_op_level2.py | 62 ++++++++++++++++++++++++++++ tests/python/relay/test_op_level3.py | 41 ++++++++++++++++++ 5 files changed, 177 insertions(+), 5 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 5077c82412a6e..33f18a89e3e8a 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -327,7 +327,7 @@ struct BatchNormAttrs : public tvm::AttrsNode { /*! 
\brief Attributes for LRN operator */ struct LRNAttrs : public tvm::AttrsNode { - IndexExpr size; + int size; int axis; double bias; double alpha; diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e30cf8ba2ccf6..cd807ad621283 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -17,6 +17,7 @@ def schedule_softmax(_, outputs, target): reg.register_pattern("nn.softmax", OpPattern.OPAQUE) +schedule_broadcast = schedule_injective @reg.register_schedule("nn.log_softmax") def schedule_log_softmax(_, outputs, target): @@ -194,3 +195,47 @@ def schedule_global_avg_pool2d(_, outs, target): return topi.generic.schedule_global_pool(outs) reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) + +# leaky_relu +reg.register_schedule("nn.leaky_relu", schedule_broadcast) +reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) + +# prelu +reg.register_schedule("nn.prelu", schedule_broadcast) +reg.register_pattern("nn.prelu", OpPattern.BROADCAST) + +# flatten +reg.register_schedule("nn.batch_flatten", schedule_broadcast) +reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE) + + +# lrn +@reg.register_compute("nn.lrn") +def compute_lrn(attrs, inputs, out_dtype, target): + """Compute definition of lrn""" + assert len(inputs) == 1 + return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis, + attrs.alpha, attrs.beta, attrs.bias)] + +@reg.register_schedule("nn.lrn") +def schedule_lrn(attrs, outs, target): + """Schedule definition of lrn""" + with target: + return topi.generic.schedule_lrn(outs) + +reg.register_pattern("nn.lrn", OpPattern.OPAQUE) + + +# l2_normalize +@reg.register_compute("nn.l2_normalize") +def compute_l2_normalize(attrs, inputs, out_dtype, target): + """Compute definition of l2 normalize""" + return [topi.nn.l2_normalize(inputs[0], attrs.eps, attrs.axis)] + +@reg.register_schedule("nn.l2_normalize") +def schedule_l2_normalize(attrs, outs, target): + """Schedule definition of l2 normalize""" + with target: + return topi.generic.schedule_l2_normalize(outs) + +reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index dfa68197819b9..d00f05cfc6fe3 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "../type_relations.h" #include "../op_common.h" @@ -169,7 +170,15 @@ RELAY_REGISTER_OP("nn.leaky_relu") .set_num_inputs(1) .add_argument("data", "Tensor", "Input data.") .set_support_level(3) -.add_type_rel("Identity", IdentityRel); +.add_type_rel("Identity", IdentityRel) +.set_attr( + "FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const auto* param = attrs.as(); + return Array{ topi::leaky_relu(inputs[0], param->alpha) }; +}); TVM_REGISTER_NODE_TYPE(PReluAttrs); @@ -225,7 +234,15 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. 
.add_argument("data", "Tensor", "Input data.") .add_argument("alpha", "Tensor", "Input channelwise alpha.") .set_support_level(3) -.add_type_rel("PRelu", PReluRel); +.add_type_rel("PRelu", PReluRel) +.set_attr( + "FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const auto* param = attrs.as(); + return Array{ topi::prelu(inputs[0], inputs[1], param->axis)}; +}); TVM_REGISTER_API("relay.op.nn._make.softmax") @@ -365,7 +382,14 @@ Example:: .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(2) -.add_type_rel("BatchFlatten", BatchFlattenRel); +.add_type_rel("BatchFlatten", BatchFlattenRel) +.set_attr( + "FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + return Array{ topi::nn::flatten(inputs[0]) }; +}); // relu @@ -398,7 +422,7 @@ RELAY_REGISTER_OP("nn.relu") TVM_REGISTER_NODE_TYPE(LRNAttrs); Expr MakeLRN(Expr data, - IndexExpr size, + int size, int axis, double alpha, double beta, diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 7b3a6d3fe15e0..1ae37240788f1 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -295,6 +295,25 @@ def test_flatten_infer_type(): yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32") + shape = (1, 5, 10, 10) + o_shape = (1, 500) + dtype = "float32" + x = relay.var("x", relay.TensorType(shape, dtype)) + z = relay.nn.batch_flatten(x) + yy = relay.ir_pass.infer_type(z) + assert yy.checked_type == relay.TensorType(o_shape, dtype) + func = relay.Function([x], z) + x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) + ref_res = x_data.flatten().reshape(o_shape) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + def test_pad_infer_type(): # entirely concrete case n, c, h, w = 1, 2, 3, 4 @@ -320,6 +339,29 @@ def test_lrn(): yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((n, c , h, w)) + shape = (1, 5, 10, 10) + dtype = "float32" + x = relay.var("x", relay.TensorType(shape, dtype)) + size=5 + axis=1 + bias=0.5 + alpha=.00001 + beta=0.75 + z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta) + yy = relay.ir_pass.infer_type(z) + assert yy.checked_type == relay.TensorType(shape, dtype) + func = relay.Function([x], z) + x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) + ref_res = topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + def test_l2_normalize(): n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", shape=(n, c , h, w)) @@ -328,6 +370,26 @@ def test_l2_normalize(): yy = relay.ir_pass.infer_type(y) assert yy.checked_type 
== relay.TensorType((n, c , h, w)) + shape = (1, 5, 10, 10) + dtype = "float32" + x = relay.var("x", relay.TensorType(shape, dtype)) + eps=0.001 + axis=1 + z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis]) + yy = relay.ir_pass.infer_type(z) + assert yy.checked_type == relay.TensorType(shape, dtype) + func = relay.Function([x], z) + x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) + ref_res = topi.testing.l2_normalize_python(x_data, eps, axis) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + if __name__ == "__main__": test_pool2d() diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 26eccf991d0eb..22469cc7fdbe9 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -4,6 +4,7 @@ import numpy as np from tvm import relay from tvm.relay import create_executor +from tvm.relay.testing import ctx_list from nose.tools import raises def test_zeros_ones(): @@ -214,6 +215,25 @@ def test_infer_type_leaky_relu(): yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, w), "float32") + shape = (1, 5, 10, 10) + dtype = "float32" + x = relay.var("x", relay.TensorType(shape, dtype)) + z = relay.nn.leaky_relu(x, alpha=0.1) + assert "alpha=0.1" in z.astext() + yy = relay.ir_pass.infer_type(z) + assert yy.checked_type == relay.TensorType(shape, dtype) + func = relay.Function([x], z) + x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) + ref_res = np.where(x_data > 0, x_data, x_data * 0.1) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): x = relay.var("data", relay.TensorType(data, dtype)) if alpha: @@ -230,6 +250,27 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): alpha_shape = (data[axis],) assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32") + if all(isinstance(v, tvm.expr.Var) == 1 for v in data) or not alpha: + return + + func = relay.Function([x, y], z) + x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype) + a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype) + + if axis == 1: + ref_res = (x_data < 0) * (x_data * a_data.reshape(3, 1, 1)) + (x_data>=0) * x_data + else: + ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data>=0) * x_data + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data, a_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data, a_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + + def test_infer_type_prelu(): n, c , h, w = tvm.var("n"), tvm.var("c"), 
tvm.var("h"), tvm.var("w") verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w)) From c113712db7d6529c6ab1d43083c18b9d0b675e0d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 19 Nov 2018 14:59:41 -0800 Subject: [PATCH 07/30] [RELAY][BACKEND] Enable PlanMemory in the graph runtime. (#2120) --- include/tvm/relay/expr.h | 2 + python/tvm/relay/backend/_backend.py | 1 + .../relay/backend/graph_runtime_codegen.py | 33 +- python/tvm/relay/base.py | 14 +- src/relay/backend/graph_plan_memory.cc | 349 ++++++++++++++++++ src/relay/ir/text_printer.cc | 39 +- src/relay/pass/fuse_ops.cc | 2 +- .../relay/test_backend_graph_runtime.py | 35 +- 8 files changed, 450 insertions(+), 25 deletions(-) create mode 100644 src/relay/backend/graph_plan_memory.cc diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index c72612791b521..887d28b0fa9f8 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -458,12 +458,14 @@ inline const TTypeNode* ExprNode::type_as() const { /*! * \brief Print node as text format. * \param node The node to be printed. + * \param show_meta_data Whether to print meta data section. * \param annotate An optional callback function for attaching * additional comment block to an expr. * \return The text representation. */ std::string RelayPrint( const NodeRef& node, + bool show_meta_data = true, runtime::TypedPackedFunc annotate = nullptr); } // namespace relay } // namespace tvm diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index b5454031cb4ab..a51cc8072aaca 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -55,6 +55,7 @@ def build(funcs, target, target_host=None): funcs : List[tvm.LoweredFunc] The list of lowered functions. + target : tvm.Target The target to run the code on. diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py index 4bbab957ab1d1..50568b58607b8 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_runtime_codegen.py @@ -21,6 +21,7 @@ from __future__ import absolute_import import json import attr +from . import _backend from . import compile_engine from ..op import Op from ..expr import Function, GlobalVar, ExprFunctor @@ -103,11 +104,12 @@ def __init__(self, mod, target): self.nodes = [] self.var_map = {} self.params = {} + self.storage_map = None self.compile_engine = compile_engine.get() self.lowered_funcs = set() self._name_map = {} - def add_node(self, node, checked_type): + def add_node(self, node, expr): """ Add a node to the graph. @@ -116,14 +118,21 @@ def add_node(self, node, checked_type): node: Node The node to add to the graph. - checked_type: Type - The type of the node. + expr: tvm.relay.Expr + The corresponding expression. Returns ------- node_ref: Union[NodeRef, List[NodeRef]] A reference to the node. 
""" + checked_type = expr.checked_type + # setup storage ids + assert expr in self.storage_map + node.attrs["storage_id"] = [ + x.value for x in self.storage_map[expr] + ] + node_id = len(self.nodes) self.nodes.append(node) # Tuple return value, flatten as tuple @@ -168,7 +177,7 @@ def visit_constant(self, op): name = "p%d" % index self.params[name] = op.data node = InputNode(name, {}) - return self.add_node(node, op.checked_type) + return self.add_node(node, op) def visit_function(self, _): raise RuntimeError("function not supported") @@ -244,7 +253,7 @@ def visit_call(self, call): op_name = cached_func.func_name op_node = OpNode(self._get_unique_name(op_name), {}, op_name, inputs, {}) - return self.add_node(op_node, call.checked_type) + return self.add_node(op_node, call) def _get_json(self): """ @@ -281,8 +290,7 @@ def _get_json(self): assert node.num_outputs == len(node.attrs["shape"]) shapes += node.attrs["shape"] dltypes += node.attrs["dtype"] - for i in range(node.num_outputs): - storage_ids.append(i + num_entry) + storage_ids += node.attrs["storage_id"] num_entry += node.num_outputs node_row_ptr.append(num_entry) @@ -302,6 +310,14 @@ def _get_json(self): return json.dumps(json_dict, indent=2) + def debug_dump_memory_plan(self, func): + """Debug function to dump memory plan.""" + def _annotate(expr): + if expr in self.storage_map: + return str(self.storage_map[expr]) + return "" + return func.astext(show_meta_data=False, annotate=_annotate) + def codegen(self, func): """Compile a single function into a graph. @@ -321,11 +337,12 @@ def codegen(self, func): params : Dict[str, tvm.nd.NDArray] Additional constant parameters. """ + self.storage_map = _backend.GraphPlanMemory(func) # First we convert all the parameters into input nodes. for param in func.params: node = InputNode(param.name_hint, {}) self.var_map[param] = self.add_node( - node, param.type_annotation) + node, param) # Then we compile the body into a graph which can depend # on input variables. diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py index 012315b40f510..0feffeb809c54 100644 --- a/python/tvm/relay/base.py +++ b/python/tvm/relay/base.py @@ -23,7 +23,7 @@ def register_relay_node(type_key=None): class RelayNode(NodeBase): """Base class of all relay node.""" - def astext(self, annotate=None): + def astext(self, show_meta_data=True, annotate=None): """Get the text format of the expression. Returns @@ -31,11 +31,21 @@ def astext(self, annotate=None): text : str The text format of the expression. + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + annotate: Optional[relay.Expr->str] Optional annotate function to provide additional information in the comment block. + + Note + ---- + meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big(constat weights), + so it can be helpful to skip printing the meta data section. """ - return _expr.RelayPrint(self, annotate) + return _expr.RelayPrint(self, show_meta_data, annotate) @register_relay_node diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc new file mode 100644 index 0000000000000..f3c3e2935d224 --- /dev/null +++ b/src/relay/backend/graph_plan_memory.cc @@ -0,0 +1,349 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file relay/backend/graph_mem_alloca.cc + * \brief Memory index assignment pass for executing + * the program in the graph runtime. 
+ */ +#include +#include +#include "../../common/arena.h" + +namespace tvm { +namespace relay { + +struct StorageToken { + /*! \brief Reference counter */ + int ref_counter{0}; + /*! \brief number of bytes */ + size_t max_bytes{0}; + /*! \brief The corresponding tensor type node. */ + const TensorTypeNode* ttype{nullptr}; + /*! \brief virtual device index */ + int device_id{0}; + /*! \brief The storage id */ + int64_t storage_id{-1}; +}; + +class StorageAllocaBaseVisitor : public ExprVisitor { + public: + // run the visitor on a function. + void Run(const Function& func) { + for (Var param : func->params) { + CreateToken(param.operator->(), false); + } + this->VisitExpr(func->body); + } + + void VisitExpr_(const ConstantNode* op) final { + this->CreateToken(op, false); + } + + void VisitExpr_(const VarNode* op) final { + // Do nothing. + } + + void VisitExpr_(const FunctionNode* op) final { + // do not recursive into sub function. + } + + void VisitExpr_(const GlobalVarNode* op) final { + // Do nothing. + } + + void VisitExpr_(const OpNode* op) final { + // Do nothing. + } + + void VisitExpr_(const TupleNode* op) final { + std::vector fields; + for (Expr field : op->fields) { + auto tok = GetToken(field); + CHECK_EQ(tok.size(), 1U); + fields.push_back(tok[0]); + } + token_map_[op] = fields; + } + + void VisitExpr_(const TupleGetItemNode* op) final { + const auto& tok = GetToken(op->tuple); + CHECK_LT(static_cast(op->index), tok.size()); + token_map_[op] = {tok[op->index]}; + } + + void VisitExpr_(const IfNode* op) final { + LOG(FATAL) << "if is not supported."; + } + + void VisitExpr_(const LetNode* op) final { + auto token = GetToken(op->value); + token_map_[op->var.operator->()] = token; + token_map_[op] = GetToken(op->body); + } + + protected: + /*! \brief internal token map */ + std::unordered_map > token_map_; + + /*! + * \brief Get the necessary token. + * \param expr The expression. + * \return The corresponding token. + */ + const std::vector& GetToken(const Expr& expr) { + this->VisitExpr(expr); + auto it = token_map_.find(expr.operator->()); + CHECK(it != token_map_.end()); + return it->second; + } + /*! + * \brief Populate the token map to set op's tokens + * \param op The node to be processed. + * \param can_realloc Whether we can re-allocate the memory. + */ + virtual void CreateToken(const ExprNode* op, bool can_realloc) = 0; +}; + + +class StorageAllocaInit : protected StorageAllocaBaseVisitor { + public: + explicit StorageAllocaInit(common::Arena* arena) + : arena_(arena) {} + + + /*! \return The internal token map */ + std::unordered_map > + GetInitTokenMap(const Function& func) { + this->Run(func); + return std::move(token_map_); + } + + + protected: + using StorageAllocaBaseVisitor::VisitExpr_; + + void CreateToken(const ExprNode* op, bool can_realloc) final { + CHECK(!token_map_.count(op)); + std::vector tokens; + if (const auto* tuple_type = op->checked_type().as()) { + for (Type t : tuple_type->fields) { + const auto* ttype = t.as(); + CHECK(ttype); + StorageToken* token = arena_->make(); + token->ttype = ttype; + tokens.push_back(token); + } + } else { + const auto* ttype = op->checked_type().as(); + CHECK(ttype); + StorageToken* token = arena_->make(); + token->ttype = ttype; + tokens.push_back(token); + } + token_map_[op] = tokens; + } + + void VisitExpr_(const CallNode* op) final { + // create token for the call node. + CreateToken(op, true); + // for each input, visit argument token. 
+ for (Expr arg : op->args) { + for (StorageToken* tok : GetToken(arg)) { + tok->ref_counter += 1; + } + } + } + + private: + // allocator + common::Arena* arena_; +}; + + +class StorageAllocator : public StorageAllocaBaseVisitor { + public: + /*! + * \return totoal number of bytes allocated + */ + size_t TotalAllocBytes() const { + size_t total = 0; + for (const auto* p : data_) { + total += p->max_bytes; + } + return total; + } + + // Run storage allocation for a function. + Map > Plan(const Function& func) { + prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func); + this->Run(func); + + Map > smap; + + for (const auto& kv : token_map_) { + Array vec; + for (StorageToken* tok : kv.second) { + vec.push_back(tok->storage_id); + } + smap.Set(GetRef(kv.first), vec); + } + return smap; + } + + + protected: + using StorageAllocaBaseVisitor::VisitExpr_; + // override create token by getting token as prototype requirements. + void CreateToken(const ExprNode* op, bool can_realloc) final { + CHECK(!token_map_.count(op)); + auto it = prototype_.find(op); + CHECK(it != prototype_.end()); + std::vector tokens; + for (StorageToken* tok : it->second) { + if (can_realloc) { + tokens.push_back(Request(tok)); + } else { + // Allocate a new token, + StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok)); + // ensure it never get de-allocated. + allocated_tok->ref_counter += 1; + tokens.push_back(allocated_tok); + } + } + token_map_[op] = tokens; + } + // The call map + void VisitExpr_(const CallNode* op) final { + std::vector args; + // for each input, visit argument token. + for (Expr arg : op->args) { + for (StorageToken* tok : GetToken(arg)) { + args.push_back(tok); + } + } + // create token for the call node. + CreateToken(op, true); + // check if there is orphaned output that can be released immediately. + for (StorageToken* tok : token_map_.at(op)) { + CheckForRelease(tok); + } + for (StorageToken* tok : args) { + tok->ref_counter -= 1; + CheckForRelease(tok); + } + } + /*! + * \brief ceil(size/word_size) to get number of words. + * \param size The original size. + * \param word_size The element size. + */ + static size_t DivRoundUp(size_t size, size_t word_size) { + return (size + word_size - 1) / word_size; + } + /*! + * \brief Get the memory requirement. + * \param prototype The prototype token. + * \return The required memory size. + */ + size_t GetMemorySize(StorageToken* prototype) { + const TensorTypeNode* ttype = prototype->ttype; + CHECK(ttype != nullptr); + size_t size = 1; + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = as_const_int(dim); + CHECK(pval != nullptr) + << "Cannot allocate memory symbolic tensor shape " + << ttype->shape; + size *= static_cast(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); + return size; + } + /*! + * \brief Request a storage token for a given prototype. + * \param prototype. The prototype storage token. + * \return The result token. 
+ */ + StorageToken* Request(StorageToken* prototype) { + // calculate the size; + size_t size = GetMemorySize(prototype); + // search memory block in [size / match_range_, size * match_range_) + if (match_range_ == 0) { + return this->Alloc(prototype, size); + } + auto begin = free_.lower_bound(size / match_range_); + auto mid = free_.lower_bound(size); + auto end = free_.upper_bound(size * match_range_); + // search for memory blocks larger than requested + for (auto it = mid; it != end; ++it) { + StorageToken *tok = it->second; + if (tok->device_id != prototype->device_id) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // find a exact match, erase from map and return + free_.erase(it); + return tok; + } + // then search for memory blocks smaller than requested space + for (auto it = mid; it != begin;) { + --it; + StorageToken *tok = it->second; + if (tok->device_id != prototype->device_id) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // erase from map and return + free_.erase(it); + return tok; + } + // cannot find anything return a new one. + return this->Alloc(prototype, size); + } + /*! + * \brief Allocate a storage token by consuming prototype + * \param prototype The prototype token. + * \param size The size of memory being requested. + */ + StorageToken* Alloc(StorageToken* prototype, size_t size) { + prototype->max_bytes = size; + prototype->storage_id = static_cast(data_.size()); + data_.push_back(prototype); + return prototype; + } + /*! + * \brief Check if we can release token. + * \tok The token to be released. + */ + void CheckForRelease(StorageToken* tok) { + CHECK_GE(tok->storage_id, 0); + CHECK_GE(tok->ref_counter, 0); + if (tok->ref_counter == 0) { + free_.insert({tok->max_bytes, tok}); + } + } + + private: + // allocator + common::Arena arena_; + // scale used for rough match + size_t match_range_{16}; + // free list of storage entry + std::multimap free_; + // all the storage resources available + std::vector data_; + /*! \brief internal prototype token map */ + std::unordered_map > prototype_; +}; + + +Map > GraphPlanMemory(const Function& func) { + return StorageAllocator().Plan(func); +} + +TVM_REGISTER_GLOBAL("relay.backend.GraphPlanMemory") +.set_body_typed >(const Function&)>(GraphPlanMemory); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc index bfc5f0db52b72..5e97ce1010ade 100644 --- a/src/relay/ir/text_printer.cc +++ b/src/relay/ir/text_printer.cc @@ -113,6 +113,11 @@ class TextMetaDataContext { return SaveJSON(Array(meta_data_)); } + /*! \return whether the meta data context is empty. */ + bool empty() const { + return meta_data_.empty(); + } + private: /*! \brief additional metadata stored in TVM json format */ std::vector meta_data_; @@ -125,8 +130,9 @@ class TextPrinter : public TypeFunctor, // NOLINT(*) public AttrFunctor { // NOLINT(*) public: - explicit TextPrinter(runtime::TypedPackedFunc annotate) - : annotate_(annotate) {} + explicit TextPrinter(bool show_meta_data, + runtime::TypedPackedFunc annotate) + : show_meta_data_(show_meta_data), annotate_(annotate) {} /*! * \brief Print a node to string. * \param node. 
@@ -144,13 +150,17 @@ class TextPrinter : } else { stream_ << node; } - std::string meta_json = meta_.GetMetaSection(); - if (meta_json.length() != 0) { - // append meta data in the end. - stream_ << "# meta data\n" - << "r\"\"\"\n" - << meta_json << "\n" - << "\"\"\""; + if (!meta_.empty()) { + if (show_meta_data_) { + std::string meta_json = meta_.GetMetaSection(); + // append meta data in the end. + stream_ << "# meta data\n" + << "r\"\"\"\n" + << meta_json << "\n" + << "\"\"\""; + } else { + stream_ << "# meta data omitted. you can use show_meta_data=True to include meta-data\n"; + } } return stream_.str(); } @@ -227,7 +237,9 @@ class TextPrinter : TextValue id = this->AllocTempVar(); this->PrintIndent(); stream_ << id << " = " << meta_.GetMetaNode(GetRef(op)); - this->PrintEndInst("\n"); + this->PrintEndInst(""); + this->PrintOptionalInfo(GetRef(op)); + stream_ << '\n'; return id; } @@ -697,6 +709,8 @@ class TextPrinter : private: class AttrPrinter; friend class AttrPrinter; + /*! \brief Whether to print meta data. */ + bool show_meta_data_; /*! \brief additional comment function */ runtime::TypedPackedFunc annotate_; /*! \brief meta data context */ @@ -790,13 +804,14 @@ void TextPrinter::PrintCallAttrs(const Expr& op, } std::string RelayPrint(const NodeRef& node, + bool show_meta_data, runtime::TypedPackedFunc annotate) { - return TextPrinter(annotate).Print(node); + return TextPrinter(show_meta_data, annotate).Print(node); } TVM_REGISTER_API("relay._expr.RelayPrint") .set_body_typed)>(RelayPrint); } // namespace relay diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index cb5f86f4b525d..b9e0823e88fac 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -749,7 +749,7 @@ class FuseMutator : private ExprMutator { } // Debug function, dump the group assignment in text. void DebugDumpGroup(const Expr& body) { - std::string text = RelayPrint(body, [this](const Expr& expr) -> std::string { + std::string text = RelayPrint(body, false, [this](const Expr& expr) -> std::string { auto it = gmap_.find(expr.get()); if (it == gmap_.end()) return ""; std::ostringstream os; diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index 7b610f82f6a53..7baa906abacc4 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -77,7 +77,9 @@ def test_add_op_broadcast(): def test_with_params(): x = relay.var('x', shape=(10, 5)) y = relay.var('y', shape=(1, 5)) - func = relay.Function([x, y], add(x, y)) + z = relay.add(x, y) + z = relay.exp(z) + func = relay.Function([x, y], z) x_data = np.random.rand(10, 5).astype('float32') y_data = np.random.rand(1, 5).astype('float32') params = {"y": y_data} @@ -87,11 +89,40 @@ def test_with_params(): mod.set_input(x=x_data) mod.run() res = mod.get_output(0).asnumpy() - ref_res = y_data + x_data + ref_res = np.exp(y_data + x_data) tvm.testing.assert_allclose(res, ref_res) +def test_plan_memory(): + # it is sufficient to cycle through two memories. 
+ + x = relay.var("x", shape=(10,)) + y = relay.var("x", shape=(1,)) + y2 = relay.exp(y) + z = relay.add(x, y2) + z = relay.exp(z) + z = relay.exp(z) + z = relay.exp(z) + z = relay.exp(z) + z = relay.exp(z) + func = relay.Function([x, y], z) + func = relay.ir_pass.infer_type(func) + func = relay.ir_pass.fuse_ops(func, opt_level=0) + func = relay.ir_pass.infer_type(func) + smap = relay.backend._backend.GraphPlanMemory(func) + storage_ids = set() + for k, v in smap.items(): + for x in v: + storage_ids.add(x.value) + + # Current rule requires vars have unique storage id + # because we don't do inplace, we will need another + # two alternating temporary space. + assert len(storage_ids) == 4 + + if __name__ == "__main__": + test_plan_memory() test_with_params() test_add_op_scalar() test_add_op_tensor() From 401ffe131e207bc83fa424df3dbc14ed1c987731 Mon Sep 17 00:00:00 2001 From: "Steven S. Lyubomirsky" Date: Mon, 19 Nov 2018 21:35:21 -0800 Subject: [PATCH 08/30] [Relay][Op] Add test for batch_flatten (#2134) * Add tests for batch_flatten and softmax * Softmax is already tested elsewhere --- python/tvm/relay/op/nn/_nn.py | 1 + tests/python/relay/test_op_level2.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index cd807ad621283..b48bfde97f334 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -9,6 +9,7 @@ reg.register_schedule("nn.relu", schedule_injective) reg.register_pattern("nn.relu", OpPattern.ELEMWISE) +# softmax @reg.register_schedule("nn.softmax") def schedule_softmax(_, outputs, target): """Schedule definition of softmax""" diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 1ae37240788f1..cd9321c5a91fe 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -391,6 +391,27 @@ def test_l2_normalize(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +def batch_flatten(data): + shape = data.shape + target_dim = 1 + for i in range(len(shape) - 1): + target_dim = target_dim * shape[i + 1] + return np.reshape(data, (shape[0], target_dim)) + + +def test_batch_flatten(): + t1 = relay.TensorType((5, 10, 5)) + x = relay.Var("x", t1) + func = relay.Function([x], relay.nn.batch_flatten(x)) + + data = np.random.rand(5, 10, 5).astype(t1.dtype) + ref_res = batch_flatten(data) + for target, ctx in ctx_list(): + intrp = relay.create_executor("graph", ctx=ctx, target=target) + op_res = intrp.evaluate(func)(data) + np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) + + if __name__ == "__main__": test_pool2d() test_avg_pool2d_no_count_pad() @@ -403,3 +424,4 @@ def test_l2_normalize(): test_conv2d_transpose_infer_type() test_conv2d_transpose_run() test_conv2d_run() + test_batch_flatten() From 2849930465259e86de660e5a67754e76a85d32ad Mon Sep 17 00:00:00 2001 From: Siju Date: Tue, 20 Nov 2018 22:50:09 +0530 Subject: [PATCH 09/30] [RELAY]Slice_like support (#2014) --- docs/langref/relay_op.rst | 3 +- include/tvm/relay/attrs/transform.h | 13 +++ python/tvm/relay/op/_transform.py | 6 +- python/tvm/relay/op/transform.py | 26 +++++ src/relay/op/tensor/transform.cc | 147 ++++++++++++++++++++++++++ tests/python/relay/test_op_level10.py | 62 +++++++++++ 6 files changed, 255 insertions(+), 2 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e99ac3c97f73d..95581a54e5a12 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -143,6 
+143,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.broadcast_to_like tvm.relay.collapse_sum_like + tvm.relay.slice_like Level 1 Definitions @@ -231,7 +232,6 @@ Level 4 Definitions .. autofunction:: tvm.relay.strided_slice - Level 5 Definitions ------------------- .. autofunction:: tvm.relay.image.resize @@ -241,3 +241,4 @@ Level 10 Definitions -------------------- .. autofunction:: tvm.relay.broadcast_to_like .. autofunction:: tvm.relay.collapse_sum_like +.. autofunction:: tvm.relay.slice_like diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index fc539f3ce7429..7a8129180c4db 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -138,6 +138,19 @@ struct StridedSliceAttrs : public tvm::AttrsNode { .describe("Stride values of the slice"); } }; + + +struct SliceLikeAttrs : public tvm::AttrsNode { + Array axes; + + TVM_DECLARE_ATTRS(SliceLikeAttrs, "relay.attrs.SliceLikeAttrs") { + TVM_ATTR_FIELD(axes) + .describe("List of axes on which input data will be sliced according to the " + "corresponding size of the second input. By default will slice " + "on all axes. Negative axes mean counting in reverse."); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_TRANSFORM_H_ diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 7867336d033fe..01814e0f73e05 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -2,7 +2,11 @@ """Backend compiler related feature registration""" from __future__ import absolute_import from . import op as _reg -from .op import schedule_injective +from .op import schedule_injective, OpPattern # strided_slice _reg.register_schedule("strided_slice", schedule_injective) + +# slice_like +_reg.register_schedule("slice_like", schedule_injective) +_reg.register_pattern("slice_like", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index e43a4a573e542..c5fedab054d2f 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -361,3 +361,29 @@ def strided_slice(data, begin, end, strides=None): """ strides = strides or [] return _make.strided_slice(data, list(begin), list(end), list(strides)) + + +def slice_like(data, shape_like, axes=None): + """Slice the first input with respect to the second input. + + For an input array with shape ``(d1, d2, ..., dk)``, `slice_like` operation slices the + the input array corresponding size of second array. By default will slice on all axes. + + Parameters + ---------- + data : tvm.relay.Expr + The source array. + + shape_like : tvm.relay.Expr + The new shape. + + axes : Optional[Tuple[int]] + List of axes on which input data will be sliced according to the corresponding size of + the second input. By default will slice on all axes. Negative axes mean counting in reverse. + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.slice_like(data, shape_like, axes) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 98ac1c30b66ca..7a3a2151158d2 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1153,5 +1153,152 @@ the entries indicate where along axis the array is split. .set_support_level(3) .add_type_rel("Split", SplitRel); + +TVM_REGISTER_NODE_TYPE(SliceLikeAttrs); + +/*! +* \brief SliceLikeRel User defined type constraint function. 
+* \param num_inputs Number of input types in the args. +* \param attrs The additional attributes of the operator. +* \param reporter The reporter to report solution to. +* \return False if the relation has not been resolved, it might be resolved later. +* True if this relation has been resolved. +*/ +bool SliceLikeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + if (data == nullptr) { + return false; + } + + const auto* target = types[1].as(); + if (target == nullptr) { + return false; + } + + const auto param = attrs.as(); + CHECK(param != nullptr); + + const Array dshape = data->shape; + const Array target_shape = target->shape; + std::vector&& oshape = AsVector(dshape); + + if (!param->axes.defined()) { + for (size_t i = 0; i < dshape.size(); ++i) { + if (i < target_shape.size()) { + oshape[i] = target_shape[i]; + CHECK(reporter->Assert(oshape[i] <= dshape[i])) + << "End index of axis " << i << " exceeds input shape: " + << oshape[i] << " vs " << dshape[i]; + } + } + } else { + CHECK(param->axes.size() != 0) << "Axes cannot be empty."; + for (Integer val : param->axes) { + int axis = val->value; + if (axis < 0) { + axis += dshape.size(); + } + CHECK(axis < static_cast(target_shape.size())) + << "Axis " << axis << " exceeds dimension " + << target_shape.size() << " of target_shape."; + oshape[axis] = target_shape[axis]; + CHECK(reporter->Assert(oshape[axis] <= dshape[axis])) + << "End index of axis " << axis << " exceeds input shape: " + << oshape[axis] << " vs " << dshape[axis]; + } + } + + reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + + +Expr MakeSliceLike(Expr data, + Expr shape_like, + Array axes) { + auto attrs = make_node(); + attrs->axes = std::move(axes); + static const Op& op = Op::Get("slice_like"); + return CallNode::make(op, {data, shape_like}, Attrs(attrs), {}); +} + +// Adapter function to make int array. 
+Array<Integer> GetIntArray(Array<IndexExpr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+      << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
+template <typename AttrType>
+Array<Tensor> SliceLikeCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  const auto* param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+  Array<IndexExpr> src_shape = inputs[0]->shape;
+  Array<IndexExpr> target_shape = inputs[1]->shape;
+  Array<IndexExpr> begin_idx, end_idx, strides;
+  for (size_t i = 0; i < src_shape.size(); ++i) {
+    begin_idx.push_back(0);
+    strides.push_back(1);
+  }
+  end_idx = Array<IndexExpr>(src_shape);
+  if (!param->axes.defined()) {
+    for (size_t i = 0; i < src_shape.size(); ++i) {
+      if (i < target_shape.size()) {
+        end_idx.Set(i, target_shape[i]);
+        CHECK_LE(topi::GetConstInt(end_idx[i]),
+                 topi::GetConstInt(src_shape[i]))
+          << "End index of axis " << i << " exceeds input shape: "
+          << topi::GetConstInt(end_idx[i]) << " vs "
+          << topi::GetConstInt(src_shape[i]);
+      }
+    }
+  } else {
+    for (int axis : param->axes) {
+      if (axis < 0) {
+        axis = static_cast<int>(src_shape.size()) + axis;
+      }
+      end_idx.Set(axis, target_shape[axis]);
+      CHECK_LE(topi::GetConstInt(end_idx[axis]),
+               topi::GetConstInt(src_shape[axis]))
+        << "End index of axis " << axis << " exceeds input shape: "
+        << topi::GetConstInt(end_idx[axis]) << " vs "
+        << topi::GetConstInt(src_shape[axis]);
+    }
+  }
+  return Array<Tensor>{
+    topi::strided_slice(inputs[0],
+                        GetIntArray(begin_idx),
+                        GetIntArray(end_idx),
+                        GetIntArray(strides))
+  };
+}
+
+
+TVM_REGISTER_API("relay.op._make.slice_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeSliceLike, args, rv);
+});
+
+
+RELAY_REGISTER_OP("slice_like")
+.describe(R"code(Slice the first input with respect to the second input.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SliceLikeAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("shape_like", "Tensor", "Shape tensor.")
+.set_support_level(10)
+.add_type_rel("SliceLike", SliceLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute<SliceLikeAttrs>);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 9486d029876d8..ef1c57d263fad 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -1,7 +1,9 @@
 """ Support level10 operator test cases.
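+slice_like crops its first argument to the shape of its second: for example,
+relay.slice_like(x, y, axes=(2, 3)) keeps the leading dimensions of x and
+takes its height and width from y.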
""" +import numpy as np import tvm from tvm import relay +from tvm.relay.testing import ctx_list def test_collapse_sum_like(): x = relay.Var("x", relay.ty.TensorType((3, 4, 5, 6), "int8")) @@ -18,6 +20,66 @@ def test_broadcast_to_like(): zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType((3, 4, 5, 6), "int8") + +def np_slice_like(np_data, np_shape_like, axis=None): + begin_idx = [0 for _ in np_data.shape] + end_idx = list(np_data.shape) + if axis: + for i in axis: + if i < 0: + i = len(np_data.shape) + i + end_idx[i] = np_shape_like.shape[i] + else: + for i in range(len(np_data.shape)): + if i < len(np_shape_like.shape): + end_idx[i] = np_shape_like.shape[i] + slice_idx = [] + for b, e in zip(begin_idx, end_idx): + slice_idx.append(slice(b, e)) + np_result = np_data[tuple(slice_idx)] + return np_result + + +def verify_slice_like(data, slice_like, axes, output, dtype="float32"): + x = relay.var("data", relay.TensorType(data, dtype)) + y = relay.var("slice_like", relay.TensorType(slice_like, dtype)) + z = relay.slice_like(x, y, axes) + zz = relay.ir_pass.infer_type(z) + if axes: + assert "axes" in z.astext() + assert zz.checked_type == relay.ty.TensorType(output, dtype) + + if all(isinstance(v, int) == 0 for v in data) or \ + all(isinstance(v, int) == 0 for v in slice_like): + return + + func = relay.Function([x, y], z) + x_data = np.random.uniform(size=data).astype(dtype) + y_data = np.random.uniform(size=slice_like).astype(dtype) + ref_res = np_slice_like(x_data, y_data, axes) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data, y_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + +def test_slice_like(): + d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") + verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) + verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3)) + verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1,2), output=(d2, d2, d3)) + verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) + verify_slice_like(data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5)) + verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3)) + verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3)) + verify_slice_like(data=(1, 3, 224, 224), + slice_like=(1, 3, 112, 112), + axes=(2, 3), + output=(1, 3, 112, 112)) + + if __name__ == "__main__": test_collapse_sum_like() test_broadcast_to_like() + test_slice_like() From 794825fecc3e4e923525e3c2d84f58c4cfa7709b Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 10:31:19 -0800 Subject: [PATCH 10/30] [COMMUNITY] Update contributor list to reflect new guideline. (#2138) --- CONTRIBUTORS.md | 68 ++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 91ecb28519850..602663ee867a8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -5,38 +5,48 @@ contribute to, and influence the direction of the project. We actively invite co See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines. 
+
 ## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
-- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
+
+We add tag along with committer name to show areas that they are familiar with.
+We do encourage everyone to work anything they are interested in.
+
+- [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
+- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta,
+- [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 
 ## Reviewers
-- [Zhi Chen](https://github.com/zhiics)
-- [Xiaoqiang Dan](https://github.com/xqdan)
-- [Liangfu Chen](https://github.com/liangfu)
-- [Wuwei Lin](https://github.com/vinx13)
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Jared Roesch](https://github.com/jroesch)
-- [Siva](https://github.com/srkreddy1238)
-- [Siju Samuel](https://github.com/siju-samuel)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Yao Wang](https://github.com/kevinthesun)
-- [Jian Weng](https://github.com/were)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
+
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri
+- [Tianqi Chen](https://github.com/tqchen): @tqchen
+- [Liangfu Chen](https://github.com/liangfu): @liangfu
+- [Zhi Chen](https://github.com/zhiics): @zhiics
+- [Nick Hynes](https://github.com/nhynes): @nhynes
+- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei
+- [Yizhi Liu](https://github.com/yzhliu): @yzhliu
+- [Zhixun Tan](https://github.com/phisiart): @phisiart
+- [Xiaoqiang Dan](https://github.com/xqdan): @xqdan
+- [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
+- [Wuwei Lin](https://github.com/vinx13): @vinx13
+- [Masahiro Masuda](https://github.com/masahi): @masahi
+- [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89
+- [Kazutaka Morita](https://github.com/kazum): @kazum
+- [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t
+- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909): @PariksheetPinjari909
+- [Jared Roesch](https://github.com/jroesch): @jroesch
+- [Siva](https://github.com/srkreddy1238): @srkreddy1238
+- [Siju Samuel](https://github.com/siju-samuel): @siju-samuel
+- [Haichen Shen](https://github.com/icemelon9): @icemelon9
+- [Alex Weaver](https://github.com/alex-weaver): @alex-weaver
+- [Yao Wang](https://github.com/kevinthesun): @kevinthesun
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly
+- [Jian Weng](https://github.com/were): @were
+- [Eddie Yan](https://github.com/eqy): @eqy
+- [Joshua Z.
Zhang](https://github.com/zhreshold): @zhreshold +- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy ## List of Contributors - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors) From f03166dd4fa908cbb4b77b6d2b7410586ded88f0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 10:33:42 -0800 Subject: [PATCH 11/30] Update CONTRIBUTORS.md make name alphabetical --- CONTRIBUTORS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 602663ee867a8..0e77c34c115ba 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -12,10 +12,11 @@ We add tag along with committer name to show areas that they are familiar with. We do encourage everyone to work anything they are interested in. - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs -- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta, +- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler +- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi -- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay + ## Reviewers From 43abe2ca012819646cafc6c0c9f7d81afc7f19b6 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 10:47:22 -0800 Subject: [PATCH 12/30] [TEAM] Huyuwei -> committer (#2139) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0e77c34c115ba..1286716b70fad 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -12,6 +12,7 @@ We add tag along with committer name to show areas that they are familiar with. We do encourage everyone to work anything they are interested in. - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs +- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta From 9d782188a3a6be84a4f568d55c2c2ced84763809 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 10:55:41 -0800 Subject: [PATCH 13/30] [TEAM] adityaatluri -> committer (#2140) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1286716b70fad..d6a6dbb67b7a2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -11,6 +11,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h We add tag along with committer name to show areas that they are familiar with. We do encourage everyone to work anything they are interested in. 
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay From 7f420f8f222c38d8eb673b69b7655668864e14f0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 11:05:42 -0800 Subject: [PATCH 14/30] [TEAM] Laurawly -> committer (#2141) --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d6a6dbb67b7a2..fb1828ae96fb0 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,7 +18,7 @@ We do encourage everyone to work anything they are interested in. - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - +- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi ## Reviewers From da1a79fd6c42c422ce74a5ffa65dd4017205e31e Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Tue, 20 Nov 2018 13:26:49 -0800 Subject: [PATCH 15/30] [TEAM] Lianmin Zheng -> committer (#2142) --- CONTRIBUTORS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fb1828ae96fb0..6ca1e997450c1 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,7 +18,8 @@ We do encourage everyone to work anything they are interested in. - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi -- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi +- [Leyuan Wang](https://github.com/Laurawly): @Laurawly - topi +- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi ## Reviewers From 642340c8072bd036bbef750cc1c45f5342fb41a4 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 20 Nov 2018 13:27:22 -0800 Subject: [PATCH 16/30] Add nick to committer (#2143) --- CONTRIBUTORS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6ca1e997450c1..945dfb7b3ae5d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,7 +18,8 @@ We do encourage everyone to work anything they are interested in. - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi -- [Leyuan Wang](https://github.com/Laurawly): @Laurawly - topi +- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi +- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi ## Reviewers From f373d6a0f6547ff6f6b23a066f3009bf0c7d23a0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 20 Nov 2018 13:29:53 -0800 Subject: [PATCH 17/30] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 945dfb7b3ae5d..cbdcf396e9b4b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,7 +19,7 @@ We do encourage everyone to work anything they are interested in. 
- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi -- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx +- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi ## Reviewers From 11f897ac2f1ca79c16a560104bd1b1c480ba2f87 Mon Sep 17 00:00:00 2001 From: Josh Pollock Date: Tue, 20 Nov 2018 22:50:37 -0800 Subject: [PATCH 18/30] fix dcgan layer naming overlap (#2145) --- python/tvm/relay/testing/dcgan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/testing/dcgan.py b/python/tvm/relay/testing/dcgan.py index 96cd871e4122b..d6c1d55df01a1 100644 --- a/python/tvm/relay/testing/dcgan.py +++ b/python/tvm/relay/testing/dcgan.py @@ -36,7 +36,7 @@ def deconv2d_bn_relu(data, prefix, **kwargs): """a block of deconv + batch norm + relu""" eps = 1e-5 + 1e-12 net = deconv2d(data, name="%s_deconv" % prefix, **kwargs) - net = layers.batch_norm_infer(net, epsilon=eps, name="batch_norm") + net = layers.batch_norm_infer(net, epsilon=eps, name="%s_batch_norm" % prefix) net = relay.nn.relu(net) return net From 2c7d2d78ac9cb273eedb9b5f79c22ab9546c44ae Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 21 Nov 2018 16:31:27 +0800 Subject: [PATCH 19/30] Fix relative import in x86 conv2d (#2149) --- topi/python/topi/x86/conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index e48a95780e7f8..1a73736264bd3 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -21,7 +21,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth """ if is_depthwise: wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) - from depthwise_conv2d import _fallback_schedule + from .depthwise_conv2d import _fallback_schedule _fallback_schedule(cfg, wkl) else: wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype) From 2c231b5af16f7d5e6fd2ee544a53284933aeced3 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 21 Nov 2018 09:48:58 -0800 Subject: [PATCH 20/30] [RELAY] Move Layout to tvm Node system (#2125) --- include/tvm/expr.h | 2 +- src/relay/op/image/resize.cc | 4 +- src/relay/op/layout.cc | 80 +++++ src/relay/op/layout.h | 422 +++++++++++++++++++++++ src/relay/op/nn/convolution.cc | 15 +- src/relay/op/nn/layout.h | 536 ------------------------------ src/relay/op/nn/nn.cc | 2 +- src/relay/op/nn/pad.cc | 2 +- src/relay/op/nn/pooling.cc | 30 +- src/relay/op/nn/upsampling.cc | 4 +- src/relay/pass/fold_scale_axis.cc | 33 +- src/relay/pass/pattern_util.h | 3 +- 12 files changed, 551 insertions(+), 582 deletions(-) create mode 100644 src/relay/op/layout.cc create mode 100644 src/relay/op/layout.h delete mode 100644 src/relay/op/nn/layout.h diff --git a/include/tvm/expr.h b/include/tvm/expr.h index 37b122ae5b034..35083cafae818 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -85,7 +85,7 @@ class Var : public HalideIR::VarExpr { /*! - * \brief Container of constant ineteger (IntImm). + * \brief Container of constant integer (IntImm). * * This is used to store and automate type check * attributes that must be constant integer. 
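For intuition before the file-by-file changes: the NCHW16c convention that the
new Layout class encodes splits the super-dimension C by a factor of 16 into a
sub-dimension c, so a 4-D shape becomes 5-D. A minimal Python sketch of the
resulting shape conversion (the helper name is illustrative, not part of this
patch):

    def nchw_to_nchw16c(shape, factor=16):
        """Mimic the shape conversion for NCHW -> NCHW16c: block the channel axis."""
        n, c, h, w = shape
        assert c % factor == 0, "C must be divisible by the block size"
        return [n, c // factor, h, w, factor]

    assert nchw_to_nchw16c([1, 64, 56, 56]) == [1, 4, 56, 56, 16]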
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
index b4984becdf8b6..bfa2ea4cdfa58 100644
--- a/src/relay/op/image/resize.cc
+++ b/src/relay/op/image/resize.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/relay/op.h>
 #include <tvm/relay/attrs/image.h>
-#include "../nn/layout.h"
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -25,7 +25,7 @@ bool ResizeRel(const Array<Type>& types,
   const ResizeAttrs* param = attrs.as<ResizeAttrs>();
   CHECK(param != nullptr);
   const Layout in_layout(param->layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Resize only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
 
diff --git a/src/relay/op/layout.cc b/src/relay/op/layout.cc
new file mode 100644
index 0000000000000..98fea55aa4c13
--- /dev/null
+++ b/src/relay/op/layout.cc
@@ -0,0 +1,80 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file src/relay/op/layout.cc
+ * \brief Layout expression.
+ */
+
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(LayoutNode);
+
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  CHECK_EQ(src_layout.ndim(), src.size());
+  if (src_layout == dst_layout) {
+    return src;
+  } else if (!src_layout.defined()) {
+    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
+  } else if (!dst_layout.defined()) {
+    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
+  }
+
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from "
+    << src_layout << " to " << dst_layout;
+
+  std::vector<IndexExpr> dst(dst_layout.ndim());
+  for (size_t i = 0; i < src_layout.ndim(); ++i) {
+    Layout::LayoutDim src_dim = src_layout[i];
+    if (Layout::IsSuperdim(src_dim)) {
+      int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_dim));
+      int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_minor_pos = src_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_factor = src_layout.Subsizeof(src_dim);
+      int dst_factor = dst_layout.Subsizeof(src_dim);
+      IndexExpr src_dim_size = src[i];
+
+      if (src_minor_pos >= 0) {
+        CHECK(is_const_int(src[src_minor_pos], src_factor))
+          << "src shape " << Array<IndexExpr>(src)
+          << " does not agree with layout "
+          << src_layout;
+        src_dim_size *= src_factor;
+      }
+      dst[dst_major_pos] = src_dim_size;
+      if (dst_minor_pos >= 0) {
+        CHECK_GT(dst_factor, 0);
+        if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) {
+          CHECK_LE(dst_factor, const_src_dim_size[0])
+            << "Converting " << Array<IndexExpr>(src)
+            << " from " << src_layout
+            << " to " << dst_layout
+            << ": cannot split dimension size of "
+            << src_dim_size << " by " << dst_factor;
+        }
+        dst[dst_major_pos] /= dst_factor;
+        dst[dst_minor_pos] = dst_factor;
+      }
+    }
+  }
+  return dst;
+}
+
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  std::vector<IndexExpr> ret(src.size());
+  for (size_t i = 0; i < src.size(); ++i) {
+    ret[i] = src[i];
+  }
+  return ConvertLayout(ret, src_layout, dst_layout);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h
new file mode 100644
index 0000000000000..97160f3cbb9eb
--- /dev/null
+++ b/src/relay/op/layout.h
@@ -0,0 +1,422 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file relay/op/layout.h
+ * \brief Layout expression.
+ *
+ * This file is adapted from its nnvm counterpart and will keep evolving
+ * toward the new layout system.
+ *
+ * The layout is composed of upper cases, lower cases and numbers,
+ * where upper case indicates a (super-)dimension and
+ * the corresponding lower case with factor size indicates the split (sub-)dimension.
+ * For example, NCHW16c can describe a 5-D tensor of
+ * [batch_size, channel, height, width, channel_block].
+ * Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
+ */
+#ifndef TVM_RELAY_OP_LAYOUT_H_
+#define TVM_RELAY_OP_LAYOUT_H_
+
+#include <tvm/base.h>
+#include <tvm/expr.h>
+#include <tvm/relay/base.h>
+
+#include <string>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+namespace tvm {
+namespace relay {
+
+class LayoutNode : public Node {
+ public:
+  std::string name;
+  Array<Integer> superdim_pos;
+  Array<Integer> subdim_pos;
+  Array<Integer> subdim_size;
+  Array<Integer> layout_simplified;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("superdim_pos", &superdim_pos);
+    v->Visit("subdim_pos", &subdim_pos);
+    v->Visit("subdim_size", &subdim_size);
+    v->Visit("layout_simplified", &layout_simplified);
+  }
+
+  static constexpr const char* _type_key = "Layout";
+  TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node);
+};
+
+class Layout : public NodeRef {
+ public:
+  using LayoutDim = char;
+  static constexpr uint32_t kUniqueDim = 26;
+
+  explicit Layout(NodePtr<Node> n) : NodeRef(n) {}
+
+  /*! \brief default constructor */
+  Layout() : Layout("__undef__") {} // NOLINT(*)
+
+  /*! \brief construct from a string */
+  Layout(const char* str) : Layout(std::string(str)) {} // NOLINT(*)
+
+  /*!
+   * \brief construct from a string.
+   * \param layout input in layout convention:
+   *        upper case indicates a dimension and
+   *        the corresponding lower case with factor size
+   *        indicates the split dimension.
+   *        return undefined layout if "__undef__" is passed.
+   */
+  Layout(const std::string& layout) { // NOLINT(*)
+    if (layout.length() != 0) {
+      Parse(layout);
+    } else {
+      Parse("__undef__");
+    }
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  const LayoutNode* operator->() const {
+    return static_cast<const LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  LayoutNode* operator->() {
+    return static_cast<LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a super-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a super-dimension.
+   */
+  static bool IsSuperdim(LayoutDim dim) {
+    return dim >= 'A' && dim <= 'Z';
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a sub-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a sub-dimension.
+   */
+  static bool IsSubdim(LayoutDim dim) {
+    return dim >= 'a' && dim <= 'z';
+  }
+
+  /*!
+   * \brief Convert a given dimension to super-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSuperdim(LayoutDim dim) {
+    if (IsSubdim(dim)) {
+      return dim - 'a' + 'A';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Convert a given dimension to sub-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSubdim(LayoutDim dim) {
+    if (IsSuperdim(dim)) {
+      return dim - 'A' + 'a';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Return an undefined layout.
+   * \return a (global) undefined layout.
+   */
+  static const Layout& Undef() {
+    static Layout undef;
+    return undef;
+  }
+
+  /*!
+   * \brief Two layouts are convertible only if
+   *        they have same set of super-dimensions.
+   *        e.g., NCHW, NCHW16c, NHWC are convertible between each other,
+   *        but NCHW, CHW, OIHW are not.
+   * \param dst the target layout
+   * \return Whether can be converted to dst layout.
+   */
+  bool Convertible(const Layout &dst) const {
+    const LayoutNode *n = operator->();
+    if (!this->defined() || !dst.defined()) return false;
+    for (size_t i = 0; i < kUniqueDim; ++i) {
+      if ((n->superdim_pos[i]->value >= 0 && dst->superdim_pos[i]->value < 0) ||
+          (n->superdim_pos[i]->value < 0 && dst->superdim_pos[i]->value >= 0)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Returns a sublayout which is the portion of the object
+   *        that starts at dimension \p pos and spans \p len dimensions
+   *        (or until the end of the layout, whichever comes first).
+   * \param pos The start position.
+   * \param len The length of the sub-layout.
+   * \return A newly constructed Layout object.
+   */
+  Layout Sublayout(size_t pos, size_t len) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (pos > ndim()) return Layout::Undef();
+    if (pos + len > ndim()) len = ndim() - pos;
+    if (len == 0) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (size_t i = pos; i < pos + len; ++i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      // stream the dimension character, not its integer code
+      new_layout << static_cast<char>(layout_simplified[i]->value);
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*! \return A newly constructed reversed Layout object. */
+  Layout Reverse() const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (!this->defined()) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (int64_t i = this->ndim() - 1; i >= 0; --i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << static_cast<char>(layout_simplified[i]->value);
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*!
+   * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos.
+   * \param dim The source dimension to be split. It must be a super-dimension.
+   * \param target_pos The target position of the newly split sub-dimension.
+   * \param size size of the sub-dimension.
+   * \return A newly constructed Layout object.
+   */
+  Layout Split(LayoutDim dim, size_t target_pos, uint32_t size) const {
+    const std::string &name = operator->()->name;
+    CHECK(target_pos <= this->ndim()) << "Invalid split position "
+                                      << target_pos << " for layout " << name;
+    CHECK(IsSuperdim(dim)) << "Cannot split a sub-dimension " << dim;
+    CHECK(this->Contains(dim)) << "Axis " << dim << " does not exist in " << name;
+    CHECK(!this->Contains(ToSubdim(dim))) << "Dimension " << dim
+                                          << " has already been split in "
+                                          << name;
+    CHECK(size > 0) << "Invalid split size " << size;
+    std::ostringstream new_layout;
+    for (size_t i = 0; i <= this->ndim(); ++i) {
+      if (i == target_pos) {
+        new_layout << size << Layout::ToSubdim(dim);
+      }
+      if (i == this->ndim()) break;
+      new_layout << this->at(i);
+    }
+    Layout x(new_layout.str());
+    return x;
+  }
+
+
+  /*! \return number of dimensions */
+  size_t ndim() const {
+    return operator->()->layout_simplified.size();
+  }
+
+  /*!
+   * \brief The description of the \p i-th dimension.
+   *        If it is a sub-dimension, the size will be returned as well,
+   *        e.g., 16c. Otherwise a single character is returned, e.g., C.
+   * \param i The position
+   * \return the description of the dimension.
+   */
+  std::string at(size_t i) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    CHECK_LT(i, this->ndim()) << "position " << i
+                              << " exceeds ndim=" << this->ndim();
+    std::ostringstream repr;
+    if (IsSubdim(layout_simplified[i]->value)) {
+      auto factor = Subsizeof(layout_simplified[i]->value);
+      CHECK_GT(factor, 0);
+      repr << factor;
+    }
+    repr << static_cast<char>(layout_simplified[i]->value);
+    return repr.str();
+  }
+
+  /*!
+   * \brief return the index of the input dimension.
+   *        If it is not found in the layout or the layout is undefined,
+   *        return -1.
+   * \param dim the input dimension.
+   * \return the index or -1 if not found.
+   */
+  int32_t Indexof(LayoutDim dim) const {
+    if (!this->defined()) return -1;
+    else if (IsSuperdim(dim)) return operator->()->superdim_pos[dim - 'A']->value;
+    else if (IsSubdim(dim)) return operator->()->subdim_pos[dim - 'a']->value;
+    return -1;
+  }
+
+  /*!
+   * \param dim the input super-dimension or sub-dimension.
+   * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension),
+   *         or the size of \p dim itself (if \p dim is a sub-dimension).
+   *         Return -1 if \p dim is not in the layout or the layout is undefined.
+   */
+  int64_t Subsizeof(LayoutDim dim) const {
+    CHECK(IsSuperdim(dim) || IsSubdim(dim)) << "Invalid dim " << dim;
+    if (!this->defined() || !this->Contains(ToSubdim(dim))) {
+      return -1;
+    }
+    int idx = ToSubdim(dim) - 'a';
+    return operator->()->subdim_size[idx]->value;
+  }
+
+  /*!
+   * \brief Whether the layout contains a dimension.
+   * \param dim dimension to be checked.
+   * \return Whether the layout contains the dimension.
+   */
+  bool Contains(LayoutDim dim) const {
+    if (IsSuperdim(dim)) {
+      return operator->()->superdim_pos[dim-'A']->value >= 0;
+    } else if (IsSubdim(dim)) {
+      return operator->()->subdim_pos[dim-'a']->value >= 0;
+    }
+    return false;
+  }
+
+  LayoutDim operator[](size_t i) const {
+    return operator->()->layout_simplified[i];
+  }
+
+  /*! \return whether the layout is defined */
+  bool defined() const {
+    return operator->()->name != "__undef__";
+  }
+  /*! \return the string description of the layout */
+  const std::string& name() const {
+    return operator->()->name;
+  }
+
+  /*!
+   * \brief Whether the two layouts are equal.
+   * \param rhs Another layout.
+   * \return whether the two layouts are equal.
+   */
+  bool Equals(const Layout &rhs) const {
+    return operator->()->name == rhs->name;
+  }
+
+  using ContainerType = LayoutNode;
+
+ private:
+  void Parse(const std::string &layout) {
+    node_ = make_node<LayoutNode>();
+
+    std::vector<int32_t> superdim_pos(kUniqueDim, -1);
+    std::vector<int32_t> subdim_pos(kUniqueDim, -1);
+    std::vector<int32_t> subdim_size(kUniqueDim, -1);
+    std::vector<LayoutDim> layout_simplified;
+
+    if (layout != "__undef__") {  // parse layout string
+      int32_t factor = 0;
+      uint32_t curr = 0;
+      for (size_t i = 0; i < layout.size(); ++i) {
+        const LayoutDim c = layout.at(i);
+        if (IsSuperdim(c)) {
+          int pos = c - 'A';
+          CHECK_EQ(factor, 0) << "Invalid layout " << layout
+                              << ": invalid factor size " << factor
+                              << " before dimension " << c;
+          CHECK_EQ(superdim_pos[pos], -1) << "Invalid layout " << layout
+                                          << ": duplicate dimension " << c;
+          superdim_pos[pos] = curr++;
+          layout_simplified.push_back(c);
+        } else if (IsSubdim(c)) {
+          int pos = c - 'a';
+          CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size "
+                              << factor << " for dimension " << c;
+          CHECK_EQ(subdim_pos[pos], -1) << "Invalid layout " << layout
+                                        << ": duplicate dimension " << c;
+          CHECK_EQ(subdim_size[pos], -1) << "Invalid layout " << layout
+                                         << ": duplicate dimension " << c;
+          subdim_pos[pos] = curr++;
+          subdim_size[pos] = factor;
+          layout_simplified.push_back(c);
+          factor = 0;
+        } else if (c >= '0' && c <= '9') {
+          CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
+          factor = factor * 10 + c - '0';
+        } else {
+          LOG(FATAL) << "Invalid layout " << layout;
+        }
+      }
+      CHECK(!layout_simplified.empty()) << "Invalid layout " << layout;
+      for (LayoutDim dim : layout_simplified) {
+        CHECK(IsSuperdim(dim) || superdim_pos[dim-'a'] >= 0)
+          << "Invalid layout " << layout << ": missing axis "
+          << static_cast<char>(dim - 'a' + 'A');
+      }
+    }
+
+    LayoutNode* node = operator->();
+    node->name = layout;
+
+    for (uint32_t i = 0; i < kUniqueDim; ++i) {
+      node->superdim_pos.push_back(superdim_pos[i]);
+      node->subdim_pos.push_back(subdim_pos[i]);
+      node->subdim_size.push_back(subdim_size[i]);
+    }
+    for (LayoutDim dim : layout_simplified) {
+      node->layout_simplified.push_back(dim);
+    }
+  }
+};
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_LAYOUT_H_
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index 8e1d9db50e7e3..cb648166f7bb6 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -6,7 +6,8 @@
 #include
 #include
 #include
-#include "layout.h"
+
+#include "../layout.h"
 
 namespace tvm {
 namespace relay {
@@ -28,16 +29,16 @@ bool Conv2DRel(const Array<Type>& types,
   CHECK(param != nullptr);
   const Layout in_layout(param->data_layout);
   const Layout kernel_layout(param->weight_layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Conv only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
-  CHECK(kernel_layout.convertible(kOIHW))
+  CHECK(kernel_layout.Convertible(kOIHW))
     << "Conv only support kernel layouts that are convertible from OIHW."
     << " But got "<< kernel_layout;
 
   Layout out_layout(param->out_layout);
   if (!out_layout.defined()) out_layout = in_layout;
-  CHECK(out_layout.convertible(kNCHW))
+  CHECK(out_layout.Convertible(kNCHW))
     << "Conv only support output layouts that are convertible from NCHW."
     << " But got " << out_layout;
 
@@ -55,7 +56,7 @@ bool Conv2DRel(const Array<Type>& types,
                             param->kernel_size[0], param->kernel_size[1]});
     wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
-    wshape[kernel_layout.indexof('O')] *= param->groups;
+    wshape[kernel_layout.Indexof('O')] *= param->groups;
     channels = param->channels;
     dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
     dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
@@ -177,10 +178,10 @@ bool Conv2DTransposeRel(const Array<Type>& types,
   CHECK(param != nullptr);
   const Layout in_layout(param->data_layout);
   const Layout kernel_layout(param->weight_layout);
-  CHECK(in_layout.convertible(kNCHW))
+  CHECK(in_layout.Convertible(kNCHW))
     << "Conv only support input layouts that are convertible from NCHW."
     << " But got " << in_layout;
-  CHECK(kernel_layout.convertible(kOIHW))
+  CHECK(kernel_layout.Convertible(kOIHW))
    << "Conv only support kernel layouts that are convertible from OIHW."
    << " But got "<< kernel_layout;
 
diff --git a/src/relay/op/nn/layout.h b/src/relay/op/nn/layout.h
deleted file mode 100644
index d9eb59d6e31c1..0000000000000
--- a/src/relay/op/nn/layout.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/*!
- * Copyright (c) 2018 by Contributors
- * \file relay/op/nn/layout.h
- * \brief Layout expression.
- *
- * This file is adapted from its nnvm counterpart and will keep involving
- * to the new layout system
- *
- * The layout is composed of upper cases, lower cases and numbers,
- * where upper case indicates a (super-)dimension and
- * the corresponding lower case with factor size indicates the split (sub-)dimension.
- * For example, NCHW16c can describe a 5-D tensor of
- * [batch_size, channel, height, width, channel_block].
- * Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
- */
-#ifndef TVM_RELAY_OP_NN_LAYOUT_H_
-#define TVM_RELAY_OP_NN_LAYOUT_H_
-
-#include
-#include
-#include
-#include
-#include
-
-namespace tvm {
-namespace relay {
-
-/*!
\brief layout auxiliary structure */ -class Layout { - public: - using LayoutDim = char; - - /*! \brief default constructor */ - Layout() : name_("__undef__") {} // NOLINT(*) - - /*! - * \brief construct from a string. - * \param layout input in layout convention: - * upper case indicates a dimension and - * the corresponding lower case with factor size - * indicates the split dimension. - * return undefined layout if "__undef__" is passed. - */ - Layout(const std::string& layout) { // NOLINT(*) - if (layout.length() != 0) { - parse(layout); - } else { - parse("__undef__"); - } - } - /*! - * \brief copy constructor from another layout - * \param s the source layout - */ - Layout(const Layout& s) { // NOLINT(*) - this->parse(s.name_); - } - /*! - * \brief move constructor from Layout - * \param src the source layout - */ - Layout(Layout&& src) { // NOLINT(*) - this->swap(src); - } - /*! - * \brief assignment from another layout. - * \param src source layout - * \return reference of self - */ - Layout& operator=(const Layout& src) { - this->parse(src.name_); - return *this; - } - /*! - * \brief assignment from rvalue of another layout. - * \param src source layout - * \return reference of self - */ - Layout& operator=(Layout&& src) { - Layout(std::move(src)).swap(*this); // NOLINT(*) - return *this; - } - /*! - * \brief assignment from string. - * \param src source layout - * \return reference of self - */ - Layout& operator=(const std::string& src) { - this->parse(src); - return *this; - } - /*! - * \return whether two layout equals - * \param s the layout to compare against - */ - bool operator==(const Layout& s) const { - return name_ == s.name_; - } - /*! - * \return whether two layout not equal - * \param s the layout to compare against - */ - bool operator!=(const Layout& s) const { - return !(*this == s); - } - - /*! - * \brief Append the current layout by another. - * @param other the layout to be appended - * @return a new layout - */ - Layout operator+(const Layout& other) const { - if (!this->defined() && !other.defined()) { - return Layout::Undef(); - } else if (!this->defined()) { - return other; - } else if (!other.defined()) { - return *this; - } - return Layout(this->name_ + other.name_); - } - - /*! - * \brief Check whether a given dimension is a super-dimension. - * \param dim input dimension - * \return Whether a given dimension is a super-dimension. - */ - static bool is_superdim(LayoutDim dim) { - return dim >= 'A' && dim <= 'Z'; - } - - /*! - * \brief Check whether a given dimension is a sub-dimension. - * \param dim input dimension - * \return Whether a given dimension is a sub-dimension. - */ - static bool is_subdim(LayoutDim dim) { - return dim >= 'a' && dim <= 'z'; - } - - /*! - * \brief Convert a given dimension to super-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim to_superdim(LayoutDim dim) { - if (is_subdim(dim)) { - return dim - 'a' + 'A'; - } - return dim; - } - - /*! - * \brief Convert a given dimension to sub-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim to_subdim(LayoutDim dim) { - if (is_superdim(dim)) { - return dim - 'A' + 'a'; - } - return dim; - } - - /*! - * \brief Return an undefined layout. - * \return a (global) undefined layout. - */ - static const Layout& Undef() { - static Layout undef; - return undef; - } - - /*! - * \brief Swap current object with other - * \param other another object to be swapped. 
- */ - void swap(Layout& other) { // NOLINT(*) - std::swap(name_, other.name_); - std::swap(superdim_pos_, other.superdim_pos_); - std::swap(subdim_pos_, other.subdim_pos_); - std::swap(subdim_size_, other.subdim_size_); - std::swap(layout_simplified_, other.layout_simplified_); - } - - /*! - * \brief Two layouts are convertible only if - * they have same set of super-dimensions. - * e.g., NCHW, NCHW16c, NHWC are convertible between each other, - * but NCHW, CHW, OIHW are not. - * \param dst the target layout - * \return Whether can be converted to dst layout. - */ - bool convertible(const Layout &dst) const { - if (!this->defined() || !dst.defined()) return false; - for (size_t i = 0; i < kUniqueDim; ++i) { - if ((superdim_pos_[i] >= 0 && dst.superdim_pos_[i] < 0) || - (superdim_pos_[i] < 0 && dst.superdim_pos_[i] >= 0)) { - return false; - } - } - return true; - } - - /*! - * \brief Returns a sublayout which is the portion of the object - * that starts at dimension \p pos and spans \p len dimensions - * (or until the end of the layout, whichever comes first). - * \param pos The start position. - * \param len The length of the sub-layout. - * \return A newly constructed Layout object. - */ - Layout sublayout(size_t pos, size_t len) const { - if (pos > ndim()) return Layout::Undef(); - if (pos + len > ndim()) len = ndim() - pos; - if (len == 0) return Layout::Undef(); - std::ostringstream new_layout; - for (size_t i = pos; i < pos + len; ++i) { - if (is_subdim(layout_simplified_[i])) { - auto block_size = this->subsizeof(layout_simplified_[i]); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << layout_simplified_[i]; - } - return Layout(new_layout.str()); - } - - /*! \return A newly constructed reversed Layout object. */ - Layout reverse() const { - if (!this->defined()) return Layout::Undef(); - std::ostringstream new_layout; - for (int64_t i = this->ndim() - 1; i >= 0; --i) { - if (is_subdim(layout_simplified_[i])) { - auto block_size = this->subsizeof(layout_simplified_[i]); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << layout_simplified_[i]; - } - return Layout(new_layout.str()); - } - - /*! - * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos. - * \param dim The source dimension to be split. It must be a super-dimension. - * \param target_pos The target position of the newly split sub-dimension. - * \param size size of the sub-dimension. - * \return A newly constructed Layout object. - */ - Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const { - CHECK(target_pos <= this->ndim()) << "Invalid split position " - << target_pos << " for layout " << name_; - CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim; - CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_; - CHECK(!this->contains(to_subdim(dim))) << "Dimension " << dim - << " has already been split in " - << name_; - CHECK(size > 0) << "Invalid split size " << size; - std::ostringstream new_layout; - for (size_t i = 0; i <= this->ndim(); ++i) { - if (i == target_pos) { - new_layout << size << Layout::to_subdim(dim); - } - if (i == this->ndim()) break; - new_layout << this->at(i); - } - Layout x(new_layout.str()); - return x; - } - - using iterator = std::vector::const_iterator; - using reverse_iterator = std::vector::const_reverse_iterator; - - /*! \return begin iterator */ - iterator begin() const { - return layout_simplified_.begin(); - } - /*! 
\return end iterator */ - iterator end() const { - return layout_simplified_.end(); - } - /*! \return rbegin iterator */ - reverse_iterator rbegin() const { - return layout_simplified_.rbegin(); - } - /*! \return rend iterator */ - reverse_iterator rend() const { - return layout_simplified_.rend(); - } - - /*! \return number of dimensions */ - size_t ndim() const { - return layout_simplified_.size(); - } - - /*! - * \brief The description of the \p i-th dimension. - * If it is a sub-dimension, the size will be returned as well, - * e.g., 16c. Otherwise a single character is returned, e.g., C. - * \param i The position - * \return the description of the dimension. - */ - std::string at(size_t i) const { - CHECK_LT(i, this->ndim()) << "position " << i - << " exceeds ndim=" << this->ndim(); - std::ostringstream repr; - if (is_subdim(layout_simplified_[i])) { - auto factor = subsizeof(layout_simplified_[i]); - CHECK_GT(factor, 0); - repr << factor; - } - repr << layout_simplified_[i]; - return repr.str(); - } - - /*! - * \brief return the index of the input dimension. - * If it is not found in the layout or the layout is undefined, - * return -1. - * \param dim the input dimension. - * \return the index or -1 if not found. - */ - int32_t indexof(LayoutDim dim) const { - if (!this->defined()) return -1; - else if (is_superdim(dim)) return superdim_pos_[dim - 'A']; - else if (is_subdim(dim)) return subdim_pos_[dim - 'a']; - return -1; - } - - /*! - * \param dim the input super-dimension or sub-dimension. - * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension), - * or the size of \p dim itself (if \p dim is a sub-dimension). - * Return -1 if \p dim is not in the layout or the layout is undefined. - */ - int64_t subsizeof(LayoutDim dim) const { - CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim; - if (!this->defined() || !this->contains(to_subdim(dim))) { - return -1; - } - int idx = to_subdim(dim) - 'a'; - return subdim_size_[idx]; - } - - /*! - * \brief Whether the layout contains a dimension. - * \param dim dimension to be checked. - * \return Whether the layout contains the dimension. - */ - bool contains(LayoutDim dim) const { - if (is_superdim(dim)) { - return superdim_pos_[dim-'A'] >= 0; - } else if (is_subdim(dim)) { - return subdim_pos_[dim-'a'] >= 0; - } - return false; - } - - LayoutDim operator[](size_t i) const { - return layout_simplified_[i]; - } - - /*! \return whether the layout is defined */ - bool defined() const { - return name_ != "__undef__"; - } - - /*! \return the string description of the layout */ - const std::string& name() const { - return name_; - } - - /*! - * \brief Write layout in JSON format. - * \param writer JSONWriter - */ - void Save(dmlc::JSONWriter* writer) const { - writer->Write(name_); - } - - /*! - * \brief Load layout from JSON. - * \param reader JSONReader - */ - void Load(dmlc::JSONReader* reader) { - std::string tmp; - reader->Read(&tmp); - this->parse(tmp); - } - - /*! 
- * \brief allow output string of layout to ostream - * \param os the output stream - * \param l the layout - * \return the ostream - */ - friend std::ostream& operator<<(std::ostream& os, const Layout& l) { - os << l.name_; - return os; - } - - private: - static const uint32_t kUniqueDim = 26; - - std::string name_; - int32_t superdim_pos_[kUniqueDim]; - int32_t subdim_pos_[kUniqueDim]; - int64_t subdim_size_[kUniqueDim]; - std::vector layout_simplified_; - - void parse(const std::string& layout) { - name_ = layout; - std::fill_n(superdim_pos_, kUniqueDim, -1); - std::fill_n(subdim_pos_, kUniqueDim, -1); - std::fill_n(subdim_size_, kUniqueDim, -1); - layout_simplified_.clear(); - - if (layout == "__undef__") return; - - int32_t factor = 0; - uint32_t curr = 0; - for (size_t i = 0; i < layout.size(); ++i) { - const LayoutDim c = layout.at(i); - if (is_superdim(c)) { - int pos = c - 'A'; - CHECK_EQ(factor, 0) << "Invalid layout " << layout - << ": invalid factor size " << factor - << " before dimension " << c; - CHECK_EQ(superdim_pos_[pos], -1) << "Invalid layout " << layout - << ": duplicate dimension " << c; - superdim_pos_[pos] = curr++; - layout_simplified_.push_back(c); - } else if (is_subdim(c)) { - int pos = c - 'a'; - CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " - << factor << " for dimension " << c; - CHECK_EQ(subdim_pos_[pos], -1) << "Invalid layout " << layout - << ": duplicate dimension " << c; - CHECK_EQ(subdim_size_[pos], -1) << "Invalid layout " << layout - << ": duplicate dimension " << c; - subdim_pos_[pos] = curr++; - subdim_size_[pos] = factor; - layout_simplified_.push_back(c); - factor = 0; - } else if (c >= '0' && c <= '9') { - CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number."; - factor = factor * 10 + c - '0'; - } else { - LOG(FATAL) << "Invalid layout " << layout; - } - } - CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout; - for (LayoutDim dim : layout_simplified_) { - CHECK(is_superdim(dim) || superdim_pos_[dim-'a'] >= 0) - << "Invalid layout " << layout << ": missing axis " - << static_cast(dim - 'a' + 'A'); - } - } -}; - -/*! 
- * \brief Convert shape in src_layout to shape in dst_layout - * \param src original shape - * \param src_layout layout of original shape - * \param dst_layout target layout - * \return shape in target layout - */ -inline std::vector ConvertLayout( - std::vector src, - const Layout& src_layout, - const Layout& dst_layout) { - CHECK_EQ(src_layout.ndim(), src.size()); - if (src_layout == dst_layout) { - return src; - } else if (!src_layout.defined()) { - LOG(FATAL) << "cannot convert undefined layout to " << dst_layout; - } else if (!dst_layout.defined()) { - LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout"; - } - - CHECK(src_layout.convertible(dst_layout)) - << "cannot convert from " - << src_layout << " to " << dst_layout; - - std::vector dst(dst_layout.ndim()); - for (size_t i = 0; i < src_layout.ndim(); ++i) { - Layout::LayoutDim src_dim = src_layout[i]; - if (Layout::is_superdim(src_dim)) { - int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_dim)); - int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_dim)); - int src_minor_pos = src_layout.indexof(Layout::to_subdim(src_dim)); - int src_factor = src_layout.subsizeof(src_dim); - int dst_factor = dst_layout.subsizeof(src_dim); - IndexExpr src_dim_size = src[i]; - - if (src_minor_pos >= 0) { - CHECK(is_const_int(src[src_minor_pos], src_factor)) - << "src shape " << Array(src) - << " does not agree with layout " - << src_layout; - src_dim_size *= src_factor; - } - dst[dst_major_pos] = src_dim_size; - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) { - CHECK_LE(dst_factor, const_src_dim_size[0]) - << "Converting " << Array(src) - << " from " << src_layout - << " to " << dst_layout - << ": cannot split dimension size of " - << src_dim_size << " by " << dst_factor; - } - dst[dst_major_pos] /= dst_factor; - dst[dst_minor_pos] = dst_factor; - } - } - } - return dst; -} - -inline std::vector ConvertLayout( - const Array& src, - const Layout& src_layout, - const Layout& dst_layout) { - std::vector ret(src.size()); - for (size_t i = 0; i < src.size(); ++i) { - ret[i] = src[i]; - } - return ConvertLayout(ret, src_layout, dst_layout); -} - -} // namespace relay -} // namespace tvm -#endif // TVM_RELAY_OP_NN_LAYOUT_H_ diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index d00f05cfc6fe3..d3b454f35ede3 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -13,7 +13,7 @@ #include #include "../type_relations.h" #include "../op_common.h" -#include "layout.h" +#include "../layout.h" namespace tvm { namespace relay { diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index da7db042178ec..6e02d74e6ea83 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -7,7 +7,7 @@ #include #include #include -#include "layout.h" +#include "../layout.h" namespace tvm { namespace relay { diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 0e54564e00326..0af0bbf636336 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -8,7 +8,7 @@ #include #include #include -#include "layout.h" +#include "../layout.h" namespace tvm { namespace relay { @@ -33,13 +33,13 @@ bool Pool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.contains('H') && layout.contains('W') && - !layout.contains('h') && !layout.contains('w')) + CHECK(layout.Contains('H') && layout.Contains('W') && + !layout.Contains('h') && !layout.Contains('w')) << "Invalid 
layout " << layout << ". Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.indexof('H'); - const auto widx = layout.indexof('W'); + const auto hidx = layout.Indexof('H'); + const auto widx = layout.Indexof('W'); IndexExpr pad_h, pad_w; if (param->padding.size() == 1) { @@ -102,10 +102,10 @@ Array Pool2DCompute(const Attrs& attrs, auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(layout.convertible(Layout("NCHW"))) + CHECK(layout.Convertible(Layout("NCHW"))) << "max_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.indexof('h'), -1) << "max_pool2d does not support input split on height"; - CHECK_EQ(layout.indexof('w'), -1) << "max_pool2d does not support input split on width"; + CHECK_EQ(layout.Indexof('h'), -1) << "max_pool2d does not support input split on height"; + CHECK_EQ(layout.Indexof('w'), -1) << "max_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2D only support 4-D input (e.g., NCHW)" @@ -240,13 +240,13 @@ bool GlobalPool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.contains('H') && layout.contains('W') && - !layout.contains('h') && !layout.contains('w')) + CHECK(layout.Contains('H') && layout.Contains('W') && + !layout.Contains('h') && !layout.Contains('w')) << "Invalid layout " << layout << ". Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.indexof('H'); - const auto widx = layout.indexof('W'); + const auto hidx = layout.Indexof('H'); + const auto widx = layout.Indexof('W'); std::vector oshape({dshape[0], dshape[1], dshape[2], dshape[3]}); oshape[hidx] = oshape[widx] = 1; @@ -264,11 +264,11 @@ Array GlobalPool2DCompute(const Attrs& attrs, const auto* param = attrs.as(); CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.convertible(Layout("NCHW"))) + CHECK(layout.Convertible(Layout("NCHW"))) << "global_avg_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.indexof('h'), -1) + CHECK_EQ(layout.Indexof('h'), -1) << "global_avg_pool2d does not support input split on height"; - CHECK_EQ(layout.indexof('w'), -1) + CHECK_EQ(layout.Indexof('w'), -1) << "global_avg_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc index 45bedd73c4c06..ed7b8449eace2 100644 --- a/src/relay/op/nn/upsampling.cc +++ b/src/relay/op/nn/upsampling.cc @@ -5,7 +5,7 @@ */ #include #include -#include "layout.h" +#include "../layout.h" namespace tvm { namespace relay { @@ -25,7 +25,7 @@ bool UpSamplingRel(const Array& types, const UpSamplingAttrs* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->layout); - CHECK(in_layout.convertible(kNCHW)) + CHECK(in_layout.Convertible(kNCHW)) << "UpSampling only support input layouts that are convertible from NCHW." 
<< " But got " << in_layout; diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc index d3f7043088ebe..96fe030c2d030 100644 --- a/src/relay/pass/fold_scale_axis.cc +++ b/src/relay/pass/fold_scale_axis.cc @@ -11,7 +11,8 @@ #include #include "pattern_util.h" #include "pass_util.h" -#include "../op/nn/layout.h" +#include "../op/layout.h" + namespace tvm { namespace relay { @@ -378,8 +379,8 @@ Array Conv2DForwardPrep(const Call& call, AxesSet out) { CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout weight_layout(param->weight_layout); - int c_big_axis = data_layout.indexof('C'); - int c_small_axis = data_layout.indexof('c'); + int c_big_axis = data_layout.Indexof('C'); + int c_small_axis = data_layout.Indexof('c'); CHECK_GE(c_big_axis, 0); AxesSet data_axes = NullValue(); @@ -391,7 +392,7 @@ Array Conv2DForwardPrep(const Call& call, AxesSet out) { // only handle depthwise or full conv2d. // TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout); - if (weight_layout.indexof('i') < 0 && + if (weight_layout.Indexof('i') < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { data_axes = {c_big_axis}; @@ -412,15 +413,15 @@ Expr Conv2DForwardRewrite(const Call& ref_call, CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout weight_layout(param->weight_layout); - int c_big_axis = data_layout.indexof('C'); + int c_big_axis = data_layout.Indexof('C'); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(weight_layout.indexof('i'), -1); + CHECK_EQ(weight_layout.Indexof('i'), -1); CHECK(sdata->axes.size() == 1 && c_big_axis == sdata->axes[0]->value); - int big_oc_axis = weight_layout.indexof('O'); - int big_ic_axis = weight_layout.indexof('I'); + int big_oc_axis = weight_layout.Indexof('O'); + int big_ic_axis = weight_layout.Indexof('I'); // Check it must be depthwise or full conv2d. bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, weight_layout); @@ -779,8 +780,8 @@ AxesSet Conv2DBackwardPrep(const Call& call, const Array& in_axes) { out_layout = Layout(param->data_layout); } Layout weight_layout(param->weight_layout); - int c_big_axis = out_layout.indexof('C'); - int c_small_axis = out_layout.indexof('c'); + int c_big_axis = out_layout.Indexof('C'); + int c_small_axis = out_layout.Indexof('c'); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) @@ -791,8 +792,8 @@ AxesSet Conv2DBackwardPrep(const Call& call, const Array& in_axes) { // only handle depthwise or full conv2d. 
// TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout); - if (weight_layout.indexof('o') < 0 && - weight_layout.indexof('i') < 0 && + if (weight_layout.Indexof('o') < 0 && + weight_layout.Indexof('i') < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { return {c_big_axis}; @@ -816,16 +817,16 @@ Expr Conv2DBackwardTransform(const Call& call, out_layout = Layout(param->data_layout); } Layout weight_layout(param->weight_layout); - int c_big_axis = out_layout.indexof('C'); + int c_big_axis = out_layout.Indexof('C'); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(weight_layout.indexof('o'), -1); - CHECK_EQ(weight_layout.indexof('i'), -1); + CHECK_EQ(weight_layout.Indexof('o'), -1); + CHECK_EQ(weight_layout.Indexof('i'), -1); CHECK(axes.size() == 1 && c_big_axis == axes[0]->value); - int big_oc_axis = weight_layout.indexof('O'); + int big_oc_axis = weight_layout.Indexof('O'); // Check it must be depthwise or full conv2d. bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout); CHECK(param->groups == 1 || is_depthwise_conv2d); diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index f8e67bac33c52..1c855d9a53cb4 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -11,7 +11,8 @@ #include #include #include -#include "../op/nn/layout.h" +#include "../op/layout.h" + namespace tvm { namespace relay { From 77869913ba349ef642ec8aec76d439e0c29a641f Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi Date: Wed, 21 Nov 2018 23:25:15 +0530 Subject: [PATCH 21/30] tensorflow frontend supports user given outputs (#1913) --- nnvm/python/nnvm/frontend/tensorflow.py | 17 ++- .../frontend/tensorflow/test_forward.py | 120 ++++++++++++------ 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index e7282eb9afd6b..13ed717b04509 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -1039,7 +1039,7 @@ def __init__(self): self._num_param = 0 self._num_rnn_layer = False - def from_tensorflow(self, graph, layout="NHWC", shape=None): + def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): """Construct nnvm nodes from tensorflow graph definition - GraphDef. Follow the tensorflow graph definition to parse and convert it to NNVM. @@ -1086,6 +1086,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None): raise NotImplementedError( \ "The following operators are not implemented: {}".format(missing_operators)) + final_op = None # Parse the nodes to re-create TF graph using Symbol API of NNVM for node in graph.node: # Tensorflow doesn't have seperate list for params extraction. @@ -1165,6 +1166,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None): # Assuming only one output. 
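                # --- Editor's sketch, not part of the patch: the default-
                # output logic this change introduces.  The converter keeps a
                # running `final_op`; the last node converted is the fallback
                # output, and caller-supplied `outputs` names override it:
                #     out = [final_op] if outputs is None \
                #           else [self._nodes[n] for n in outputs]
                # which is the selection performed further down in this hunk. ---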
self._nodes[node.name] = op + final_op = op # Infer shapes if passed explicitely node_output = self._nodes[node.name] @@ -1175,13 +1177,16 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None): _, out_shapes = graph_util.infer_shape(g, **shape_dict) self._output_shapes[node.name] = out_shapes - # Assume the final node is the output node - out = node_output + out = [] + if outputs is None: + out.append(final_op) + else: + out = [self._nodes[out_name] for out_name in outputs] #Add the RNN outputs also with 'head' nodes of the nnvm graph if self._num_rnn_layer: out_rnn = _sym.concatenate(*self._out_rnn, axis=0) - out = [out, out_rnn] + out.append(out_rnn) if isinstance(out, list): out = _sym.Group(out) @@ -1378,7 +1383,7 @@ def _fix_extranodes(self, op_name, attr, inputs): return inputs -def from_tensorflow(graph, layout="NHWC", shape=None): +def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): """ Load tensorflow graph which is a python tensorflow graph object into nnvm graph. The companion parameters will be handled automatically. @@ -1396,5 +1401,5 @@ def from_tensorflow(graph, layout="NHWC", shape=None): Dict of converted parameters stored in tvm.ndarray format """ g = GraphProto() - sym, params = g.from_tensorflow(graph, layout, shape) + sym, params = g.from_tensorflow(graph, layout, shape, outputs) return sym, params diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 62d3577ba10ae..e93f14ceb9689 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -26,8 +26,15 @@ ####################################################################### # Generic run functions for TVM & tensorflow # ------------------------------------------ -def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm'): +def convert_to_list(x): + if not isinstance(x, list): + x = [x] + return x + +def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm', out_names=None): """ Generic function to compile on nnvm and execute on tvm """ + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) layout = None if target == "cuda": @@ -43,8 +50,8 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm' else: shape_dict = {input_node: input_data.shape} dtype_dict = {input_node: input_data.dtype} - - sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict) + + sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict, outputs=out_names) graph, lib, params = nnvm.compiler.build(sym, target=target, target_host=target_host, shape=shape_dict, dtype=dtype_dict, params=params) @@ -52,37 +59,34 @@ def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm' from tvm.contrib import graph_runtime m = graph_runtime.create(graph, lib, ctx) # set inputs - if isinstance(input_data, list): - for i, e in enumerate(input_node): - m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) - else: - m.set_input(input_node, tvm.nd.array(input_data.astype(input_data.dtype))) + for i, e in enumerate(input_node): + m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) m.set_input(**params) # execute m.run() # get outputs - if num_output > 1: - tvm_output_list = [] - for i in range(0, num_output): - tvm_output = m.get_output(i) - 
tvm_output_list.append(tvm_output.asnumpy()) - return tvm_output_list - else: - tvm_output = m.get_output(0) - return tvm_output.asnumpy() + assert out_names is None or num_output == len(out_names),"out_names: {} num_output: {}".format( + out_names, num_output) + tvm_output_list = [] + for i in range(0, num_output): + tvm_output = m.get_output(i) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list def run_tf_graph(sess, input_data, input_node, output_node): """ Generic function to execute tensorflow """ + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) + output_node = convert_to_list(output_node) - tensor = sess.graph.get_tensor_by_name(output_node) + tensor = [0] * len(output_node) + for i in range(len(output_node)): + tensor[i] = sess.graph.get_tensor_by_name(output_node[i]) - if isinstance(input_data, list): - input_dict = {} - for i, e in enumerate(input_node): - input_dict[e] = input_data[i] - else: - input_dict = {input_node: input_data} + input_dict = {} + for i, e in enumerate(input_node): + input_dict[e] = input_data[i] output_data = sess.run(tensor, input_dict) return output_data @@ -91,14 +95,16 @@ def run_tf_graph(sess, input_data, input_node, output_node): def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, no_gpu=False): """Generic function to generate and compare tensorflow and TVM output""" - out_node = out_name.split(':')[0] if ":" in out_name else out_name + out_name = convert_to_list(out_name) + out_node = [0]*len(out_name) + for i in range(len(out_name)): + out_node[i] = out_name[i].split(':')[0] if ":" in out_name[i] else out_name[i] - if isinstance(in_name, list): - in_node = [0]*len(in_name) - for i in range(len(in_name)): - in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] - else: - in_node = in_name.split(':')[0] if ":" in in_name else in_name + in_data = convert_to_list(in_data) + in_name = convert_to_list(in_name) + in_node = [0]*len(in_name) + for i in range(len(in_name)): + in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] with tf.Session() as sess: if init_global_variables: @@ -106,9 +112,8 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, final_graph_def = tf.graph_util.convert_variables_to_constants( sess, sess.graph.as_graph_def(add_shapes=True), - [out_node], + out_node, ) - tf_output = run_tf_graph(sess, in_data, in_name, out_name) for device in ["llvm", "cuda"]: @@ -120,7 +125,10 @@ def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, continue tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target=device) - tvm.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) + # since the names from tensorflow and nnvm runs are not exactly same, + # first len(tf_output) will be compared + for i in range(len(tf_output)): + tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5) sess.close() @@ -259,6 +267,7 @@ def test_forward_reshape(): _test_reshape(np.arange(6), [3, -1]) _test_reshape(np.arange(6), [-1]) +####################################################################### ####################################################################### # Squeeze # ------- @@ -508,6 +517,35 @@ def test_forward_multi_input(): compare_tf_with_tvm([in_data, in_data, in_data, in_data], ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0') +####################################################################### +# Multi Output to Graph 
+# --------------------- + +def test_forward_multi_output(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.int32, shape=[3, 3], name='in1') + in2 = tf.placeholder(tf.int32, shape=[3, 3], name='in2') + in3 = tf.placeholder(tf.int32, shape=[3, 3], name='in3') + in4 = tf.placeholder(tf.int32, shape=[3, 3], name='in4') + + out1 = tf.add(in1, in2, name='out1') + out2 = tf.subtract(in3, in4, name='out2') + in_data = np.arange(9, dtype='int32').reshape([3, 3]) + in_data = [in_data] * 4 + in_name = ['in1:0', 'in2:0', 'in3:0', 'in4:0'] + out_name = ['out1:0', 'out2:0'] + out_node = [out.strip(':0') for out in out_name] + in_node = [inp.strip(':0') for inp in in_name] + + with tf.Session() as sess: + final_graph_def = tf.graph_util.convert_variables_to_constants( + sess, sess.graph.as_graph_def(add_shapes=True), out_node,) + tf_output = run_tf_graph(sess, in_data, in_name, out_name) + tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target='llvm', + out_names=out_node, num_output=2) + for i in range(len(tf_output)): + tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5) + ####################################################################### # Resize Bilinear # --------------- @@ -580,7 +618,7 @@ def _get_tensorflow_output(): out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden)) out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden)) tvm_out = [out, out_state_c, out_state_h] - tvm.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(tf_out[0], tvm_out[0], rtol=1e-3, atol=1e-3) def test_forward_lstm(): '''test LSTM block cell''' @@ -653,7 +691,7 @@ def test_forward_inception_v3(): with tf.Session() as sess: tf_output = run_tf_graph(sess, data, 'input:0', 'InceptionV3/Predictions/Reshape_1:0') tvm_output = run_tvm_graph(graph_def, data, 'input') - tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5) ####################################################################### # Inception V1 @@ -689,7 +727,7 @@ def test_forward_inception_v1(): with tf.Session() as sess: tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0') tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents') - tvm.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5) ####################################################################### # Mobilenet @@ -712,7 +750,7 @@ def test_forward_mobilenet(): graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, out_node) tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0') tvm_output = run_tvm_graph(graph_def, data, 'input') - tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) ####################################################################### # ResnetV2 @@ -731,7 +769,7 @@ def test_forward_resnetv2(): with tf.Session() as sess: tf_output = run_tf_graph(sess, data, 'input_tensor:0', out_node + ':0') tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', tf_output.shape, 'float32') - tvm.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5) 
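#######################################################################
# Editor's note: a minimal usage sketch for the `outputs` parameter the
# test above exercises.  The graph, node names and shapes below are
# hypothetical; the call signature is the one added in this patch.
#
#   sym, params = nnvm.frontend.from_tensorflow(
#       graph_def,                            # a frozen tf.GraphDef
#       layout="NHWC",
#       shape={'in1': (3, 3), 'in2': (3, 3)},
#       outputs=['out1', 'out2'])             # node names, no ':0' suffix
#   # `sym` groups the requested ops; after nnvm.compiler.build() the
#   # graph runtime exposes them as m.get_output(0) and m.get_output(1).
#######################################################################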
####################################################################### # PTB @@ -797,6 +835,7 @@ def _get_sample(data, state): state_output = model.get_output(1, tvm.nd.empty(out_state_shape, "float32")).asnumpy() sample = nnvm.testing.tf.pick_from_weight(tvm_output[0]) + return sample, state_output for x in data: @@ -942,7 +981,7 @@ def test_forward_leaky_relu(): with tf.Graph().as_default(): in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype) tf.nn.leaky_relu(in1, alpha=0.4) - compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu:0') + compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu/mul:0') def test_forward_elu(): ishape = (1, 3, 10, 10) @@ -1042,6 +1081,7 @@ def test_forward_rel_ops(): # General test_forward_multi_input() + test_forward_multi_output() test_forward_variable() # End to End From c1157ecf55fa75959ef76dae229754a1cf60d43b Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 21 Nov 2018 23:27:33 +0530 Subject: [PATCH 22/30] [FRONTEND][TENSORFLOW] Enable strided_slice with fix. (#2002) --- nnvm/python/nnvm/frontend/tensorflow.py | 32 +++++++++++-------- .../frontend/tensorflow/test_forward.py | 8 +++-- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 13ed717b04509..b01d489fb0423 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -569,6 +569,7 @@ def _transform_mask(stride_dim, ellipsis_mask): m_begin = [0] * data_dim m_end = [0] * data_dim m_stride = [0] * data_dim + fshape_indices = [] #Count new axis after ellipsis_mask, consider while applying ellipsis_mask. ellipsis_seen = False new_axes_after_ellipsis = 0 @@ -593,7 +594,10 @@ def _transform_mask(stride_dim, ellipsis_mask): m_begin[final_index] = 0 m_end[final_index] = data_shape[0][final_index] m_stride[final_index] = 1 + fshape_indices.append(final_index) final_index += 1 + elif mask &new_axis_mask: + fshape_indices.append(-1) elif not mask & new_axis_mask: if final_index == len(m_begin): break @@ -614,28 +618,30 @@ def _transform_mask(stride_dim, ellipsis_mask): if begin[index] < 0 else begin[index] m_end[final_index] = begin[index] + 1 m_stride[final_index] = 1 + fshape_indices.append(-2) + else: + fshape_indices.append(final_index) + final_index += 1 - return m_begin, m_end, m_stride + return m_begin, m_end, m_stride, fshape_indices + fshape_indices = None if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask: - begin, end, stride = _transform_mask(stride_dim, ellipsis_mask) + begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask) out = _sym.strided_slice(inputs[0], begin=begin, end=end, stride=stride) out_shape = _infer_out_shapes(out, params)[0] + if not fshape_indices: + fshape_indices = range(len(out_shape)) #Create final output shape. final_output = [] - out_index = 0 - index = 0 - while out_index != len(out_shape): - #axis with shrink_axis_mask dimension=1 and it is ignored. 
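            # --- Editor's aside, not part of the patch: the masks handled
            # here follow tf.strided_slice semantics, bit i of each mask
            # referring to position i of the slice spec.  begin_mask/end_mask
            # ignore begin[i]/end[i] and slice from the tensor edge;
            # ellipsis_mask marks '...'; new_axis_mask inserts a length-1
            # axis; shrink_axis_mask indexes (drops) that dimension.  For
            # example, x[1, ..., np.newaxis] on a (3, 4, 3) tensor lowers to
            #     begin=[1, 0, 0], end=[2, 0, 0], strides=[1, 1, 1],
            #     shrink_axis_mask=0b001, ellipsis_mask=0b010,
            #     new_axis_mask=0b100
            # and yields shape (4, 3, 1), which is what the gathered
            # fshape_indices reconstruct below. ---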
- mask = 1 << index - if (new_axis_mask & mask) and not ellipsis_mask & mask: + for gather_index in fshape_indices: + if gather_index == -1: final_output.append(1) - elif (not mask & shrink_axis_mask) or index >= stride_dim: - #Shrink is considered till stride_dim - final_output.append(out_shape[out_index]) - out_index += 1 - index += 1 + elif gather_index == -2: + pass + else: + final_output.append(out_shape[gather_index]) return _sym.reshape(out, shape=tuple(final_output)) return _impl diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index e93f14ceb9689..c98748c0fc033 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -435,11 +435,15 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype, def test_forward_stridedslice(): '''test StridedSlice''' - return + _test_stridedslice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], 'float32') _test_stridedslice((3, 4, 3), [1, 0], [4, 3], [2, 1], 'float32', ellipsis_mask=8) + _test_stridedslice((3, 4, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2) + _test_stridedslice((3, 4, 5, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2) + _test_stridedslice((3, 4, 5, 3), [1, 0, 1], [4, 2, 2], [2, 1, 1], 'float32', ellipsis_mask=2) _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 2], [2, 1, 1], 'float32', new_axis_mask=5) _test_stridedslice((3, 4, 3), [1, 1, 1], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=4) + _test_stridedslice((6, 4, 5), [1, 1, 1], [6, 3, 4], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=5) _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=4, new_axis_mask=2) _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3) _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3) @@ -1056,7 +1060,7 @@ def test_forward_rel_ops(): test_forward_resize_bilinear() test_forward_pad() test_forward_gather() - #test_forward_stridedslice() + test_forward_stridedslice() # Activations test_forward_sigmoid() From 0edb9443dc1e725037c8c9a47ed906c61073ae70 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 21 Nov 2018 20:58:19 +0300 Subject: [PATCH 23/30] [FRONTEND][TENSORFLOW] Fix a typo in _matmul (#2152) --- nnvm/python/nnvm/frontend/tensorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index b01d489fb0423..b0b546a32b3d5 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -342,7 +342,7 @@ def _matmul(): def _impl(inputs, attr, params): channels = _infer_channels(inputs[1], params, not attr['transpose_b']) if attr['transpose_a']: - inputs[0] = _sym.transpose(inputs[0], axes(1, 0)) + inputs[0] = _sym.transpose(inputs[0], axes=(1, 0)) if not attr['transpose_b']: inputs[1] = _sym.transpose(inputs[1], axes=(1, 0)) return AttrCvt(op_name="dense", From 34648272bb51983e200c9a2478acb4658509eed5 Mon Sep 17 00:00:00 2001 From: "Steven S. 
Lyubomirsky" Date: Wed, 21 Nov 2018 17:11:33 -0500 Subject: [PATCH 24/30] [Relay] Port LSTM to Relay for testing (#2011) --- python/tvm/relay/testing/__init__.py | 1 + python/tvm/relay/testing/layers.py | 4 +- python/tvm/relay/testing/lstm.py | 182 +++++++++++++++++++++ src/relay/op/tensor/transform.cc | 2 +- tests/python/relay/test_ir_text_printer.py | 8 + tests/python/relay/test_op_level3.py | 13 ++ 6 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 python/tvm/relay/testing/lstm.py diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 913f97ecd4a19..43160d64549c9 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -6,4 +6,5 @@ from . import dqn from . import dcgan from . import mobilenet +from . import lstm from .config import ctx_list diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py index 1b279d9e72af7..9d4d3b3b4e133 100644 --- a/python/tvm/relay/testing/layers.py +++ b/python/tvm/relay/testing/layers.py @@ -105,7 +105,7 @@ def conv2d_transpose(data, weight=None, **kwargs): weight = relay.var(name + "_weight") return relay.nn.conv2d_transpose(data, weight, **kwargs) -def dense_add_bias(data, weight=None, bias=None, **kwargs): +def dense_add_bias(data, weight=None, bias=None, units=None, **kwargs): """Wrapper of dense which automatically creates weights if not given. Parameters @@ -133,6 +133,6 @@ def dense_add_bias(data, weight=None, bias=None, **kwargs): weight = relay.var(name + "_weight") if not bias: bias = relay.var(name + "_bias") - data = relay.nn.dense(data, weight, **kwargs) + data = relay.nn.dense(data, weight, units, **kwargs) data = relay.nn.bias_add(data, bias) return data diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py new file mode 100644 index 0000000000000..47e68a988dab7 --- /dev/null +++ b/python/tvm/relay/testing/lstm.py @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Implementation of a Long Short-Term Memory (LSTM) cell. + +Adapted from: +https://gist.github.com/merrymercy/5eb24e3b019f84200645bd001e9caae9 +""" + +from tvm import relay +from . import layers +from .init import create_workload + +def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""): + """Long-Short Term Memory (LSTM) network cell. + + Parameters + ---------- + num_hidden : int + Number of units in output symbol. + + batch_size : int + Batch size (length of states). + + Returns + ------- + result : tvm.relay.Function + A Relay function that evaluates an LSTM cell. + The function takes in a tensor of input data, a tuple of two + states, and weights and biases for dense operations on the + inputs and on the state. 
It returns a tuple with two members, + an output tensor and a tuple of two new states. + """ + builder = relay.ScopeBuilder() + + input_type = relay.TensorType((batch_size, num_hidden), dtype) + weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype) + bias_type = relay.TensorType((4*num_hidden,), dtype) + + dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype) + slice_type = relay.TupleType([input_type, input_type, + input_type, input_type]) + ret_type = relay.TupleType([input_type, + relay.TupleType([input_type, input_type])]) + + inputs = relay.Var("inputs", input_type) + states = relay.Var("states", + relay.TupleType([input_type, input_type])) + + i2h_weight = relay.Var("i2h_weight", weight_type) + i2h_bias = relay.Var("i2h_bias", bias_type) + + h2h_weight = relay.Var("h2h_weight", weight_type) + h2h_bias = relay.Var("h2h_bias", bias_type) + + i2h = builder.let(("i2h", dense_type), + layers.dense_add_bias( + data=inputs, + units=num_hidden * 4, + weight=i2h_weight, bias=i2h_bias, + name="%si2h" % name)) + h2h = builder.let(("h2h", dense_type), + layers.dense_add_bias( + data=relay.TupleGetItem(states, 0), + units=num_hidden * 4, + weight=h2h_weight, bias=h2h_bias, + name="%sh2h" % name)) + + gates = builder.let(("gates", dense_type), relay.add(i2h, h2h)) + slice_gates = builder.let(("slice_gates", slice_type), + relay.split(gates, + indices_or_sections=4, + axis=1).astuple()) + + in_gate = builder.let(("in_gate", input_type), + relay.sigmoid(relay.TupleGetItem(slice_gates, 0))) + forget_gate = builder.let(("forget_gate", input_type), + relay.sigmoid(relay.TupleGetItem(slice_gates, 1))) + in_transform = builder.let(("in_transform", input_type), + relay.tanh(relay.TupleGetItem(slice_gates, 2))) + out_gate = builder.let(("out_gate", input_type), + relay.sigmoid(relay.TupleGetItem(slice_gates, 3))) + + next_c = builder.let(("next_c", input_type), + relay.add(relay.multiply(forget_gate, + relay.TupleGetItem(states, 1)), + relay.multiply(in_gate, in_transform))) + next_h = builder.let(("next_h", input_type), + relay.multiply(out_gate, relay.tanh(next_c))) + ret = builder.let(("ret", ret_type), + relay.Tuple([next_h, relay.Tuple([next_h, next_c])])) + builder.ret(ret) + + body = builder.get() + + return relay.Function([inputs, states, i2h_weight, + i2h_bias, h2h_weight, h2h_bias], + body, ret_type) + + +def get_net(iterations, num_hidden, batch_size=1, dtype="float32"): + '''Constructs an unrolled RNN with LSTM cells''' + input_type = relay.TensorType((batch_size, num_hidden), dtype) + weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype) + bias_type = relay.TensorType((4*num_hidden,), dtype) + + state_type = relay.TupleType([input_type, input_type]) + cell_type = relay.TupleType([input_type, state_type]) + + builder = relay.ScopeBuilder() + + zeros = builder.let(("zeros", input_type), + relay.zeros((batch_size, num_hidden), dtype)) + init_states = builder.let(("init_states", state_type), + relay.Tuple([zeros, zeros])) + + states = init_states + out = None + + for i in range(iterations): + inputs = relay.Var("data", input_type) + i2h_weight = relay.Var("i2h_%s_weight" % i, weight_type) + i2h_bias = relay.Var("i2h_%i_bias" % i, bias_type) + h2h_weight = relay.Var("h2h_%s_weight" % i, weight_type) + h2h_bias = relay.Var("h2h_%s_bias" % i, bias_type) + + cell_fn = lstm_cell(num_hidden, batch_size, dtype, "lstm_%s" % i) + + call = builder.let(("call_%s" % i, cell_type), + relay.Call(cell_fn, + [inputs, states, i2h_weight, + i2h_bias, h2h_weight, h2h_bias])) + 
new_out = builder.let(("out_%s" % i, input_type),
+                              relay.TupleGetItem(call, 0))
+        new_states = builder.let(("states_%s" % i, state_type),
+                                 relay.TupleGetItem(call, 1))
+        states = new_states
+        out = new_out
+
+    builder.ret(out)
+    body = builder.get()
+    args = relay.ir_pass.free_vars(body)
+    return relay.Function(args, body, input_type)
+
+
+def get_workload(iterations, num_hidden, batch_size=1, dtype="float32"):
+    """Get benchmark workload for an LSTM RNN.
+
+    Parameters
+    ----------
+    iterations : int
+        The number of iterations in the desired LSTM RNN.
+    num_hidden : int
+        The size of the hidden state
+    batch_size : int, optional (default 1)
+        The batch size used in the model
+    dtype : str, optional (default "float32")
+        The data type
+    Returns
+    -------
+    net : tvm.relay.Function
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(iterations, num_hidden, batch_size, dtype)
+    return create_workload(net)
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 7a3a2151158d2..a9e0a969fc5bb 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1078,7 +1078,7 @@ bool SplitRel(const Array<Type>& types,
   }
   CHECK_LT(axis, data->shape.size())
       << "axis should be within the input dimension range.";
-  CHECK_GT(axis, 0)
+  CHECK_GE(axis, 0)
       << "axis should be within the input dimension range.";
 
   if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index d12804d512f09..30130fd7bcac7 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -97,10 +97,12 @@ def test_variable_name():
     v1 = relay.var("1")
     assert "%v1" in v1.astext()
 
+
 def test_mlp():
     net, params = tvm.relay.testing.mlp.get_workload(batch_size=1)
     net.astext()
 
+
 def test_resnet():
     net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
     net.astext()
@@ -117,6 +119,12 @@ def test_dcgan():
     net, params = tvm.relay.testing.dcgan.get_workload(batch_size=1)
     net.astext()
 
+
+def test_lstm():
+    net, params = tvm.relay.testing.lstm.get_workload(4, 4)
+    net.astext()
+
+
 if __name__ == "__main__":
     do_print[0] = True
     test_resnet()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 22469cc7fdbe9..6f8fbd5512937 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -161,6 +161,14 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None):
                           relay.ty.TensorType((5, 1, 2, 2), "float32"),
                           relay.ty.TensorType((5, 1, 2, 2), "float32")])),
                   axis=1)
+    verify_split((5, 5, 2, 2), 5,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32")])),
+                 axis=0)
     verify_split((d1, d2, d3, d4), 4,
                  relay.ty.TupleType(tvm.convert([
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
@@ -168,6 +176,11 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None):
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
                      relay.ty.TensorType((d1, d2, d3/4, d4), "float32")])),
                  axis=2)
+    verify_split((d1, d2, d3, d4), 2,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32")])),
+                 axis=0)
     verify_split((d1, d2, d3, d4), (2, 4,
7), relay.ty.TupleType(tvm.convert([ relay.ty.TensorType((d1, 2, d3, d4), "float32"), From bac220738dbb2eb3e26696285b5caf5a671074df Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Nov 2018 06:12:02 +0800 Subject: [PATCH 25/30] Alter op layout for group_conv2d on CUDA (#2148) --- nnvm/python/nnvm/top/nn.py | 4 +-- topi/python/topi/cuda/conv2d_winograd.py | 38 +++++++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 2069a0a5ad50a..74196c0787987 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -108,7 +108,7 @@ def compute_conv2d(attrs, inputs, _): groups == channels: out = topi.nn.depthwise_conv2d_nchw( inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) - elif layout == "NCHW": + elif layout in ["NCHW", "NCHW4c"]: out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, out_dtype=out_dtype) elif layout == "NHWC" and \ @@ -146,7 +146,7 @@ def schedule_conv2d(attrs, outs, target): return topi.generic.schedule_depthwise_conv2d_nchw(outs) elif groups == channels and layout == "NHWC" and kernel_layout == "HWOI": return topi.generic.schedule_depthwise_conv2d_nhwc(outs) - elif layout == "NCHW": + elif layout in ["NCHW", "NCHW4c"]: return topi.generic.schedule_group_conv2d_nchw(outs) else: raise ValueError("No compatible schedule") diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 1f2112979ee74..d32a87ba6b9d3 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -7,7 +7,7 @@ from tvm import autotvm from .. import nn -from ..nn import conv2d, conv2d_winograd_without_weight_transform +from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline from ..generic import schedule_conv2d_winograd_without_weight_transform @@ -353,12 +353,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): CO, _, KH, KW = get_const_tuple(kernel.shape) dispatch_ctx = autotvm.DispatchContext.current + target = tvm.target.current_target() if groups == 1: # query config of this workload - workload = ('conv2d',) + autotvm.task.args_to_workload( - [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype]) - target = tvm.target.current_target() + workload = autotvm.task.args_to_workload( + [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d) cfg = autotvm.DispatchContext.current.query(target, workload) if cfg.is_fallback: # if is fallback, clear query cache and return None @@ -411,6 +411,36 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): ) dispatch_ctx.update(target, new_workload, cfg) return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) + elif groups != CI: + workload = autotvm.task.args_to_workload( + [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype], + group_conv2d_nchw) + cfg = autotvm.DispatchContext.current.query(target, workload) + + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + if cfg.template_key == 'int8': + assert 'cuda' in target.keys + new_layout = 'NCHW4c' + new_attrs['layout'] = new_layout + new_attrs['out_layout'] = new_layout + new_attrs['kernel_layout'] = 'OIHW4o4i' + ic_block_factor = oc_block_factor = 4 + + # Store the same config for the 
altered operator (workload) + new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,\ + KH, KW, oc_block_factor, ic_block_factor), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, groups, out_dtype], + group_conv2d_nchw + ) + dispatch_ctx.update(target, new_workload, cfg) + return sym.conv2d(*copy_inputs, **new_attrs) # do nothing for depthwise convolution return None From f3ae3f20966e03ae8302b822aa9fda6ff0f04aa9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 21 Nov 2018 14:12:30 -0800 Subject: [PATCH 26/30] [TOPI] Fix atlest1d for reduce and squeeze (#2147) --- nnvm/include/nnvm/compiler/util.h | 11 ++ nnvm/src/top/tensor/reduce.cc | 34 +++--- nnvm/src/top/tensor/transform.cc | 4 +- topi/include/topi/detail/fuse.h | 14 +-- topi/include/topi/nn/l2_normalize.h | 2 +- topi/include/topi/nn/softmax.h | 2 +- topi/include/topi/reduction.h | 109 +++++++++++-------- topi/include/topi/transform.h | 22 ++-- topi/python/topi/cuda/reduction.py | 6 +- topi/src/topi.cc | 4 +- topi/tests/python/test_topi_reduce.py | 4 + topi/tests/python/test_topi_transform.py | 5 +- topi/tests/python_cpp/test_topi_transform.py | 5 +- 13 files changed, 125 insertions(+), 97 deletions(-) diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h index 5d5bc4478530a..0f7fb2a5c8752 100644 --- a/nnvm/include/nnvm/compiler/util.h +++ b/nnvm/include/nnvm/compiler/util.h @@ -28,6 +28,17 @@ inline tvm::Array ShapeToArray(TShape shape) { return result; } +/* + * \brief Helper function to convert TShape to TVM array. Useful for + * passing data from NNVM param structures to TOPI ops. + * + * \param shape The shape to convert + * + * \return An Array of Expr, where each element is a constant int32 + */ +inline tvm::Array ShapeToIntArray(TShape shape) { + return tvm::Array(ShapeToArray(shape).node_); +} } // namespace compiler } // namespace nnvm #endif // NNVM_COMPILER_UTIL_H_ diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc index 7b768ac643042..007a3cc6e3fb6 100644 --- a/nnvm/src/top/tensor/reduce.cc +++ b/nnvm/src/top/tensor/reduce.cc @@ -3,9 +3,6 @@ * \file reduce.cc * \brief reduce operator. 
*/ -// Enforce TOPI to use old behavior that reduces to at least 1d -#define TOPI_REDUCE_ATLEAST1D 1 - #include #include #include @@ -20,13 +17,12 @@ #include "topi/reduction.h" #include "topi/transform.h" -static_assert(TOPI_REDUCE_ATLEAST1D, "need to use legacy reduce behavior"); - namespace nnvm { namespace top { using namespace tvm; using namespace nnvm::compiler; + // reduce DMLC_REGISTER_PARAMETER(ReduceParam); @@ -168,9 +164,9 @@ Example:: TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); return Array{ - topi::sum(inputs[0], axis, param.keepdims) }; + topi::sum(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -202,9 +198,9 @@ NNVM_REGISTER_REDUCE_OP(max) const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); return Array{ - topi::max(inputs[0], axis, param.keepdims) }; + topi::max(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -235,9 +231,9 @@ NNVM_REGISTER_REDUCE_OP(min) const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); return Array{ - topi::min(inputs[0], axis, param.keepdims) }; + topi::min(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -299,8 +295,8 @@ values over a given axis. const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); - Tensor out = topi::argmax(inputs[0], axis, param.keepdims); + auto axis = ShapeToIntArray(r_axes); + Tensor out = topi::argmax(inputs[0], axis, param.keepdims, true); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; }); @@ -322,8 +318,8 @@ values over a given axis. 
const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); - Tensor out = topi::argmin(inputs[0], axis, param.keepdims); + auto axis = ShapeToIntArray(r_axes); + Tensor out = topi::argmin(inputs[0], axis, param.keepdims, true); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; }); @@ -352,7 +348,7 @@ Example:: TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); Expr count = make_const(inputs[0]->dtype, 1); for (auto& i : r_axes) { @@ -360,7 +356,7 @@ Example:: } return Array{ - topi::divide(topi::sum(inputs[0], axis, param.keepdims), count) }; + topi::divide(topi::sum(inputs[0], axis, param.keepdims, true), count) }; }); NNVM_REGISTER_REDUCE_OP(prod) @@ -387,9 +383,9 @@ Example:: TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); return Array{ - topi::prod(inputs[0], axis, param.keepdims) }; + topi::prod(inputs[0], axis, param.keepdims, true) }; }); diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 2f42727d60837..492208ed7a7c5 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -756,8 +756,8 @@ Examples:: const Array& inputs, const Array& out_info) { const SqueezeParam& param = nnvm::get(attrs.parsed); - auto axis = ShapeToArray(param.axis); - return Array{ topi::squeeze(inputs[0], axis) }; + auto axis = ShapeToIntArray(param.axis); + return Array{ topi::squeeze(inputs[0], axis, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, diff --git a/topi/include/topi/detail/fuse.h b/topi/include/topi/detail/fuse.h index 9ee7fbd1cffd4..85ca0f9efacb0 100644 --- a/topi/include/topi/detail/fuse.h +++ b/topi/include/topi/detail/fuse.h @@ -14,22 +14,16 @@ using namespace tvm; /*! 
* \brief Fuse all of the given args - * + * * \param stage The stage in which to apply the fuse * \param args The iteration variables to be fused * * \return The fused iteration variable */ inline IterVar Fuse(Stage stage, const Array& args) { - CHECK_GE(args.size(), 1) << "Fuse requires at least 1 arg"; - - auto fused = args[0]; - for (size_t i = 1; i < args.size(); ++i) { - IterVar out; - stage.fuse(fused, args[i], &out); - fused = out; - } - return fused; + IterVar res; + stage.fuse(args, &res); + return res; } } // namespace detail diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index cda1f3b5c8134..6d98a75ec1571 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -27,7 +27,7 @@ using namespace tvm; */ inline Tensor l2_normalize(const Tensor& data, float eps, - const Array& axis, + const Array& axis, std::string name = "tensor", std::string tag = "l2_normalize") { CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input"; diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h index d17f93046e72b..8ee747ccd07c0 100644 --- a/topi/include/topi/nn/softmax.h +++ b/topi/include/topi/nn/softmax.h @@ -40,7 +40,7 @@ inline Tensor softmax(const Tensor &x, auto k1 = tvm::reduce_axis(Range(0, input_shape[axis]), "k1"); auto k2 = tvm::reduce_axis(Range(0, input_shape[axis]), "k2"); - auto reduced_shape = MakeReduceTargetShape({axis}, x, false); + auto reduced_shape = MakeReduceTargetShape({axis}, x, false, false); auto insert_reduce_index = [axis, ndim](const Array &indices, const IterVar &reduce_index) { diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h index 777c103ec950c..f26d14951fd49 100644 --- a/topi/include/topi/reduction.h +++ b/topi/include/topi/reduction.h @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -20,13 +19,6 @@ #include "topi/detail/constant_utils.h" #include "tvm/tvm.h" -/*! - * \brief macro flag to enable some legacy behavior which requires - * reduction result to be at least 1d. - */ -#ifndef TOPI_REDUCE_ATLEAST1D -#define TOPI_REDUCE_ATLEAST1D 0 -#endif namespace topi { using namespace tvm; @@ -42,30 +34,34 @@ using FCommReduce = std::function< * \brief Convert a reduction axis which could be empty or have negative * elements into a real axis with valid dimension indices. * +* \param ndim Number of dimensions in the target. +* \param axis The axis parameter. +* * \return A non-empty sorted array of valid dimension indices, with no duplicates. * If the input axis is empty, the result will be an axis including all dimensions. * If any input element is negative, it will be treated as an offset from the * last dimension (same as python indexing rules). 
*/ -inline std::vector GetRealAxis(int ndim, const std::vector& axis) { +inline std::vector GetRealAxis(int ndim, const Array& axis) { std::vector real_axis; - if (axis.size() == 0) { + if (!axis.defined() || axis.size() == 0) { for (int i = 0; i < ndim; ++i) { real_axis.push_back(i); } } else { // Use a set so duplicates are removed and the dims are sorted - std::set dims; - for (auto ele : axis) { - if (ele < 0) { - ele += ndim; - } - if (ele >= ndim) { - LOG(ERROR) << ele << " exceeds the maximum dimension " << ndim; + for (auto elem : axis) { + int64_t val = elem->value; + if (val < 0) { + val += ndim; } - dims.emplace(ele); + CHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim; + CHECK_GE(val, 0); + real_axis.push_back(static_cast(val)); } - std::copy(dims.begin(), dims.end(), std::back_inserter(real_axis)); + std::sort(real_axis.begin(), real_axis.end()); + real_axis.resize( + std::unique(real_axis.begin(), real_axis.end()) - real_axis.begin()); } return real_axis; } @@ -84,7 +80,8 @@ inline Array MakeReduceAxes(const std::vector& real_axis, const Te /*! \brief Calculate the target shape for a reduce op */ inline Array MakeReduceTargetShape(const std::vector& real_axis, const Tensor& data, - bool keepdims) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); Array target_shape; if (keepdims) { @@ -104,7 +101,7 @@ inline Array MakeReduceTargetShape(const std::vector& real_axis, } } } - if (target_shape.size() == 0 && TOPI_REDUCE_ATLEAST1D) { + if (target_shape.size() == 0 && atleast1d) { target_shape.push_back(1); } return target_shape; @@ -163,18 +160,19 @@ inline Tensor DoCommReduce(const Tensor& data, * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. + * \param atleast1d Whether the output need to be atleast1d. * * \return The result tensor. */ inline Tensor CommReduce(const Tensor& data, - const Array& axis, + const Array& axis, FReduce func, - bool keepdims = false) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; - auto axis_val = detail::GetConstIntValues(axis, "axis"); - auto real_axis = GetRealAxis(static_cast(ndim), axis_val); - auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + auto real_axis = GetRealAxis(static_cast(ndim), axis); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); return DoCommReduce(data, func, target_shape, real_axis, keepdims ? std::vector() : real_axis); } @@ -188,19 +186,20 @@ inline Tensor CommReduce(const Tensor& data, * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return The result tensor. 
*/ inline Tensor CommReduceIdx(const Tensor& data, - const Array& axis, + const Array& axis, FCommReduce func, - bool keepdims = false) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; - auto axis_val = detail::GetConstIntValues(axis, "axis"); - auto real_axis = GetRealAxis(static_cast(ndim), axis_val); + auto real_axis = GetRealAxis(static_cast(ndim), axis); auto reduce_axes = MakeReduceAxes(real_axis, data); - auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data] (const Array& indices) { @@ -311,11 +310,15 @@ inline Expr ProdOp(Expr source, Array axis) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the sum operation */ -inline Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { - return CommReduce(data, axis, tvm::sum, keepdims); +inline Tensor sum(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { + return CommReduce(data, axis, tvm::sum, keepdims, atleast1d); } inline Tensor collapse_sum(const Tensor& data, Array target_shape) { @@ -356,11 +359,15 @@ inline Tensor collapse_sum(const Tensor& data, Array target_shape) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the min operation */ -inline Tensor min(const Tensor& data, Array axis, bool keepdims = false) { - return CommReduce(data, axis, MinOp, keepdims); +inline Tensor min(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { + return CommReduce(data, axis, MinOp, keepdims, atleast1d); } /*! @@ -373,11 +380,15 @@ inline Tensor min(const Tensor& data, Array axis, bool keepdims = false) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the max operation */ -inline Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) - return CommReduce(data, axis, MaxOp, keepdims); +inline Tensor max(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { + return CommReduce(data, axis, MaxOp, keepdims, atleast1d); } /*! @@ -390,10 +401,14 @@ inline Tensor max(const Tensor& data, Array axis, bool keepdims = false) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. 
* * \return A Tensor whose op member is the argmin operation */ -inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmin(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0])); // idx @@ -407,7 +422,7 @@ inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false return result; }; auto func = MakeCommReducer(fcombine, fidentity, "argmin"); - return CommReduceIdx(data, axis, func, keepdims); + return CommReduceIdx(data, axis, func, keepdims, atleast1d); } /*! @@ -420,10 +435,14 @@ inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the argmax operation */ -inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmax(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0])); // idx @@ -437,7 +456,7 @@ inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false return result; }; auto func = MakeCommReducer(fcombine, fidentity, "argmax"); - return CommReduceIdx(data, axis, func, keepdims); + return CommReduceIdx(data, axis, func, keepdims, atleast1d); } /*! @@ -449,11 +468,15 @@ inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the prod operation */ -inline Tensor prod(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) - return CommReduce(data, axis, ProdOp, keepdims); +inline Tensor prod(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = false) { + return CommReduce(data, axis, ProdOp, keepdims, atleast1d); } } // namespace topi diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index cb09f1cb419e4..9bc62b2c02493 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -196,30 +196,34 @@ inline Tensor reshape(const Tensor& x, * \param x The input tensor * \param axis Indices of the dimensions to remove. If this is empty, * all entries with a constant size of 1 will be removed. + * \param atleast1d Whether the output need to be atleast1d. 
* \param name The name of the operation * \param tag The tag to mark the operation * * \return A Tensor whose op member is the squeeze operation */ inline Tensor squeeze(const Tensor& x, - Array axis, + Array axis, + bool atleast1d = false, std::string name = "tensor", std::string tag = kInjective) { - auto axis_val = GetConstIntValues(axis, "axis"); auto ndim = x->shape.size(); - if (axis_val.size() == 0) { + std::vector axis_val; + if (!axis.defined() || axis.size() == 0) { for (size_t i = 0; i < ndim; ++i) { if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) { axis_val.push_back(static_cast(i)); } } } else { - for (size_t i = 0; i < axis_val.size(); ++i) { - if (axis_val[i] < 0) { - axis_val[i] += static_cast(x->shape.size()); + for (size_t i = 0; i < axis.size(); ++i) { + int64_t val = axis[i]->value; + if (val < 0) { + val += static_cast(x->shape.size()); } - CHECK_EQ(GetConstInt(x->shape[axis_val[i]]), 1) << - "Dimension " << axis[i] << " must have size 1"; + CHECK_EQ(GetConstInt(x->shape[val]), 1) << + "Dimension " << val << " must have size 1"; + axis_val.push_back(val); } } @@ -231,7 +235,7 @@ inline Tensor squeeze(const Tensor& x, out_shape.push_back(x->shape[i]); } } - if (out_shape.size() == 0) { + if (out_shape.size() == 0 && atleast1d) { out_shape.push_back(1); } diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 79fa02156b196..4c5d1a507660e 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -63,10 +63,12 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): sch[temp_val_input].compute_at(sch[real_output], outer_in) else: if is_idx_reduce: + spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis)) + sch[real_output].bind(spatial_axis, tvm.thread_axis("blockIdx.x")) sch[temp_idx_input].compute_at(sch[real_output], - sch[real_output].op.axis[0]) + spatial_axis) sch[temp_val_input].compute_at(sch[real_output], - sch[real_output].op.axis[0]) + spatial_axis) sch[real_output].set_store_predicate(thread_x.equal(0)) return sch diff --git a/topi/src/topi.cc b/topi/src/topi.cc index b47ba1165eb9d..13a5ccad654c0 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -59,9 +59,9 @@ using namespace tvm; using namespace tvm::runtime; /*! 
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index b47ba1165eb9d..13a5ccad654c0 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -59,9 +59,9 @@ using namespace tvm;
 using namespace tvm::runtime;
 
 /*! \brief Canonicalize an argument that may be Array<Expr> or int to Array<Expr> */
-Array<Expr> ArrayOrInt(TVMArgValue arg) {
+Array<Integer> ArrayOrInt(TVMArgValue arg) {
   if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) {
-    Array<Expr> result;
+    Array<Integer> result;
     result.push_back(arg.operator int());
     return result;
   } else {
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 3b3472f538b7e..77a33d86ed3e1 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -97,6 +97,10 @@ def check_device(device):
 
 
 def test_reduce_map():
+    verify_reduce_map_ele(in_shape=(32,),
+                          axis=0,
+                          keepdims=False,
+                          type="argmax")
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
                           axis=(1, 2, 3),
                           keepdims=True,
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index dc3c3fb70b241..84d4aa6dc9520 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -91,10 +91,7 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index 492f1d94c341a..b411375b333ec 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -100,10 +100,7 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

From 58fa05312e34117df4f10fb4136d8bfed1ea4f7f Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky"
Date: Wed, 21 Nov 2018 19:17:33 -0500
Subject: [PATCH 27/30] Reverse shape dims of weight type (#2155)

---
 python/tvm/relay/testing/lstm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py
index 47e68a988dab7..b0915e033ccbf 100644
--- a/python/tvm/relay/testing/lstm.py
+++ b/python/tvm/relay/testing/lstm.py
@@ -49,7 +49,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
     builder = relay.ScopeBuilder()
 
     input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
     bias_type = relay.TensorType((4*num_hidden,), dtype)
 
     dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype)
@@ -116,7 +116,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
 def get_net(iterations, num_hidden, batch_size=1, dtype="float32"):
     '''Constructs an unrolled RNN with LSTM cells'''
     input_type = relay.TensorType((batch_size, num_hidden), dtype)
-    weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
     bias_type = relay.TensorType((4*num_hidden,), dtype)
     state_type = relay.TupleType([input_type, input_type])
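The shape reversal above matches the convention of relay.nn.dense, which computes Y = X * W^T: with X of shape (batch, num_hidden) and Y of shape (batch, 4*num_hidden), W must have shape (4*num_hidden, num_hidden), i.e. (units, input_dim). A small type-level sketch of that arithmetic (shapes and names illustrative):

    from tvm import relay

    batch, num_hidden = 1, 8
    x = relay.var("x", shape=(batch, num_hidden))
    # dense weight is (units, input_dim), hence (4*num_hidden, num_hidden)
    w = relay.var("w", shape=(4 * num_hidden, num_hidden))
    y = relay.nn.dense(x, w, units=4 * num_hidden)
    # expected checked type: Tensor[(1, 32), float32], i.e. (batch, 4*num_hidden)
    print(relay.ir_pass.infer_type(y).checked_type)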
Lyubomirsky" Date: Wed, 21 Nov 2018 19:17:33 -0500 Subject: [PATCH 27/30] Reverse shape dims of weight type (#2155) --- python/tvm/relay/testing/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py index 47e68a988dab7..b0915e033ccbf 100644 --- a/python/tvm/relay/testing/lstm.py +++ b/python/tvm/relay/testing/lstm.py @@ -49,7 +49,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""): builder = relay.ScopeBuilder() input_type = relay.TensorType((batch_size, num_hidden), dtype) - weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype) + weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype) bias_type = relay.TensorType((4*num_hidden,), dtype) dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype) @@ -116,7 +116,7 @@ def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""): def get_net(iterations, num_hidden, batch_size=1, dtype="float32"): '''Constructs an unrolled RNN with LSTM cells''' input_type = relay.TensorType((batch_size, num_hidden), dtype) - weight_type = relay.TensorType((num_hidden, 4*num_hidden), dtype) + weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype) bias_type = relay.TensorType((4*num_hidden,), dtype) state_type = relay.TupleType([input_type, input_type]) From 2bd02e4e33950edf13b0aadc262462587d4fd0fe Mon Sep 17 00:00:00 2001 From: MORINAGA <34588258+imorinaga@users.noreply.github.com> Date: Thu, 22 Nov 2018 16:35:53 +0900 Subject: [PATCH 28/30] [DOCS] fix link (#2157) * fix fname in comment * fix --- apps/howto_deploy/cpp_deploy.cc | 2 +- docs/faq.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index 1fd22e5f2b5f5..9a6c5ebca703a 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -1,7 +1,7 @@ /*! * Copyright (c) 2017 by Contributors * \brief Example code on load and run TVM module.s - * \file cpp_deploy_example.cc + * \file cpp_deploy.cc */ #include #include diff --git a/docs/faq.md b/docs/faq.md index 54df0ced8fa88..9b735e54d5ddf 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -4,7 +4,7 @@ This document contains frequently asked questions. How to Install -------------- -See [Installation](http://tvm.ai/install/) +See [Installation](http://docs.tvm.ai/install/) TVM's relation to Other IR/DSL Projects --------------------------------------- From ff5dffa440a8787e94efe3a69972d7f094a32166 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Nov 2018 10:32:00 -0800 Subject: [PATCH 29/30] [APPS] add an external dll call example (#2156) --- apps/extension/python/tvm_ext/__init__.py | 4 +++- apps/extension/src/tvm_ext.cc | 5 +++++ apps/extension/tests/test_ext.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py index 5045a9ec02e00..25286f67b4f57 100644 --- a/apps/extension/python/tvm_ext/__init__.py +++ b/apps/extension/python/tvm_ext/__init__.py @@ -8,7 +8,9 @@ def load_lib(): """Load library, the functions will be registered into TVM""" curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - lib = ctypes.CDLL(os.path.join(curr_path, "../../lib/libtvm_ext.so")) + # load in as global so the global extern symbol is visible to other dll. 
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index bb8b4b6941873..362ac62dea3da 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -66,6 +66,11 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
   });
 }  // namespace tvm_ext
 
+// External function exposed to runtime.
+extern "C" float TVMTestAddOne(float y) {
+  return y + 1;
+}
+
 // This callback approach allows extension allows tvm to extract
 // This way can be helpful when we want to use a header only
 // minimum version of TVM Runtime.
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index b7b97897a0fa0..def30803135ef 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -49,7 +49,27 @@ def test_extract_ext():
     assert fdict["mul"](3, 4) == 12
 
 
+def test_extern_call():
+    n = 10
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.compute((n,), lambda *i: tvm.call_extern("float32", "TVMTestAddOne", A(*i)), name='B')
+    s = tvm.create_schedule(B.op)
+
+    def check_llvm():
+        if not tvm.module.enabled("llvm"):
+            return
+        f = tvm.build(s, [A, B], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
+    check_llvm()
+
+
 if __name__ == "__main__":
+    test_extern_call()
     test_ext_dev()
     test_ext_vec()
     test_bind_add()
From 53ac89ede7cddd1649b01b2ff10cc67a963757ce Mon Sep 17 00:00:00 2001
From: Wuwei Lin
Date: Fri, 23 Nov 2018 02:41:40 +0800
Subject: [PATCH 30/30] [RELAY][PASS] CombineParallelConv2D (#2089)

---
 python/tvm/relay/build_module.py          |   5 +
 python/tvm/relay/ir_pass.py               |  16 +
 src/relay/pass/combine_parallel_conv2d.cc | 328 ++++++++++++++++++
 src/relay/pass/expr_subst.cc              |  35 ++
 src/relay/pass/expr_subst.h               |  18 +
 src/relay/pass/pattern_util.h             |  18 +
 .../test_pass_combine_parallel_conv2d.py  | 138 ++++++++
 7 files changed, 558 insertions(+)
 create mode 100644 src/relay/pass/combine_parallel_conv2d.cc
 create mode 100644 src/relay/pass/expr_subst.cc
 create mode 100644 src/relay/pass/expr_subst.h
 create mode 100644 tests/python/relay/test_pass_combine_parallel_conv2d.py

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 557e4edac681c..5a45ac276de9d 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -13,6 +13,7 @@
 # List of optimization pass and level when switch on
 OPT_PASS_LEVEL = {
     "SimplifyInference": 0,
+    "CombineParallelConv2D": 1,
     "OpFusion": 1,
     "FoldConstant": 2,
     "FoldScaleAxis": 3,
@@ -144,6 +145,10 @@ def optimize(func, params=None):
         func = ir_pass.infer_type(func)
         func = ir_pass.simplify_inference(func)
 
+    if cfg.pass_enabled("CombineParallelConv2D"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.combine_parallel_conv2d(func)
+
    if cfg.pass_enabled("FoldScaleAxis"):
         func = ir_pass.infer_type(func)
         func = ir_pass.backward_fold_scale_axis(func)
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
index 9d59980f61274..ef0a59cd3f6d5 100644
--- a/python/tvm/relay/ir_pass.py
+++ b/python/tvm/relay/ir_pass.py
@@ -292,3 +292,19 @@ def fuse_ops(expr, opt_level=1):
         Transformed expression, containing fused result.
     """
     return _ir_pass.FuseOps(expr, opt_level)
+
+
+def combine_parallel_conv2d(expr):
+    """Combine multiple conv2d operators into one.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression
+    """
+    return _ir_pass.CombineParallelConv2D(expr)
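Since the registration above puts CombineParallelConv2D at opt level 1, relay.build picks it up by default; it can also be run by hand through ir_pass. A short usage sketch, assuming an already constructed relay function func and that the build-config helper is exposed as relay.build_config (both assumptions, not shown in this patch):

    from tvm import relay

    # explicit invocation; the pass expects a type-checked expression
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.combine_parallel_conv2d(func)

    # implicit invocation, gated by OPT_PASS_LEVEL via the build config
    with relay.build_config(opt_level=1):
        graph, lib, params = relay.build(func, target="llvm")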
diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc
new file mode 100644
index 0000000000000..48d5d77990d67
--- /dev/null
+++ b/src/relay/pass/combine_parallel_conv2d.cc
@@ -0,0 +1,328 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file combine_parallel_conv2d.cc
+ * \brief Combine parallel 2d convolutions into a single convolution.
+ *
+ * This pass replaces convolutions that share the same input node and the same
+ * arguments (except that the number of output channels can be different) with a
+ * single convolution. The weight of the new 2d convolution is the concatenation
+ * of the original weights. Elemwise and broadcast ops following conv2d are also
+ * combined if possible.
+ *
+ * This prevents launching multiple kernels in networks with multiple
+ * convolution branches, such as Inception block.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "./expr_subst.h"
+#include "./pattern_util.h"
+
+
+namespace tvm {
+namespace relay {
+
+using Branch = std::vector<const CallNode*>;
+using Group = std::vector<Branch>;
+
+/*
+  Find parallel branches starting with conv2d as shown below and then group branches by kernel
+  shape and attributes of conv2d. Conv2d can be followed by zero or more elemwise or broadcast ops.
+  Intermediate nodes have exactly one successor. It is possible that branches meet at a point,
+  which should be handled in ParallelConv2DCombiner.
+
+         data
+        /    \
+    conv2d   conv2d
+      |        |
+      op       op
+      |        |
+*/
+class BranchGroupFinder : private ExprVisitor {
+ public:
+  std::vector<Group> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+
+    std::vector<Group> groups;
+    for (const auto& root : conv_roots_) {
+      const auto& convs = children_map_.at(root);
+      for (const CallNode* conv : convs) {
+        auto&& branch = CreateBranch(conv);
+        // add the branch to a group, or create a new group
+        auto it = std::find_if(groups.begin(), groups.end(), [&](const Group& group) {
+          CHECK(!group.empty() && !group[0].empty());
+          return IsCompatibleConv2D(conv, group[0][0]);
+        });
+        if (it != groups.end()) {
+          it->push_back(branch);
+        } else {
+          groups.emplace_back();
+          // each group has at least one branch
+          groups.back().push_back(branch);
+        }
+      }
+    }
+    return groups;
+  }
+
+ private:
+  std::unordered_set<Expr, NodeHash, NodeEqual> conv_roots_;
+  std::unordered_map<Expr, std::vector<const CallNode*>, NodeHash, NodeEqual> children_map_;
+
+  // Two 2d convolutions can be combined if they have the same attributes or
+  // only have different output channels.
+  bool IsCompatibleConv2D(const CallNode* a, const CallNode* b) {
+    AttrsEqual eq;
+    static const Layout kOIHW("OIHW");
+    const auto* attrs_a = a->attrs.as<Conv2DAttrs>();
+    const auto* attrs_b = b->attrs.as<Conv2DAttrs>();
+    CHECK(attrs_a);
+    CHECK(attrs_b);
+    const auto* tweight_a = a->args[1]->type_as<TensorTypeNode>();
+    const auto* tweight_b = b->args[1]->type_as<TensorTypeNode>();
+    const auto shape_a = ConvertLayout(tweight_a->shape, attrs_a->weight_layout, kOIHW);
+    const auto shape_b = ConvertLayout(tweight_b->shape, attrs_b->weight_layout, kOIHW);
+
+    return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) &&
+           eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) &&
+           eq(attrs_a->data_layout, attrs_b->data_layout) &&
+           eq(attrs_a->weight_layout, attrs_b->weight_layout) &&
+           eq(attrs_a->out_dtype, attrs_b->out_dtype) &&
+           eq(attrs_a->out_layout, attrs_b->out_layout) && eq(shape_a[2], shape_b[2]) &&
+           eq(shape_a[3], shape_b[3]);
+  }
+
+  // Create a branch starting from conv2d.
+  Branch CreateBranch(const CallNode* conv) {
+    static auto fpattern = Op::GetAttr<TOpPattern>("TOpPattern");
+    // each branch has at least one element, the first element is always conv2d
+    Branch branch{conv};
+    auto it = children_map_.find(GetRef<Expr>(branch.back()));
+    while (it != children_map_.end() && it->second.size() == 1) {
+      const CallNode* call = it->second[0];
+      auto pattern = fpattern[Downcast<Op>(call->op)];
+      if (pattern <= kBroadcast) {
+        branch.push_back(it->second[0]);
+        it = children_map_.find(GetRef<Expr>(branch.back()));
+      } else {
+        break;
+      }
+    }
+    return branch;
+  }
+
+  void VisitExpr_(const CallNode* n) final {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    ExprVisitor::VisitExpr_(n);
+    if (n->op.same_as(conv2d) && n->attrs.as<Conv2DAttrs>()->groups == 1) {
+      conv_roots_.insert(n->args[0]);
+      children_map_[n->args[0]].push_back(n);
+    } else {
+      for (size_t i = 0; i < n->args.size(); i++) {
+        children_map_[n->args[i]].push_back(n);
+      }
+    }
+  }
+};
+
+class ParallelConv2DCombiner {
+ public:
+  Expr Combine(const Expr& expr) {
+    auto groups = BranchGroupFinder().Find(expr);
+    for (const Group& group : groups) {
+      if (group.size() < 2) continue;
+      CombineBranches(group);
+    }
+    return ExprSubst(expr, std::move(subst_map_));
+  }
+
+ private:
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map_;
+
+  std::tuple<Expr, IndexExpr> TransformWeight(const Group& branches) {
+    int64_t num_filters = 0;  // number of filters of the transformed weight
+    Array<Expr> weights;
+    for (const auto& branch : branches) {
+      auto conv2d = branch[0];
+      weights.push_back(conv2d->args[1]);
+      auto channels = GetConv2DSuperChannelsDim(conv2d);
+      num_filters += channels;
+    }
+    auto index = branches[0][0]->attrs.as<Conv2DAttrs>()->weight_layout.find('O');
+    CHECK_NE(index, std::string::npos);
+    return std::make_tuple(MakeConcatenate(TupleNode::make(weights), index),
+                           MakeConstScalar(Int(32), num_filters));
+  }
+
+  Call MakeCombinedConv2D(const Group& branches) {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    Expr data = branches[0][0]->args[0];
+    Expr new_weight;
+    IndexExpr new_channels;
+    std::tie(new_weight, new_channels) = TransformWeight(branches);
+
+    const CallNode* group_root = branches[0][0];
+    const auto* attrs = group_root->attrs.as<Conv2DAttrs>();
+    CHECK(attrs);
+    const auto new_attrs = make_node<Conv2DAttrs>();
+    new_attrs->strides = attrs->strides;
+    new_attrs->padding = attrs->padding;
+    new_attrs->dilation = attrs->dilation;
+    new_attrs->groups = attrs->groups;
+    new_attrs->kernel_size = attrs->kernel_size;
+    new_attrs->data_layout = attrs->data_layout;
+    new_attrs->weight_layout = attrs->weight_layout;
+    new_attrs->out_layout = attrs->out_layout;
+    new_attrs->out_dtype = attrs->out_dtype;
+    new_attrs->channels = new_channels;
+
+    return CallNode::make(conv2d, {data, new_weight}, Attrs{new_attrs}, {});
+  }
+
+  bool IsArgCompatible(const CallNode* a, const CallNode* b, size_t index, size_t channel_pos) {
+    AttrsEqual eq;
+    auto ta = a->args[index]->type_as<TensorTypeNode>();
+    auto tb = b->args[index]->type_as<TensorTypeNode>();
+    auto toutput_a = a->type_as<TensorTypeNode>();
+    auto toutput_b = b->type_as<TensorTypeNode>();
+
+    if (!eq(ta->dtype, tb->dtype) || ta->shape.size() != tb->shape.size())
+      return false;
+
+    // Position of the 'C' dimension in the argument
+    size_t arg_channel_pos = channel_pos - toutput_a->shape.size() + ta->shape.size();
+
+    // Channel super-dimension should be present and not broadcast
+    if ((arg_channel_pos > channel_pos) ||  // size_t overflow
+        !eq(ta->shape[arg_channel_pos], toutput_a->shape[channel_pos]) ||
+        !eq(tb->shape[arg_channel_pos], toutput_b->shape[channel_pos]))
+      return false;
+
+    for (size_t i = 0; i < ta->shape.size(); i++) {
+      if (i == arg_channel_pos) continue;
+      if (!eq(ta->shape[i], tb->shape[i]))
+        return false;
+    }
+    return true;
+  }
+
+  // Check if the ops at the depth-th level can be combined
+  bool CheckLevel(const Group& branches, size_t depth, size_t channel_pos, size_t parent_index) {
+    const CallNode* call = branches[0][depth];
+    AttrsEqual attrs_equal;
+    // check if all branches in current depth can be combined
+    for (auto it = branches.begin() + 1; it != branches.end(); it++) {
+      const Branch& branch = *it;
+      if (!branch[depth]->op.same_as(call->op) ||
+          !attrs_equal(branch[depth]->attrs, call->attrs) ||
+          branch[depth]->args.size() != call->args.size()) {
+        return false;
+      }
+
+      if (branch[depth]->args[parent_index].get() != branch[depth - 1])
+        return false;
+
+      // Check args
+      for (size_t i = 0; i < call->args.size(); i++) {
+        if (i == parent_index) continue;
+
+        if (!IsArgCompatible(call, branch[depth], i, channel_pos) ||
+            !attrs_equal(call->attrs, branch[depth]->attrs)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Combine args and make the combined CallNode
+  Call MakeCombinedCall(const Expr& data, const Group& branches, size_t depth, size_t channel_pos,
+                        size_t parent_index) {
+    Array<Expr> new_args;
+    const CallNode* call = branches[0][depth];
+    size_t ndim = call->type_as<TensorTypeNode>()->shape.size();
+
+    for (size_t i = 0; i < call->args.size(); i++) {
+      if (i == parent_index) {
+        new_args.push_back(data);
+        continue;
+      }
+      size_t arg_ndim = call->args[i]->type_as<TensorTypeNode>()->shape.size();
+      size_t arg_channel_pos = channel_pos - ndim + arg_ndim;
+      Array<Expr> tuple;
+      for (const auto& branch : branches) {
+        tuple.push_back(branch[depth]->args[i]);
+      }
+      auto concat = MakeConcatenate(TupleNode::make(tuple), arg_channel_pos);
+      new_args.push_back(std::move(concat));
+    }
+    return CallNode::make(call->op, new_args, call->attrs, {});
+  }
+
+  // Replace output of each branch with slices of the combined output
+  void UpdateGroupOutput(const Expr& data, const Group& branches, size_t depth,
+                         size_t channel_pos) {
+    int64_t index = 0;
+    for (const auto& branch : branches) {
+      const CallNode* conv2d = branch[0];
+      int64_t channels = GetConv2DSuperChannelsDim(conv2d);
+      Array<Integer> begin;
+      Array<Integer> end;
+      for (size_t i = 0; i < channel_pos; i++) {
+        begin.push_back(0);
+        end.push_back(NullValue<Integer>());
+      }
+      begin.push_back(index);
+      index += channels;
+      end.push_back(index);
+      auto slice = MakeStridedSlice(data, std::move(begin), std::move(end), Array<Integer>{});
+      subst_map_[GetRef<Expr>(branch[depth])] = slice;
+    }
+  }
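+
+  // Note on the slice bookkeeping in UpdateGroupOutput above: with per-branch
+  // channel counts c1, c2, ..., branch i is given the half-open channel range
+  // [c1 + ... + c(i-1), c1 + ... + ci) of the combined output; `index`
+  // accumulates exactly these running offsets.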
+
+  // Combine branches in a group. Conv2d in different branches in the same group are safe to
+  // combine. Subsequent ops may or may not be combined. We start from conv2d and try to
+  // combine ops from all branches at the same depth.
+  void CombineBranches(const Group& branches) {
+    Call combined = MakeCombinedConv2D(branches);
+    auto conv_param = combined->attrs.as<Conv2DAttrs>();
+    const std::string& layout =
+        conv_param->out_layout == "" ? conv_param->data_layout : conv_param->out_layout;
+    size_t channel_pos = layout.find('C');
+    CHECK_NE(channel_pos, std::string::npos);
+    auto it = std::min_element(branches.begin(), branches.end(),
+                               [](const Branch& branch_a,
+                                  const Branch& branch_b) {
+                                 return branch_a.size() < branch_b.size();
+                               });
+    size_t depth = it->size();
+    size_t i;
+    // starting from 1 to skip the conv2d
+    for (i = 1; i < depth; i++) {
+      size_t parent_index;
+      for (parent_index = 0; parent_index < branches[0][i]->args.size(); parent_index++) {
+        if (branches[0][i]->args[parent_index].get() == branches[0][i - 1]) break;
+      }
+      CHECK_NE(parent_index, branches[0][i]->args.size());
+      if (!CheckLevel(branches, i, channel_pos, parent_index)) break;
+      combined = MakeCombinedCall(combined, branches, i, channel_pos, parent_index);
+    }
+    UpdateGroupOutput(combined, branches, i - 1, channel_pos);
+  }
+};
+
+Expr CombineParallelConv2D(const Expr& expr) { return ParallelConv2DCombiner().Combine(expr); }
+
+TVM_REGISTER_API("relay._ir_pass.CombineParallelConv2D")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = CombineParallelConv2D(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.cc b/src/relay/pass/expr_subst.cc
new file mode 100644
index 0000000000000..586f748abef57
--- /dev/null
+++ b/src/relay/pass/expr_subst.cc
@@ -0,0 +1,35 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file expr_subst.cc
+ * \brief Utility functions for substituting expressions.
+ */
+
+#include
+#include "./expr_subst.h"
+
+namespace tvm {
+namespace relay {
+
+class ExprSubstituter : public ExprMutator {
+ public:
+  explicit ExprSubstituter(std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map)
+      : subst_map_(subst_map) {}
+
+  Expr VisitExpr(const Expr& expr) final {
+    auto it = subst_map_.find(expr);
+    if (it != subst_map_.end()) {
+      return (*it).second;
+    }
+    return ExprMutator::VisitExpr(expr);
+  }
+
+ private:
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map_;
+};
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map) {
+  return ExprSubstituter(std::move(subst_map)).Mutate(expr);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.h b/src/relay/pass/expr_subst.h
new file mode 100644
index 0000000000000..67892b3a0af7d
--- /dev/null
+++ b/src/relay/pass/expr_subst.h
@@ -0,0 +1,18 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file expr_subst.h
+ * \brief Utility functions for substituting expressions.
+ */
+#ifndef TVM_RELAY_PASS_EXPR_SUBST_H_
+#define TVM_RELAY_PASS_EXPR_SUBST_H_
+#include
+#include
+
+namespace tvm {
+namespace relay {
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_EXPR_SUBST_H_
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 1c855d9a53cb4..38ae923c52744 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include "../op/layout.h"
 
@@ -120,6 +121,19 @@ inline bool IsDepthwiseConv2D(const Call& call,
          is_const_int(wshape[1], 1);
 }
 
+/*!
+ * \brief Get super-dimension of output channels of conv2d
+ * \param call The conv2d call.
+ * \return Super-dimension size of output channels of conv2d.
+ */
+inline int64_t GetConv2DSuperChannelsDim(const CallNode* call) {
+  auto param = call->attrs.as<Conv2DAttrs>();
+  auto tweight = call->args[1]->type_as<TensorTypeNode>();
+  auto index = param->weight_layout.find('O');
+  CHECK_NE(index, std::string::npos);
+  auto channels = as_const_int(tweight->shape[index]);
+  return *channels;
+}
 
 /*!
  * \brief Create a Constant with a scalar
@@ -172,6 +186,10 @@ inline Expr ReshapeLike(Expr lhs, Expr rhs) {
   return CallNode::make(op, {lhs, rhs}, Attrs(), {});
 }
 
+Expr MakeConcatenate(Expr data, int axis);
+
+Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
new file mode 100644
index 0000000000000..31dfe095f682e
--- /dev/null
+++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py
@@ -0,0 +1,138 @@
+from tvm import relay
+import numpy as np
+
+
+def test_combine_parallel_conv2d():
+    """Simple testcase."""
+    def before(x, w1, w2, w3, w4):
+        args = [x, w1, w2, w3, w4]
+        y1 = relay.nn.conv2d(x, w1)
+        y2 = relay.nn.conv2d(x, w2)
+        # y3 cannot be combined
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.nn.conv2d(x, w4)
+        y = relay.Tuple((y1, y2, y3, y4))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, w1, w2, w3, w4]
+        w = relay.concatenate((w1, w2, w4), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.strided_slice(y, [0, channels1 + channels2],
+                                 [None, channels1 + channels2 + channels4])
+        y = relay.Tuple((y1, y2, y3, y4))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2, channels3, channels4):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        w3 = relay.var("w3", shape=(channels3, in_c, 3, 3))
+        w4 = relay.var("w4", shape=(channels4, in_c, 1, 1))
+
+        y_before = before(x, w1, w2, w3, w4)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 4, 4, 4)
+    check((1, 4, 16, 16), 4, 8, 4, 7)
+
+
+def test_combine_parallel_conv2d_scale_relu():
+    """Testcase of combining conv2d + scale + relu"""
+    def before(x, w1, w2, scale1, scale2, bias):
+        args = [x, w1, w2, scale1, scale2, bias]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y2 = relay.nn.relu(y2)
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2, bias]
+        w = relay.concatenate((w1, w2), axis=0)
+        scale = relay.concatenate((scale1, scale2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y = relay.multiply(y, scale)
+        y = relay.nn.relu(y)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(channels1, 1, 1))
+        scale2 = relay.var("scale2", shape=(channels2, 1, 1))
+        bias = relay.var("bias", shape=(channels2, 1, 1))
+        y_before = before(x, w1, w2, scale1, scale2, bias)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+
+def test_combine_parallel_conv2d_scale():
+    """Testcase of un-combinable scale"""
+    def before(x, w1, w2, scale1, scale2):
+        args = [x, w1, w2, scale1, scale2]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2]
+        w = relay.concatenate((w1, w2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(1,))
+        scale2 = relay.var("scale2", shape=(1,))
+        y_before = before(x, w1, w2, scale1, scale2)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+
+if __name__ == "__main__":
+    test_combine_parallel_conv2d()
+    test_combine_parallel_conv2d_scale_relu()
+    test_combine_parallel_conv2d_scale()
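A closing note on the first test above: w3 is deliberately left uncombined because its 3x3 kernel differs from the 1x1 kernels of w1, w2 and w4. IsCompatibleConv2D compares kernel height and width (shape[2] and shape[3] in OIHW) along with the other conv2d attributes, so only weights that differ in output channels alone are concatenated. A shape-level sketch of that check, with numpy arrays standing in for the weight types (names illustrative):

    import numpy as np

    w1 = np.zeros((4, 4, 1, 1))    # OIHW: (out_c, in_c, kh, kw)
    w3 = np.zeros((4, 4, 3, 3))
    w4 = np.zeros((7, 4, 1, 1))

    def compatible(a, b):
        # combinable when only the O (output channel) dimension differs
        return a.shape[1:] == b.shape[1:]

    assert compatible(w1, w4)        # 1x1 vs 1x1: combined
    assert not compatible(w1, w3)    # 1x1 vs 3x3: w3 stays separate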