From 53e4c603e64a0df4468e0745d29e81d19c586132 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 9 Jun 2021 09:34:43 -0700 Subject: [PATCH] [DOC] Improve "Getting Started with TVM" tutorials and fix warnings (#8221) * improve src/README.md * fix intro * fix more warnings * improve docs * update * update * update * update overview image --- docs/api/python/graph_executor.rst | 2 +- docs/conf.py | 8 +- docs/deploy/bnns.rst | 3 +- docs/dev/device_target_interactions.rst | 2 +- docs/dev/index.rst | 12 ++- docs/index.rst | 4 +- python/tvm/relay/op/nn/nn.py | 1 + python/tvm/relay/op/transform.py | 7 +- python/tvm/topi/cuda/sparse_reshape.py | 1 + python/tvm/topi/cuda/unique.py | 1 + python/tvm/topi/sparse_reshape.py | 1 + python/tvm/topi/unique.py | 1 + src/README.md | 19 ++-- tutorials/auto_scheduler/tune_network_arm.py | 2 +- tutorials/autotvm/tune_conv2d_cuda.py | 2 +- ...ul_x86.py => auto_scheduler_matmul_x86.py} | 2 +- ...utotvm_matmul.py => autotvm_matmul_x86.py} | 17 ++-- ...ng_with_python.py => autotvm_relay_x86.py} | 5 +- tutorials/get_started/install.py | 4 +- tutorials/get_started/introduction.py | 94 ++++++++++--------- .../get_started/tensor_expr_get_started.py | 33 +++---- .../get_started/tvmc_command_line_driver.py | 2 +- .../frontend/deploy_classification.py | 2 +- 23 files changed, 120 insertions(+), 105 deletions(-) rename tutorials/get_started/{tune_matmul_x86.py => auto_scheduler_matmul_x86.py} (99%) rename tutorials/get_started/{autotvm_matmul.py => autotvm_matmul_x86.py} (96%) rename tutorials/get_started/{auto_tuning_with_python.py => autotvm_relay_x86.py} (99%) diff --git a/docs/api/python/graph_executor.rst b/docs/api/python/graph_executor.rst index 3f8811553ba4..1af93e88458d 100644 --- a/docs/api/python/graph_executor.rst +++ b/docs/api/python/graph_executor.rst @@ -16,6 +16,6 @@ under the License. tvm.contrib.graph_executor -------------------------- +-------------------------- .. 
automodule:: tvm.contrib.graph_executor :members: diff --git a/docs/conf.py b/docs/conf.py index 45f5da670608..1f645645f25d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -226,10 +226,10 @@ def git_describe_version(original_version): "introduction.py", "install.py", "tvmc_command_line_driver.py", - "auto_tuning_with_python.py", + "autotvm_relay_x86.py", "tensor_expr_get_started.py", - "autotvm_matmul.py", - "autoschedule_matmul.py", + "autotvm_matmul_x86.py", + "auto_scheduler_matmul_x86.py", "cross_compilation_and_rpc.py", "relay_quick_start.py", ], @@ -246,7 +246,7 @@ def git_describe_version(original_version): ], "language": [ "schedule_primitives.py", - "reduciton.py", + "reduction.py", "intrin_math.py", "scan.py", "extern_op.py", diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst index 7b62fb15a617..43c7e7bb264f 100644 --- a/docs/deploy/bnns.rst +++ b/docs/deploy/bnns.rst @@ -175,7 +175,8 @@ Operator support | nn.bias_add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense | | | fusion | +------------------------+------------------------------------------------------------------------------+ -| add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense fusion | +| add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense | +| | fusion | +------------------------+------------------------------------------------------------------------------+ | nn.relu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion | +------------------------+------------------------------------------------------------------------------+ diff --git a/docs/dev/device_target_interactions.rst b/docs/dev/device_target_interactions.rst index 373b8fee753e..e5fa708434fb 100644 --- a/docs/dev/device_target_interactions.rst +++ b/docs/dev/device_target_interactions.rst @@ -18,7 +18,7 @@ .. _tvm-target-specific-overview: Device/Target Interactions --------------------------- +========================== This document is intended for developers interested in understanding how the TVM framework interacts with specific device APIs, or who diff --git a/docs/dev/index.rst b/docs/dev/index.rst index d03f0fb03496..873af9c6a3b7 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -29,10 +29,6 @@ This page is organized as follows: The sections after are specific guides focused on each logical component, organized by the component's name. -- The `Device/Target Interactions`_ section describes how TVM - interacts with each supported physical device and code-generation - target. - - Feel free to also check out the :ref:`dev-how-to` for useful development tips. This guide provides a few complementary views of the architecture. @@ -245,12 +241,13 @@ for learning-based optimizations. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 runtime debugger virtual_machine introduction_to_module_serialization + device_target_interactions tvm/node -------- @@ -318,6 +315,11 @@ It also provides a common `Target` class that describes the target. The compilation pipeline can be customized according to the target by querying the attribute information in the target and builtin information registered to each target id (cuda, opencl). +.. toctree:: :maxdepth: 1 + device_target_interactions tvm/tir ------- diff --git a/docs/index.rst b/docs/index.rst index 2a1078e645ab..e3cf466d3cf1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,8 +44,8 @@ For Developers contribute/index deploy/index dev/how_to - microtvm/index errors + faq .. 
toctree:: :maxdepth: 1 @@ -76,8 +76,8 @@ For Developers :hidden: :caption: MISC + microtvm/index vta/index - faq Index diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 91c148b5df2e..caf1f187fad3 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -2236,6 +2236,7 @@ def sparse_add(dense_mat, sparse_mat): Examples ------- .. code-block:: python + dense_data = [[ 3., 4., 4. ] [ 4., 2., 5. ]] sparse_data = [4., 8.] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 80913e5f0cbd..049ddc9622ba 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -1404,6 +1404,7 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v Examples ------- .. code-block:: python + sparse_indices = [[0, 1], [0, 3], [2, 0], @@ -1425,7 +1426,6 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v [4, 0]] empty_row_indicator = [False, True, False, False, True] new_sparse_values = [1, 2, 10, 3, 4, 10] - """ new_sparse_indices, new_sparse_values, empty_row_indicator = TupleWrapper( _make.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value), 3 @@ -1457,6 +1457,7 @@ def sparse_reshape(sparse_indices, prev_shape, new_shape): Examples -------- .. code-block:: python + sparse_indices = [[0, 0, 0], [0, 0, 1], [0, 1, 0], @@ -1508,6 +1509,7 @@ def segment_sum(data, segment_ids, num_segments=None): Examples -------- .. code-block:: python + data = [[1, 2, 3, 4], [4, -3, 2, -1], [5, 6, 7, 8]] @@ -1578,6 +1580,7 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): Examples -------- .. code-block:: python + a = [[1,2,3], [4,5,6]] cumsum(a) # if axis is not provided, cumsum is done over the flattened input. @@ -1633,6 +1636,7 @@ def cumprod(data, axis=None, dtype=None, exclusive=None): Examples -------- .. code-block:: python + a = [[1,2,3], [4,5,6]] cumprod(a) # if axis is not provided, cumprod is done over the flattened input. @@ -1693,6 +1697,7 @@ def unique(data, is_sorted=True, return_counts=False): Examples -------- .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) output = [4, 5, 1, 2, 3, ?, ?, ?] indices = [0, 1, 2, 3, 4, 4, 0, 1] diff --git a/python/tvm/topi/cuda/sparse_reshape.py b/python/tvm/topi/cuda/sparse_reshape.py index 4476648e0aa4..7a796fa42696 100644 --- a/python/tvm/topi/cuda/sparse_reshape.py +++ b/python/tvm/topi/cuda/sparse_reshape.py @@ -48,6 +48,7 @@ def sparse_reshape( Examples -------- .. code-block:: python + sparse_indices = [[0, 0, 0], [0, 0, 1], [0, 1, 0], diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py index 911ee71a0057..8f78cc5fb924 100644 --- a/python/tvm/topi/cuda/unique.py +++ b/python/tvm/topi/cuda/unique.py @@ -317,6 +317,7 @@ def unique(data, is_sorted=True, return_counts=False): Examples -------- .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) output = [4, 5, 1, 2, 3, ?, ?, ?] indices = [0, 1, 2, 3, 4, ?, ?, ?] diff --git a/python/tvm/topi/sparse_reshape.py b/python/tvm/topi/sparse_reshape.py index 5535477e17c8..f2c0a2928b93 100644 --- a/python/tvm/topi/sparse_reshape.py +++ b/python/tvm/topi/sparse_reshape.py @@ -45,6 +45,7 @@ def sparse_reshape( Examples -------- .. 
code-block:: python + sparse_indices = [[0, 0, 0], [0, 0, 1], [0, 1, 0], diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py index 49869c2ecda4..5aeadc541e29 100644 --- a/python/tvm/topi/unique.py +++ b/python/tvm/topi/unique.py @@ -243,6 +243,7 @@ def unique(data, is_sorted=True, return_counts=False): Examples -------- .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) output = [4, 5, 1, 2, 3, ?, ?, ?] indices = [0, 1, 2, 3, 4, ?, ?, ?] diff --git a/src/README.md b/src/README.md index 2653efa56c83..bb9aeb2a8578 100644 --- a/src/README.md +++ b/src/README.md @@ -21,14 +21,17 @@ Header files in include are public APIs that share across modules. There can be internal header files within each module that sit in src. ## Modules -- support: Internal support utilities. -- runtime: Minimum runtime related codes. -- node: base infra for IR/AST nodes that is dialect independent. -- ir: Common IR infrastructure. -- tir: Tensor-level IR. -- te: tensor expression DSL - arith: Arithmetic expression and set simplification. -- relay: Relay IR, high-level optimization. -- autotvm: The auto-tuning module. +- auto\_scheduler: The template-free auto-tuning module. +- autotvm: The template-based auto-tuning module. - contrib: Contrib extension libraries. - driver: Compilation driver APIs. +- ir: Common IR infrastructure. +- node: The base infra for IR/AST nodes that is dialect-independent. +- relay: Relay IR, high-level optimizations. +- runtime: Minimal runtime-related code. +- support: Internal support utilities. +- target: Hardware targets. +- tir: Tensor IR, low-level optimizations. +- te: Tensor Expression DSL. +- topi: Tensor Operator Inventory. diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py index a7a844b7a86e..5b0931405212 100644 --- a/tutorials/auto_scheduler/tune_network_arm.py +++ b/tutorials/auto_scheduler/tune_network_arm.py @@ -437,7 +437,7 @@ def tune_and_evaluate(): # in function :code:`run_tuning`. Say, # :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` # 4. If you have multiple target CPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` +# parallelize the measurements. Check this :ref:`section ` # to learn how to use the RPC Tracker and RPC Server. # To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` # with :any:`auto_scheduler.RPCRunner`. diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 4e80a74413aa..ef921563e466 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -77,7 +77,7 @@ # to tune other operators such as depthwise convolution and gemm. # In order to fully understand this template, you should be familiar with # the schedule primitives and auto tuning API. 
You can refer to the above -# tutorials and :doc:`autotvm tutorial ` +# tutorials and :ref:`autotvm tutorial ` # # It is worth noting that the search space for a conv2d operator # can be very large (at the level of 10^9 for some input shapes) diff --git a/tutorials/get_started/tune_matmul_x86.py b/tutorials/get_started/auto_scheduler_matmul_x86.py similarity index 99% rename from tutorials/get_started/tune_matmul_x86.py rename to tutorials/get_started/auto_scheduler_matmul_x86.py index 8156d0e106ff..f9fb3615aedc 100644 --- a/tutorials/get_started/tune_matmul_x86.py +++ b/tutorials/get_started/auto_scheduler_matmul_x86.py @@ -23,7 +23,7 @@ In this tutorial, we will show how TVM's Auto Scheduling feature can find optimal schedules without the need for writing a custom template. -Different from the template-based :ref:`` which relies on +Different from the template-based :doc:`AutoTVM ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. Users only need to write the computation declaration without any schedule commands or templates. The auto-scheduler can diff --git a/tutorials/get_started/autotvm_matmul.py b/tutorials/get_started/autotvm_matmul_x86.py similarity index 96% rename from tutorials/get_started/autotvm_matmul.py rename to tutorials/get_started/autotvm_matmul_x86.py index 234315b53ff9..97e1b0b8b55f 100644 --- a/tutorials/get_started/autotvm_matmul.py +++ b/tutorials/get_started/autotvm_matmul_x86.py @@ -15,17 +15,18 @@ # specific language governing permissions and limitations # under the License. """ -Optimizing Operators with Templates and AutoTVM -=============================================== +.. _tutorial-autotvm-matmul-x86: + +Optimizing Operators with Schedule Templates and AutoTVM +======================================================== **Authors**: `Lianmin Zheng `_, `Chris Hoge `_ -In this tutorial, we will now show how the TVM Template Extension (TE) language -can be used to write scheduling templates that can be searched by AutoTVM to -find optimal configurations of scheduling variables. This process is called -Auto-Tuning, and builds on TE to help automate the process of optimizing -operations. +In this tutorial, we show how the TVM Tensor Expression (TE) language +can be used to write schedule templates that can be searched by AutoTVM to +find the optimal schedule. This process is called Auto-Tuning, which helps +automate the process of optimizing tensor computations. This tutorial builds on the previous `tutorial on how to write a matrix multiplication using TE `. @@ -371,6 +372,6 @@ def matmul(N, L, M, dtype): # To gain a deeper understanding of how this works, we recommend expanding on # this example by adding new search parameters to the schedule based on # schedule operations demonstrated in the `Getting Started With Tensor -# Expressions _` tutorial In the upcoming sections, we +# Expressions _` tutorial. In the upcoming sections, we # will demonstrate the AutoScheduler, a method for TVM to optimize common # operators without the need for a user-defined template. 
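For readers following the renamed AutoTVM tutorial above, the kind of schedule template it builds looks roughly like the following. This is a minimal sketch based on the matmul example in that tutorial, assuming the TVM Python API current at the time of this patch; the task name ``tutorial/matmul_v1`` and the knob names are illustrative:

.. code-block:: python

    import tvm
    from tvm import te, autotvm

    @autotvm.template("tutorial/matmul_v1")  # illustrative task name
    def matmul(N, L, M, dtype):
        A = te.placeholder((N, L), name="A", dtype=dtype)
        B = te.placeholder((L, M), name="B", dtype=dtype)
        k = te.reduce_axis((0, L), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

        s = te.create_schedule(C.op)
        y, x = s[C].op.axis
        (k,) = s[C].op.reduce_axis

        # Declare tunable knobs: how to tile the two spatial axes.
        cfg = autotvm.get_config()
        cfg.define_split("tile_y", y, num_outputs=2)
        cfg.define_split("tile_x", x, num_outputs=2)

        # Apply the configuration chosen by the tuner to the schedule.
        yo, yi = cfg["tile_y"].apply(s, C, y)
        xo, xi = cfg["tile_x"].apply(s, C, x)
        s[C].reorder(yo, xo, k, yi, xi)

        return s, [A, B, C]

AutoTVM then searches over the split factors declared by ``define_split``, compiling and measuring each candidate schedule on the target hardware.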
diff --git a/tutorials/get_started/auto_tuning_with_python.py b/tutorials/get_started/autotvm_relay_x86.py similarity index 99% rename from tutorials/get_started/auto_tuning_with_python.py rename to tutorials/get_started/autotvm_relay_x86.py index 848511cb9994..67faec4505a6 100644 --- a/tutorials/get_started/auto_tuning_with_python.py +++ b/tutorials/get_started/autotvm_relay_x86.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. """ -Compiling and Optimizing a Model with the Python AutoScheduler -============================================================== +Compiling and Optimizing a Model with the Python Interface (AutoTVM) +==================================================================== **Author**: `Chris Hoge `_ @@ -302,6 +302,7 @@ repeat=repeat, timeout=timeout, min_repeat_ms=min_repeat_ms, + enable_cpu_cache_flush=True, ) # Create a simple structure for holding tuning options. We use an XGBoost diff --git a/tutorials/get_started/install.py b/tutorials/get_started/install.py index 6d1db4ddb127..efc951a52709 100644 --- a/tutorials/get_started/install.py +++ b/tutorials/get_started/install.py @@ -23,8 +23,8 @@ Depending on your needs and your working environment, there are a few different methods for installing TVM. These include: - * Installing from source - * Installing from third-party binary package. +* Installing from source +* Installing from a third-party binary package. """ ################################################################################ diff --git a/tutorials/get_started/introduction.py b/tutorials/get_started/introduction.py index 0ee79d334c03..0746c3983b61 100644 --- a/tutorials/get_started/introduction.py +++ b/tutorials/get_started/introduction.py @@ -19,7 +19,8 @@ ============ **Authors**: `Jocelyn Shiue `_, -`Chris Hoge `_ +`Chris Hoge `_, +`Lianmin Zheng `_ Apache TVM is an open source machine learning compiler framework for CPUs, GPUs, and machine learning accelerators. It aims to enable machine learning @@ -35,11 +36,11 @@ #. :doc:`Introduction ` #. :doc:`Installing TVM ` -#. :doc:`Compiling and Optimizing a Model with TVMC ` -#. :doc:`Compiling and Optimizing a Model with the Python AutoScheduler ` -#. :doc:`Working with Operators Using Tensor Expressions ` -#. :doc:`Optimizing Operators with Templates and AutoTVM ` -#. :doc:`Optimizing Operators with AutoScheduling ` +#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` +#. :doc:`Compiling and Optimizing a Model with the Python Interface ` +#. :doc:`Working with Operators Using Tensor Expression ` +#. :doc:`Optimizing Operators with Templates and AutoTVM ` +#. :doc:`Optimizing Operators with Template-free AutoScheduler ` #. :doc:`Cross Compilation and Remote Procedure Calls (RPC) ` #. :doc:`Compiling Deep Learning Models for GPUs ` """ @@ -51,18 +52,18 @@ # The diagram below illustrates the steps a machine learning model takes as it is # transformed with the TVM optimizing compiler framework. # -# .. image:: https://raw.githubusercontent.com/hogepodge/web-data/c339ebbbae41f3762873147c1e920a53a08963dd/images/getting_started/overview.png +# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png # :width: 100% # :alt: A High Level View of TVM # # 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*. # The importer layer is where TVM can ingest models from other frameworks, like -# ONNX, Tensorflow, or PyTorch. 
The level of support that TVM offers for each +# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each # frontend varies as we are constantly improving the open source project. If # you're having issues importing your model into TVM, you may want to try # converting it to ONNX. # -# 2. Translate to *Relay*, TVM's high level model language. +# 2. Translate to *Relay*, TVM's high-level model language. # A model that has been imported into TVM is represented in Relay. Relay is a # functional language and intermediate representation (IR) for neural networks. # It has support for: @@ -72,46 +73,47 @@ # differentiable language # - Ability to allow the user to mix the two programming styles # -# Relay applies several high-level optimization to the model, after which -# is runs the Relay Fusion Pass. To aid in the process of converting to -# Relay, TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined -# templates of common computations. +# Relay applies graph-level optimization passes to optimize the model. # # 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a # higher-level representation is transformed into a lower-level -# representation. In Relay Fusion Pass, the model is lowered from the -# higher-level Relay representation into a smaller set of subgraphs, where -# each node is a task. A task is a collection of computation templates, -# expressed in TE, where there parameters of the template can control how -# the computation is carried out on hardware. The specific ordering of compuation, -# defined by parameters to the TE template, is called a schedule. -# -# 4. Search for optimized schedule using *AutoTVM* or *AutoScheduler* for each -# task through tuning. Tuning is the process of searching the TE parameter -# space for a schedule that is optimized for target hardware. There are -# couple of optimization options available, each requiring varying levels of -# user interaction. The optimization options include: -# -# - **AutoTVM**: The user specifies a search template for the schedule of a TE task, -# or TE subraph. AutoTVM directs the search of the parameter space defined by the -# template to produce an optimized configuration. AutoTVM requires users to -# define manually templates for each operator as part of the TOPI. -# - **Ansor/AutoSchedule**: Using a TVM Operator Inventory (TOPI) of operations, -# Ansor can automatically search an optimization space with much less -# intervention and guidance from the end user. Ansor depends on TE templates to -# guide the search. -# -# 5. Choose the optimal configuration for the model. After tuning, an optimal schedule -# for each task is chosen. Regardless if it is AutoTVM or AutoSchedule, -# schedule records in JSON format are produced that are referred to by this step -# to build an optimized model. -# -# 6. Lower to a hardware specific compiler. After selecting an optimized configuration -# based on the tuning step, the model is then lowered to a representation -# expected by the target compiler for the hardware platform. This is the -# final code generation phase with the intention of producing an optimized -# model that can be deployed into production. TVM supports a number of -# different compiler backends including: +# representation. After applying the high-level optimizations, Relay +# runs the FuseOps pass to partition the model into many small subgraphs and lowers +# the subgraphs to TE representation. 
Tensor Expression (TE) is a +# domain-specific language for describing tensor computations. +# TE also provides several *schedule* primitives to specify low-level loop +# optimizations, such as tiling, vectorization, parallelization, +# unrolling, and fusion. +# To aid in the process of converting Relay representation into TE representation, +# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined +# templates of common tensor operators (e.g., conv2d, transpose). +# +# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. +# A schedule specifies the low-level loop optimizations for an operator or +# subgraph defined in TE. Auto-tuning modules search for the best schedule +# by comparing candidates with cost models and on-device measurements. +# There are two auto-tuning modules in TVM. +# +# - **AutoTVM**: A template-based auto-tuning module. It runs search algorithms +# to find the best values for the tunable knobs in a user-defined template. +# For common operators, their templates are already provided in TOPI. +# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. +# It does not require pre-defined schedule templates. Instead, it generates +# the search space automatically by analyzing the computation definition. +# It then searches for the best schedule in the generated search space. +# +# 5. Choose the optimal configurations for model compilation. After tuning, the +# auto-tuning module generates tuning records in JSON format. This step +# picks the best schedule for each subgraph. +# +# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level +# intermediate representation. After selecting the optimal configurations +# based on the tuning step, each TE subgraph is lowered to TIR and then +# optimized by low-level optimization passes. Next, the optimized TIR is +# lowered to the target compiler of the hardware platform. +# This is the final code generation phase to produce an optimized model +# that can be deployed into production. TVM supports several different +# compiler backends including: # # - LLVM, which can target arbitrary microprocessor architecture including # standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index ee13d9e475f6..8fbdb751c9f8 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -17,22 +17,19 @@ """ .. _tutorial-tensor-expr-get-started: -Working with Operators Using Tensor Expressions -=============================================== +Working with Operators Using Tensor Expression +============================================== **Author**: `Tianqi Chen `_ In this tutorial we will turn our attention to how TVM works with Tensor -Expressions (TE) to create a space to search for performant configurations. TE +Expression (TE) to define tensor computations and apply loop optimizations. TE describes tensor computations in a pure functional language (that is each expression has no side effects). When viewed in context of the TVM as a whole, Relay describes a computation as a set of operators, and each of these operators can be represented as a TE expression where each TE expression takes input tensors and produces an output tensor. It's important to note that the -tensor isn't necessarily a fully materialized array, rather it is a -representation of a computation. 
If you want to produce a computation from a -TE, you will need to use the scheduling features of TVM. +input tensors and produces an output tensor. -This is an introductory tutorial to the Tensor expression language in TVM. TVM +This is an introductory tutorial to the Tensor Expression language in TVM. TVM uses a domain specific tensor expression for efficient kernel construction. We will demonstrate the basic workflow with two examples of using the tensor expression language. The first example introduces TE and scheduling with vector @@ -47,8 +44,8 @@ # --------------------------------------------------------------- # # Let's look at an example in Python in which we will implement a TE for -# vector addition, followed by a schedule targeted towards a CPU. We begin by initializing a TVM -# environment. +# vector addition, followed by a schedule targeted towards a CPU. +# We begin by initializing a TVM environment. import tvm import tvm.testing @@ -59,7 +56,8 @@ # and specify it. If you're using llvm, you can get this information from the # command ``llc --version`` to get the CPU type, and you can check # ``/proc/cpuinfo`` for additional extensions that your processor might -# support. For example, ``tgt = "llvm -mcpu=`skylake` +# support. For example, you can use "llvm -mcpu=skylake-avx512" for CPUs with +# AVX-512 instructions. tgt = tvm.target.Target(target="llvm", host="llvm") @@ -69,7 +67,7 @@ # We describe a vector addition computation. TVM adopts tensor semantics, with # each intermediate result represented as a multi-dimensional array. The user # needs to describe the computation rule that generates the tensors. We first -# define a symbolic variable n to represent the shape. We then define two +# define a symbolic variable ``n`` to represent the shape. We then define two # placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then # describe the result tensor ``C``, with a ``compute`` operation. The # ``compute`` defines a computation, with the output conforming to the @@ -79,7 +77,6 @@ # tensors. Remember, no actual computation happens during this phase, as we # are only declaring how the computation should be done. - n = te.var("n") A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") @@ -88,10 +85,10 @@ ################################################################################ # .. note:: Lambda Functions # -# The second argument to the ``te.compute`` method is the function that -# performs the computation. In this example, we're using an anonymous function, -# also known as a ``lambda`` function, to define the computation, in this case -# addition on the ``i``th element of ``A`` and ``B``. +# The second argument to the ``te.compute`` method is the function that +# performs the computation. In this example, we're using an anonymous function, +# also known as a ``lambda`` function, to define the computation, in this case +# addition on the ``i``th element of ``A`` and ``B``. ################################################################################ # Create a Default Schedule for the Computation @@ -322,8 +319,6 @@ def evaluate_addition(func, target, optimization, log): bx, tx = s[C].split(C.op.axis[0], factor=64) - xXXXXXXXx - ################################################################################ # Finally we must bind the iteration axis bx and tx to threads in the GPU # compute grid. 
The naive schedule is not valid for GPUs, and these are diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py index fffbfbf0356f..d9174da2ec58 100644 --- a/tutorials/get_started/tvmc_command_line_driver.py +++ b/tutorials/get_started/tvmc_command_line_driver.py @@ -494,5 +494,5 @@ # --help``. # # In the next tutorial, `Compiling and Optimizing a Model with the Python -# AutoScheduler `_, we will cover the same compilation +# Interface `_, we will cover the same compilation # and optimization steps using the Python interface. diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 493db87d46d5..b2f909b9710a 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -141,7 +141,7 @@ ###################################################################### # Build the inference graph executor -# --------------------------------- +# ---------------------------------- # Grab vision model from Gluon model zoo and compile with Relay. # The compilation steps are: #
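As a closing illustration of the GPU scheduling step referenced in the tensor expression tutorial above (the ``split`` into ``bx``/``tx`` followed by thread binding), here is a minimal sketch assuming a CUDA-enabled TVM build; the ``myadd`` kernel name is illustrative:

.. code-block:: python

    import tvm
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

    s = te.create_schedule(C.op)
    # Split the single iteration axis into chunks of 64 elements.
    bx, tx = s[C].split(C.op.axis[0], factor=64)
    # Bind the outer loop to GPU thread blocks and the inner loop to the
    # threads within a block; without these bindings the schedule cannot
    # be compiled for a GPU target.
    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))

    fadd = tvm.build(s, [A, B, C], target="cuda", name="myadd")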