diff --git a/nnvm/README.md b/nnvm/README.md
index 66aa7530f709a..d9bd4aa0d2a48 100644
--- a/nnvm/README.md
+++ b/nnvm/README.md
@@ -9,10 +9,10 @@ NNVM is a reusable computational graph compilation stack for deep learning syste
- Optimize computation graphs to improve performance.
- Compile into executable modules and deploy to different hardware backends with minimum dependency.
-NNVM is designed to add new frontend, operators and graph optimizations in a decentralized fashion without changing the core interface. NNVM is part of [TVM stack](https://github.com/dmlc/tvm). NNVM compiler toolchain can target hardware backends supported by TVM.
+NNVM is designed to add new frontends, operators, and graph optimizations in a decentralized fashion without changing the core interface. It is part of the [TVM stack](https://github.com/dmlc/tvm). The compiler toolchain can target hardware backends supported by TVM.
The compiled module can be deployed to server, mobile, embedded devices and browsers with minimum dependency, in languages including c++, python, javascript, java, objective-c.
-The following code snippet demonstrates the general workflow of nnvm compiler toolchain.
+The following code snippet demonstrates the general workflow of NNVM.
```python
import tvm
diff --git a/nnvm/docs/conf.py b/nnvm/docs/conf.py
index 5175167185cd7..64e40eb748b5a 100644
--- a/nnvm/docs/conf.py
+++ b/nnvm/docs/conf.py
@@ -16,6 +16,7 @@
import shlex
import recommonmark
import sphinx_gallery
+from tvm.contrib import rpc, graph_runtime
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify
diff --git a/nnvm/python/nnvm/compiler/graph_attr.py b/nnvm/python/nnvm/compiler/graph_attr.py
index 3787eca68707a..43be85e644bba 100644
--- a/nnvm/python/nnvm/compiler/graph_attr.py
+++ b/nnvm/python/nnvm/compiler/graph_attr.py
@@ -54,7 +54,7 @@ def set_dtype_inputs(g, dtype):
"""
if isinstance(dtype, dict):
list_dtype = [
- DTYPE_TO_TCODE[dtype.get(name, "default")]
+ DTYPE_TO_TCODE[str(dtype.get(name, "default"))]
for name in g.index.input_names]
else:
list_dtype = [DTYPE_TO_TCODE[dtype]] * len(g.index.input_names)
diff --git a/nnvm/tutorials/get_started.py b/nnvm/tutorials/get_started.py
new file mode 100644
index 0000000000000..f505853375df0
--- /dev/null
+++ b/nnvm/tutorials/get_started.py
@@ -0,0 +1,174 @@
+"""
+Get Started with NNVM
+=====================
+**Author**: `Tianqi Chen <https://tqchen.github.io>`_
+
+This article is an introductory tutorial to the workflow in NNVM.
+"""
+import nnvm.compiler
+import nnvm.symbol as sym
+
+######################################################################
+# Declare Computation
+# -------------------
+# We start by describing our computation with a computational graph.
+# Most deep learning frameworks use computation graphs to describe
+# their computation. In this example, we directly use
+# NNVM's API to construct the computational graph.
+#
+# .. note::
+#
+# In a typical deep learning compilation workflow,
+#   we can get a model from :any:`nnvm.frontend`.
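+#
+#   For example, assuming the MXNet frontend is available, a converted
+#   graph and its parameters could be obtained along these lines
+#   (``mx_sym``, ``args`` and ``auxs`` are placeholders for objects
+#   loaded from an MXNet checkpoint; this is only a sketch)::
+#
+#       # convert an MXNet symbol and its weights into an NNVM graph
+#       sym, params = nnvm.frontend.from_mxnet(mx_sym, args, auxs)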
+#
+# The following code snippet describes :math:`z = x + \sqrt{y}`
+# and creates an NNVM graph from the description.
+# We can print out the graph IR to check the graph content.
+
+x = sym.Variable("x")
+y = sym.Variable("y")
+z = sym.elemwise_add(x, sym.sqrt(y))
+compute_graph = nnvm.graph.create(z)
+print("-------compute graph-------")
+print(compute_graph.ir())
+
+######################################################################
+# Compile
+# -------
+# We can call :any:`nnvm.compiler.build` to compile the graph.
+# The build function takes a shape parameter which specifies the
+# input shape requirement. Here we only need to pass in the shape of ``x``;
+# the shape of ``y`` will be inferred automatically by NNVM.
+#
+# The function returns three values. ``deploy_graph`` contains
+# the final compiled graph structure. ``lib`` is a :any:`tvm.module.Module`
+# that contains compiled CUDA functions. We do not need the ``params``
+# in this case.
+shape = (4,)
+deploy_graph, lib, params = nnvm.compiler.build(
+ compute_graph, target="cuda", shape={"x": shape}, dtype="float32")
+
+######################################################################
+# We can print out the IR of ``deploy_graph`` to understand what just
+# happened under the hood. We can see that ``deploy_graph`` only
+# contains a single operator, ``tvm_op``. This is because NNVM
+# automatically fused the operators together into one operator.
+#
+print("-------deploy graph-------")
+print(deploy_graph.ir())
+
+######################################################################
+# Let us also peek into the content of ``lib``.
+# Typically a compiled TVM CUDA module contains a host module (``lib``)
+# and a device module (``lib.imported_modules[0]``) that holds the CUDA code.
+# We print out the generated device code here.
+# This is exactly the fused CUDA version of the kernel that the graph points to.
+#
+print("-------deploy library-------")
+print(lib.imported_modules[0].get_source())
+
+######################################################################
+# Deploy and Run
+# --------------
+# Now that we have the compiled module, let us run it.
+# We can use :any:`graph_runtime <tvm.contrib.graph_runtime>`
+# in tvm to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`.
+# We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`,
+# :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and
+# :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` functions
+# to set the input, execute the graph and get the output we need.
+#
+import tvm
+import numpy as np
+from tvm.contrib import graph_runtime, util
+
+module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
+x_np = np.array([1, 2, 3, 4]).astype("float32")
+y_np = np.array([4, 4, 4, 4]).astype("float32")
+# set input to the graph module
+module.set_input(x=x_np, y=y_np)
+# run forward computation
+module.run()
+# get the first output
+out = module.get_output(0, out=tvm.nd.empty(shape))
+print(out.asnumpy())
+
+######################################################################
+# Provide Model Parameters
+# ------------------------
+# Most deep learning models contain two types of inputs: parameters
+# that remain fixed during inference and data inputs that change
+# for each inference task. It is helpful to provide this
+# information to NNVM. Let us assume that ``y`` is the parameter
+# in our example. We can provide the model parameter information
+# via the ``params`` argument to :any:`nnvm.compiler.build`.
+#
+deploy_graph, lib, params = nnvm.compiler.build(
+ compute_graph, target="cuda", shape={"x": shape}, params={"y": y_np})
+
+######################################################################
+# This time we will need the ``params`` value returned by :any:`nnvm.compiler.build`.
+# NNVM applies optimizations to pre-compute the intermediate values in
+# the graph that can be determined by the parameters. In this case
+# :math:`\sqrt{y}` can be pre-computed. The pre-computed values
+# are returned as the new ``params``. We can print out the newly compiled library
+# to confirm that the fused kernel now only contains the add.
+#
+print("-----optimized params-----")
+print(params)
+print("-------deploy library-------")
+print(lib.imported_modules[0].get_source())
+
+######################################################################
+# Save the Deployed Module
+# ------------------------
+# We can save the ``deploy_graph``, ``lib`` and ``params`` separately
+# and load them back later. We can use :any:`tvm.module.Module` to export
+# the compiled library. ``deploy_graph`` is saved in JSON format and ``params``
+# is serialized into a bytearray.
+#
+temp = util.tempdir()
+path_lib = temp.relpath("deploy.so")
+lib.export_library(path_lib)
+with open(temp.relpath("deploy.json"), "w") as fo:
+ fo.write(deploy_graph.json())
+with open(temp.relpath("deploy.params"), "wb") as fo:
+ fo.write(nnvm.compiler.save_param_dict(params))
+print(temp.listdir())
+
+######################################################################
+# We can load the module back.
+loaded_lib = tvm.module.load(path_lib)
+loaded_json = open(temp.relpath("deploy.json")).read()
+loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
+module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
+params = nnvm.compiler.load_param_dict(loaded_params)
+# directly load from byte array
+module.load_params(loaded_params)
+module.run(x=x_np)
+# get the first output
+out = module.get_output(0, out=tvm.nd.empty(shape))
+print(out.asnumpy())
+
+######################################################################
+# Deploy using Another Language
+# -----------------------------
+# We use Python in this example for demonstration.
+# We can also deploy the compiled modules with other languages
+# supported by TVM such as C++, Java and JavaScript.
+# The graph module itself is fully embedded in the TVM runtime.
+#
+# The following block demonstrates how we can directly use TVM's
+# runtime API to execute the compiled module.
+# You can find a similar runtime API in the TVM runtimes for other languages.
+#
+fcreate = tvm.get_global_func("tvm.graph_runtime.create")
+ctx = tvm.gpu(0)
+gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
+set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
+set_input("x", tvm.nd.array(x_np))
+gmodule["load_params"](loaded_params)
+run()
+out = tvm.nd.empty(shape)
+get_output(0, out)
+print(out.asnumpy())
diff --git a/nnvm/tutorials/mobilenet_inference_gpu.py b/nnvm/tutorials/mobilenet_inference_gpu.py
index 28e97244f88ab..e477e758cf912 100644
--- a/nnvm/tutorials/mobilenet_inference_gpu.py
+++ b/nnvm/tutorials/mobilenet_inference_gpu.py
@@ -17,19 +17,24 @@
# ---------------------------------
# NNVM optimizes the graph and relies on TVM to generate fast GPU code.
# To get the maximum performance, we need to enable nvcc's compiler hook.
-# This gives better performance than nvrtc mode.
+# This usually gives better performance than nvrtc mode.
@tvm.register_func
def tvm_callback_cuda_compile(code):
- ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
+ ptx = nvcc.compile_cuda(code, target="ptx")
return ptx
######################################################################
# Prepare the Benchmark
# ---------------------
# We construct a standard imagenet inference benchmark.
-# We use nnvm's testing utility to produce the model description and random parameters so that the example does not
-# depend on a specific front-end framework.
+# NNVM needs two things to compile a deep learning model:
+#
+# - net: the graph representation of the computation
+# - params: a dictionary mapping parameter names (str) to their values
+#
+# We use nnvm's testing utility to produce the model description and random parameters
+# so that the example does not depend on a specific front-end framework.
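+#
+# As a rough sketch (assuming the MobileNet helper in ``nnvm.testing``;
+# the exact invocation in this script may pass additional arguments),
+# obtaining that pair looks like::
+#
+#     # build a randomly-initialized MobileNet workload: graph plus params
+#     net, params = nnvm.testing.mobilenet.get_workload(batch_size=1)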
#
# .. note::
#
@@ -48,11 +53,6 @@ def tvm_callback_cuda_compile(code):
######################################################################
# Compile the Graph
# -----------------
-# NNVM needs two things to compile a deep learning model:
-#
-# - net: the graph representation of the computation
-# - params: a dictionary of str to parameters
-#
# To compile the graph, we call the build function with the graph
# configuration and parameters.
# When parameters are provided, NNVM will pre-compute certain part of the graph if possible (e.g. simplify batch normalization to scale shift),