From aa7b80693854254d5a53cf682d0c5ac468b01b8b Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 25 Sep 2017 19:58:02 -0700
Subject: [PATCH] [DOCS] Add intro tutorial (#45)

---
 nnvm/README.md                            |   4 +-
 nnvm/docs/conf.py                         |   1 +
 nnvm/python/nnvm/compiler/graph_attr.py   |   2 +-
 nnvm/tutorials/get_started.py             | 174 ++++++++++++++++++++++
 nnvm/tutorials/mobilenet_inference_gpu.py |  18 +--
 5 files changed, 187 insertions(+), 12 deletions(-)
 create mode 100644 nnvm/tutorials/get_started.py

diff --git a/nnvm/README.md b/nnvm/README.md
index 66aa7530f709a..d9bd4aa0d2a48 100644
--- a/nnvm/README.md
+++ b/nnvm/README.md
@@ -9,10 +9,10 @@ NNVM is a reusable computational graph compilation stack for deep learning syste
- Optimize computation graphs to improve performance.
- Compile into executable modules and deploy to different hardware backends with minimum dependency.

-NNVM is designed to add new frontend, operators and graph optimizations in a decentralized fashion without changing the core interface. NNVM is part of [TVM stack](https://github.com/dmlc/tvm). NNVM compiler toolchain can target hardware backends supported by TVM.
+NNVM is designed to add new frontend, operators and graph optimizations in a decentralized fashion without changing the core interface. It is part of [TVM stack](https://github.com/dmlc/tvm). The compiler toolchain can target hardware backends supported by TVM.
The compiled module can be deployed to server, mobile, embedded devices and browsers with minimum dependency, in languages including c++, python, javascript, java, objective-c.

-The following code snippet demonstrates the general workflow of nnvm compiler toolchain.
+The following code snippet demonstrates the general workflow of nnvm.

```python
import tvm
diff --git a/nnvm/docs/conf.py b/nnvm/docs/conf.py
index 5175167185cd7..64e40eb748b5a 100644
--- a/nnvm/docs/conf.py
+++ b/nnvm/docs/conf.py
@@ -16,6 +16,7 @@
import shlex
import recommonmark
import sphinx_gallery
+from tvm.contrib import rpc, graph_runtime
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify

diff --git a/nnvm/python/nnvm/compiler/graph_attr.py b/nnvm/python/nnvm/compiler/graph_attr.py
index 3787eca68707a..43be85e644bba 100644
--- a/nnvm/python/nnvm/compiler/graph_attr.py
+++ b/nnvm/python/nnvm/compiler/graph_attr.py
@@ -54,7 +54,7 @@ def set_dtype_inputs(g, dtype):
    """
    if isinstance(dtype, dict):
        list_dtype = [
-            DTYPE_TO_TCODE[dtype.get(name, "default")]
+            DTYPE_TO_TCODE[str(dtype.get(name, "default"))]
            for name in g.index.input_names]
    else:
        list_dtype = [DTYPE_TO_TCODE[dtype]] * len(g.index.input_names)
diff --git a/nnvm/tutorials/get_started.py b/nnvm/tutorials/get_started.py
new file mode 100644
index 0000000000000..f505853375df0
--- /dev/null
+++ b/nnvm/tutorials/get_started.py
@@ -0,0 +1,174 @@
"""
Get Started with NNVM
=====================
**Author**: `Tianqi Chen `_

This article is an introductory tutorial to the workflow in NNVM.
"""
import nnvm.compiler
import nnvm.symbol as sym

######################################################################
# Declare Computation
# -------------------
# We start by describing our computation with a computational graph.
# Most deep learning frameworks use computation graphs to describe
# their computation. In this example, we directly use
# NNVM's API to construct the computational graph.
#
# .. note::
#
#   In a typical deep learning compilation workflow,
#   we can get the models from :any:`nnvm.frontend`
#
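#   For instance, assuming an MXNet model is at hand and that the
#   ``nnvm.frontend.from_mxnet`` converter is available in your build,
#   that path looks roughly like this (the checkpoint prefix and
#   epoch below are hypothetical)::
#
#       import mxnet as mx
#       import nnvm.frontend
#       # load a trained MXNet checkpoint (hypothetical files)
#       mx_sym, args, auxs = mx.model.load_checkpoint("model", 0)
#       # convert the MXNet symbol and weights into an NNVM graph and params
#       net, net_params = nnvm.frontend.from_mxnet(mx_sym, args, auxs)
#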
# The following code snippet describes :math:`z = x + \sqrt{y}`
# and creates an NNVM graph from the description.
# We can print out the graph IR to check the graph content.

x = sym.Variable("x")
y = sym.Variable("y")
z = sym.elemwise_add(x, sym.sqrt(y))
compute_graph = nnvm.graph.create(z)
print("-------compute graph-------")
print(compute_graph.ir())

######################################################################
# Compile
# -------
# We can call :any:`nnvm.compiler.build` to compile the graph.
# The build function takes a shape parameter which specifies the
# input shape requirement. Here we only need to pass in the shape of ``x``
# and the other one will be inferred automatically by NNVM.
#
# The function returns three values. ``deploy_graph`` contains
# the final compiled graph structure. ``lib`` is a :any:`tvm.module.Module`
# that contains compiled CUDA functions. We do not need the ``params``
# in this case.
shape = (4,)
deploy_graph, lib, params = nnvm.compiler.build(
    compute_graph, target="cuda", shape={"x": shape}, dtype="float32")

######################################################################
# We can print out the IR of ``deploy_graph`` to understand what just
# happened under the hood. We can find that ``deploy_graph`` only
# contains a single operator ``tvm_op``. This is because NNVM
# automatically fused the operators together into a single operator.
#
print("-------deploy graph-------")
print(deploy_graph.ir())

######################################################################
# Let us also peek into the content of ``lib``.
# Typically a compiled TVM CUDA module contains a host module (``lib``)
# and a device module (``lib.imported_modules[0]``) that contains the CUDA code.
# We print out the generated device code here.
# This is exactly the fused CUDA kernel that the graph points to.
#
print("-------deploy library-------")
print(lib.imported_modules[0].get_source())

######################################################################
# Deploy and Run
# --------------
# Now that we have a compiled module, let us run it.
# We can use :any:`graph_runtime `
# in tvm to create a deployable :any:`GraphModule `.
# We can use the :any:`set_input `,
# :any:`run ` and
# :any:`get_output ` functions
# to set the input, execute the graph and get the output we need.
#
import tvm
import numpy as np
from tvm.contrib import graph_runtime, util

module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
x_np = np.array([1, 2, 3, 4]).astype("float32")
y_np = np.array([4, 4, 4, 4]).astype("float32")
# set input to the graph module
module.set_input(x=x_np, y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())
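######################################################################
# As a quick sanity check (purely for illustration, not required for
# deployment), we can compare the result against a plain numpy
# evaluation of :math:`x + \sqrt{y}`.
#
np.testing.assert_allclose(out.asnumpy(), x_np + np.sqrt(y_np), rtol=1e-5)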
######################################################################
# Provide Model Parameters
# ------------------------
# Most deep learning models contain two types of inputs: parameters
# that remain fixed during inference, and data inputs that change
# for each inference task. It is helpful to provide this
# information to NNVM. Let us assume that ``y`` is the parameter
# in our example. We can provide the model parameter information
# by the ``params`` argument to :any:`nnvm.compiler.build`.
#
deploy_graph, lib, params = nnvm.compiler.build(
    compute_graph, target="cuda", shape={"x": shape}, params={"y": y_np})

######################################################################
# This time we will need the params value returned by :any:`nnvm.compiler.build`.
# NNVM applies optimizations to pre-compute the intermediate values in
# the graph that can be determined by parameters. In this case
# :math:`\sqrt{y}` can be pre-computed. The pre-computed values
# are returned as new params. We can print out the new compiled library
# to confirm that the fused kernel now only contains the add.
#
print("-----optimized params-----")
print(params)
print("-------deploy library-------")
print(lib.imported_modules[0].get_source())

######################################################################
# Save the Deployed Module
# ------------------------
# We can save the ``deploy_graph``, ``lib`` and ``params`` separately
# and load them back later. We can use :any:`tvm.module.Module` to export
# the compiled library. ``deploy_graph`` is saved in JSON format and ``params``
# is serialized into a bytearray.
#
temp = util.tempdir()
path_lib = temp.relpath("deploy.so")
lib.export_library(path_lib)
with open(temp.relpath("deploy.json"), "w") as fo:
    fo.write(deploy_graph.json())
with open(temp.relpath("deploy.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# We can load the module back.
loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
params = nnvm.compiler.load_param_dict(loaded_params)
# directly load from byte array
module.load_params(loaded_params)
module.run(x=x_np)
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())

######################################################################
# Deploy using Another Language
# -----------------------------
# We use Python in this example for demonstration.
# We can also deploy the compiled modules with other languages
# supported by TVM such as c++, java, javascript.
# The graph module itself is fully embedded in the TVM runtime.
#
# The following block demonstrates how we can directly use TVM's
# runtime API to execute the compiled module.
# You can find a similar runtime API in the TVMRuntime of other languages.
#
fcreate = tvm.get_global_func("tvm.graph_runtime.create")
ctx = tvm.gpu(0)
gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
set_input("x", tvm.nd.array(x_np))
gmodule["load_params"](loaded_params)
run()
out = tvm.nd.empty(shape)
get_output(0, out)
print(out.asnumpy())
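######################################################################
# The workflow above is not tied to the GPU either. As a sketch
# (assuming the ``llvm`` CPU target is enabled in your TVM build),
# the same graph can be compiled for the CPU and run through the same
# graph runtime API.
#
cpu_graph, cpu_lib, cpu_params = nnvm.compiler.build(
    compute_graph, target="llvm", shape={"x": shape}, params={"y": y_np})
cpu_module = graph_runtime.create(cpu_graph, cpu_lib, tvm.cpu(0))
# load the pre-computed parameters and set the data input
cpu_module.load_params(nnvm.compiler.save_param_dict(cpu_params))
cpu_module.set_input(x=x_np)
cpu_module.run()
print(cpu_module.get_output(0, out=tvm.nd.empty(shape)).asnumpy())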
diff --git a/nnvm/tutorials/mobilenet_inference_gpu.py b/nnvm/tutorials/mobilenet_inference_gpu.py
index 28e97244f88ab..e477e758cf912 100644
--- a/nnvm/tutorials/mobilenet_inference_gpu.py
+++ b/nnvm/tutorials/mobilenet_inference_gpu.py
@@ -17,19 +17,24 @@
# ---------------------------------
# NNVM optimizes the graph and relies on TVM to generate fast GPU code.
# To get the maximum performance, we need to enable nvcc's compiler hook.
-# This gives better performance than nvrtc mode.
+# This usually gives better performance than nvrtc mode.
@tvm.register_func
def tvm_callback_cuda_compile(code):
-    ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
+    ptx = nvcc.compile_cuda(code, target="ptx")
    return ptx

######################################################################
# Prepare the Benchmark
# ---------------------
# We construct a standard imagenet inference benchmark.
-# We use nnvm's testing utility to produce the model description and random parameters so that the example does not
-# depend on a specific front-end framework.
+# NNVM needs two things to compile a deep learning model:
+#
+# - net: the graph representation of the computation
+# - params: a dictionary of str to parameters
+#
+# We use nnvm's testing utility to produce the model description and random parameters
+# so that the example does not depend on a specific front-end framework.
#
# .. note::
#
@@ -48,11 +53,6 @@ def tvm_callback_cuda_compile(code):
######################################################################
# Compile the Graph
# -----------------
-# NNVM needs two things to compile a deep learning model:
-#
-# - net: the graph representation of the computation
-# - params: a dictionary of str to parameters
-#
# To compile the graph, we call the build function with the graph
# configuration and parameters.
# When parameters are provided, NNVM will pre-compute certain part of the graph if possible (e.g. simplify batch normalization to scale shift),