diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
new file mode 100644
index 0000000000000..0bf1613c8d66e
--- /dev/null
+++ b/apps/bundle_deploy/Makefile
@@ -0,0 +1,39 @@
+# Makefile Example to bundle TVM modules.
+# Resolve the TVM checkout once (":=") rather than forking a shell per expansion.
+TVM_ROOT := $(shell cd ../..; pwd)
+NNVM_PATH = nnvm
+DMLC_CORE = ${TVM_ROOT}/3rdparty/dmlc-core
+# Compiler flags shared by the demo executable and the bundle.
+PKG_CFLAGS = -std=c++14 -Oz -fPIC -I${TVM_ROOT}/include \
+	-I${DMLC_CORE}/include -I${TVM_ROOT}/3rdparty/dlpack/include
+PKG_LDFLAGS = -L${TVM_ROOT}/build
+build_dir := build
+# "test" and "clean" name commands, not files, so they must always run.
+.PHONY: test clean
+# Default goal: build the demo loader and the bundle, then run the loader on it.
+test: $(build_dir)/demo $(build_dir)/bundle.so
+	$< $(word 2,$^)
+# Build the loader. It calls dlopen/dlsym, so link libdl (needed before glibc 2.34).
+$(build_dir)/demo: demo.cc
+	@mkdir -p $(@D)
+	$(CXX) $(PKG_CFLAGS) -o $@ $^ -ldl
+
+# Turn graph.json into a C array with `xxd -i` (symbol names come from the path).
+$(build_dir)/graph.json.cc: $(build_dir)/graph.json
+	xxd -i $< > $@
+
+# Turn params.bin into a C array with `xxd -i` (symbol names come from the path).
+$(build_dir)/params.bin.cc: $(build_dir)/params.bin
+	xxd -i $< > $@
+# One script run writes all three files; chain them so `make -j` runs it only once.
+$(build_dir)/model.o: build_model.py
+	python $< -o $(build_dir)
+$(build_dir)/graph.json $(build_dir)/params.bin: $(build_dir)/model.o ;
+# Link bundle.cc, the runtime sources pulled in by runtime.cc, the compiled model
+# and the embedded graph/params arrays into one shared object (hidden visibility).
+$(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model.o $(build_dir)/graph.json.cc $(build_dir)/params.bin.cc
+	@mkdir -p $(dir $@)
+	$(CXX) -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS)
+# Remove all build outputs; $(RM) is "rm -f", so a missing directory is not an error.
+clean:
+	$(RM) -r $(build_dir)
diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md
new file mode 100644
index 0000000000000..9019c44628c58
--- /dev/null
+++ b/apps/bundle_deploy/README.md
@@ -0,0 +1,30 @@
+How to Bundle TVM Modules
+=========================
+
+This folder contains an example of how to bundle a TVM module (with the required
+interpreter runtime modules such as `runtime::GraphRuntime`, the graph JSON, and
+the params) into a single, self-contained shared object (`bundle.so`) which
+exposes a C API wrapping the appropriate `runtime::GraphRuntime` instance.
+
+This is useful for cases where we'd like to avoid deploying the TVM runtime
+components to the target host in advance - instead, we simply deploy the bundled
+shared-object to the host, which embeds both the model and the runtime
+components. The bundle should only depend on libc/libc++.
+
+It also contains example code that loads this shared object and invokes the
+packaged TVM model instance.
+
+Type the following command to run the sample code under the current folder,
+after building TVM first.
+
+```bash
+make test
+```
+
+This will:
+
+- Download the mobilenet0.25 model from the MXNet Gluon Model Zoo
+- Compile the model with NNVM
+- Build a `bundle.so` shared object containing the model specification and parameters
+- Build a `demo` executable that `dlopen`'s `bundle.so`, instantiates the
+  contained graph runtime, and invokes the run function on a random input.
diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py
new file mode 100644
index 0000000000000..901996b8774e9
--- /dev/null
+++ b/apps/bundle_deploy/build_model.py
@@ -0,0 +1,40 @@
+"""Creates a simple TVM modules."""
+
+import argparse
+import os
+import nnvm.compiler
+import nnvm.testing
+import tvm
+import logging
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--out-dir', default='.')
+    opts = parser.parse_args()
+
+    dshape = (1, 3, 224, 224)
+    from mxnet.gluon.model_zoo.vision import get_model
+    block = get_model('mobilenet0.25', pretrained=True)
+    net, params = nnvm.frontend.from_mxnet(block)
+    net = nnvm.sym.softmax(net)
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, 'llvm --system-lib', shape={'data': dshape}, params=params)
+    print(graph.symbol().debug_str())
+    build_dir = os.path.abspath(opts.out_dir)
+    if not os.path.isdir(build_dir):
+        os.makedirs(build_dir)
+
+    lib.save(os.path.join(build_dir, 'model.o'))
+    with open(os.path.join(build_dir, 'graph.json'), 'w') as f_graph_json:
+        f_graph_json.write(graph.json())
+    with open(os.path.join(build_dir, 'params.bin'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc
new file mode 100644
index 0000000000000..b9557092737c8
--- /dev/null
+++ b/apps/bundle_deploy/bundle.cc
@@ -0,0 +1,47 @@
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/registry.h>
+#include <memory>
+
+extern unsigned char build_graph_json[];
+extern unsigned int build_graph_json_len;
+extern unsigned char build_params_bin[];
+extern unsigned int build_params_bin_len;
+
+#define TVM_BUNDLE_FUNCTION __attribute__((visibility("default"))) extern "C"
+
+TVM_BUNDLE_FUNCTION void *tvm_runtime_create() {
+  // The graph JSON and parameter blob are the xxd-generated arrays linked in.
+  const std::string graph_json(build_graph_json,
+                               build_graph_json + build_graph_json_len);
+  TVMByteArray param_blob;
+  param_blob.data = reinterpret_cast<const char *>(build_params_bin);
+  param_blob.size = build_params_bin_len;
+  // Create a graph runtime on CPU device 0 over the system library module.
+  const auto *get_syslib = tvm::runtime::Registry::Get("module._GetSystemLib");
+  const auto *create = tvm::runtime::Registry::Get("tvm.graph_runtime.create");
+  tvm::runtime::Module syslib = (*get_syslib)();
+  tvm::runtime::Module graph_rt =
+      (*create)(graph_json, syslib, static_cast<int>(kDLCPU), 0);
+  graph_rt.GetFunction("load_params")(param_blob);
+  return new tvm::runtime::Module(graph_rt);  // released by tvm_runtime_destroy
+}
+// Deletes the Module that tvm_runtime_create allocated behind `handle`.
+TVM_BUNDLE_FUNCTION void tvm_runtime_destroy(void *handle) {
+  delete static_cast<tvm::runtime::Module *>(handle);
+}
+// Calls the module's "set_input" with `name` and `tensor` cast to DLTensor*.
+TVM_BUNDLE_FUNCTION void tvm_runtime_set_input(void *handle, const char *name,
+                                               void *tensor) {
+  auto *mod = static_cast<tvm::runtime::Module *>(handle);
+  mod->GetFunction("set_input")(name, static_cast<DLTensor *>(tensor));
+}
+// Calls the "run" function of the Module behind `handle`, with no arguments.
+TVM_BUNDLE_FUNCTION void tvm_runtime_run(void *handle) {
+  static_cast<tvm::runtime::Module *>(handle)->GetFunction("run")();
+}
+// Calls the module's "get_output" with `index` and `tensor` cast to DLTensor*.
+TVM_BUNDLE_FUNCTION void tvm_runtime_get_output(void *handle, int index,
+                                                void *tensor) {
+  auto *mod = static_cast<tvm::runtime::Module *>(handle);
+  mod->GetFunction("get_output")(index, static_cast<DLTensor *>(tensor));
+}
diff --git a/apps/bundle_deploy/demo.cc b/apps/bundle_deploy/demo.cc
new file mode 100644
index 0000000000000..3abd24a1d691a
--- /dev/null
+++ b/apps/bundle_deploy/demo.cc
@@ -0,0 +1,69 @@
+#include "tvm/runtime/c_runtime_api.h"
+#include <assert.h>
+#include <dlfcn.h> //dlopen
+#include <dlpack/dlpack.h>
+#include <iostream>
+#include <random>
+#include <vector>
+
+int main(int argc, char **argv) {
+  // Usage: demo <bundle.so>. Loads the bundle with dlopen, feeds the model one
+  // random 1x3x224x224 float input, and prints the 1000 output values.
+  void *bundle = argc > 1 ? dlopen(argv[1], RTLD_LAZY | RTLD_LOCAL) : nullptr;
+  if (!bundle) {
+    std::cerr << (argc > 1 ? dlerror() : "usage: demo <bundle.so>") << std::endl;
+    return 1;
+  }
+
+  // Clear dlerror() first so only an error from this dlsym call is reported.
+  auto resolve = [bundle](const char *symbol) -> void * {
+    dlerror();
+    void *address = dlsym(bundle, symbol);
+    const char *message = dlerror();
+    if (message) std::cerr << "dlsym failed: " << message << std::endl;
+    return message ? nullptr : address;
+  };
+  auto *create = reinterpret_cast<void *(*)()>(resolve("tvm_runtime_create"));
+  auto *set_input = reinterpret_cast<void (*)(void *, const char *, void *)>(
+      resolve("tvm_runtime_set_input"));
+  auto *run = reinterpret_cast<void (*)(void *)>(resolve("tvm_runtime_run"));
+  auto *get_output = reinterpret_cast<void (*)(void *, int, void *)>(
+      resolve("tvm_runtime_get_output"));
+  auto *destroy =
+      reinterpret_cast<void (*)(void *)>(resolve("tvm_runtime_destroy"));
+  if (!create || !set_input || !run || !get_output || !destroy) return 1;
+
+  // Describe a CPU float32 tensor over caller-owned storage and shape.
+  auto describe = [](std::vector<float> &storage, std::vector<int64_t> &shape) {
+    DLTensor t;
+    t.data = storage.data();
+    t.ctx = DLContext{kDLCPU, 0};
+    t.ndim = static_cast<int>(shape.size());
+    t.dtype = DLDataType{kDLFloat, 32, 1};
+    t.shape = shape.data();
+    t.strides = nullptr;
+    t.byte_offset = 0;
+    return t;
+  };
+  // A fixed seed makes every run feed the same pseudo-random input.
+  std::vector<float> input_storage(1 * 3 * 224 * 224);
+  std::mt19937 gen(0);
+  for (auto &e : input_storage) {
+    e = std::uniform_real_distribution<float>(0.0, 1.0)(gen);
+  }
+  std::vector<int64_t> input_shape = {1, 3, 224, 224};
+  DLTensor input = describe(input_storage, input_shape);
+  std::vector<float> output_storage(1000);
+  std::vector<int64_t> output_shape = {1, 1000};
+  DLTensor output = describe(output_storage, output_shape);
+
+  void *handle = create();
+  set_input(handle, "data", &input);
+  run(handle);
+  get_output(handle, 0, &output);
+  for (size_t i = 0; i < output_storage.size(); ++i) {
+    std::cerr << "output[" << i << "]: " << output_storage[i] << std::endl;
+  }
+  destroy(handle);
+  return 0;
+}
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
new file mode 100644
index 0000000000000..2284953b8c16c
--- /dev/null
+++ b/apps/bundle_deploy/runtime.cc
@@ -0,0 +1,17 @@
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include "../../src/runtime/c_runtime_api.cc"
+#include "../../src/runtime/cpu_device_api.cc"
+#include "../../src/runtime/workspace_pool.cc"
+#include "../../src/runtime/module_util.cc"
+#include "../../src/runtime/module.cc"
+#include "../../src/runtime/registry.cc"
+#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/threading_backend.cc"
+#include "../../src/runtime/thread_pool.cc"
+#include "../../src/runtime/ndarray.cc"
+#include "../../src/runtime/system_lib_module.cc"
+#include "../../src/runtime/graph/graph_runtime.cc"