From c0c4a0e66cdca0b0c312c00fab8340f4390736dd Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 6 Apr 2021 13:30:41 -0700 Subject: [PATCH 01/47] MXNet2.0: Add cpp-package --- CMakeLists.txt | 10 + cpp-package/CMakeLists.txt | 52 ++ cpp-package/README.md | 62 ++ cpp-package/cpp-package.mk | 45 ++ cpp-package/example/CMakeLists.txt | 56 ++ cpp-package/example/Makefile | 56 ++ cpp-package/example/README.md | 131 +++ cpp-package/example/alexnet.cpp | 358 +++++++++ cpp-package/example/charRNN.cpp | 758 ++++++++++++++++++ cpp-package/example/example.mk | 39 + cpp-package/example/feature_extract/Makefile | 41 + cpp-package/example/feature_extract/README.md | 29 + .../feature_extract/feature_extract.cpp | 139 ++++ .../prepare_data_with_opencv.cpp | 55 ++ cpp-package/example/feature_extract/run.sh | 38 + cpp-package/example/get_data.sh | 64 ++ cpp-package/example/googlenet.cpp | 198 +++++ cpp-package/example/inception_bn.cpp | 261 ++++++ cpp-package/example/inference/CMakeLists.txt | 22 + cpp-package/example/inference/Makefile | 40 + cpp-package/example/inference/README.md | 213 +++++ .../example/inference/imagenet_inference.cpp | 662 +++++++++++++++ cpp-package/example/inference/inference.mk | 39 + .../inference/sentiment_analysis_rnn.cpp | 488 +++++++++++ .../inference/unit_test_imagenet_inference.sh | 63 ++ .../unit_test_sentiment_analysis_rnn.sh | 41 + cpp-package/example/lenet.cpp | 267 ++++++ cpp-package/example/lenet_with_mxdataiter.cpp | 203 +++++ cpp-package/example/mlp.cpp | 182 +++++ cpp-package/example/mlp_cpu.cpp | 147 ++++ cpp-package/example/mlp_csv.cpp | 276 +++++++ cpp-package/example/mlp_gpu.cpp | 163 ++++ cpp-package/example/mnist_to_csv.py | 59 ++ cpp-package/example/resnet.cpp | 283 +++++++ .../example/run_lenet_with_mxdataiter.sh | 23 + cpp-package/example/test_kvstore.cpp | 201 +++++ cpp-package/example/test_ndarray_copy.cpp | 62 ++ cpp-package/example/test_optimizer.cpp | 36 + cpp-package/example/test_score.cpp | 164 ++++ 
.../example/unittests/unit_test_mlp_csv.sh | 63 ++ cpp-package/example/utils.h | 76 ++ cpp-package/include/mxnet-cpp/.gitignore | 2 + cpp-package/include/mxnet-cpp/CPPLINT.cfg | 19 + cpp-package/include/mxnet-cpp/MxNetCpp.h | 43 + cpp-package/include/mxnet-cpp/base.h | 57 ++ cpp-package/include/mxnet-cpp/contrib.h | 115 +++ cpp-package/include/mxnet-cpp/executor.h | 188 +++++ cpp-package/include/mxnet-cpp/executor.hpp | 103 +++ cpp-package/include/mxnet-cpp/initializer.h | 257 ++++++ cpp-package/include/mxnet-cpp/io.h | 149 ++++ cpp-package/include/mxnet-cpp/io.hpp | 108 +++ cpp-package/include/mxnet-cpp/kvstore.h | 77 ++ cpp-package/include/mxnet-cpp/kvstore.hpp | 268 +++++++ cpp-package/include/mxnet-cpp/lr_scheduler.h | 97 +++ cpp-package/include/mxnet-cpp/metric.h | 210 +++++ cpp-package/include/mxnet-cpp/model.h | 77 ++ cpp-package/include/mxnet-cpp/ndarray.h | 485 +++++++++++ cpp-package/include/mxnet-cpp/ndarray.hpp | 466 +++++++++++ cpp-package/include/mxnet-cpp/op_map.h | 111 +++ cpp-package/include/mxnet-cpp/op_suppl.h | 180 +++++ cpp-package/include/mxnet-cpp/op_util.h | 65 ++ cpp-package/include/mxnet-cpp/operator.h | 210 +++++ cpp-package/include/mxnet-cpp/operator.hpp | 180 +++++ cpp-package/include/mxnet-cpp/optimizer.h | 215 +++++ cpp-package/include/mxnet-cpp/optimizer.hpp | 495 ++++++++++++ cpp-package/include/mxnet-cpp/shape.h | 408 ++++++++++ cpp-package/include/mxnet-cpp/symbol.h | 300 +++++++ cpp-package/include/mxnet-cpp/symbol.hpp | 424 ++++++++++ cpp-package/scripts/OpWrapperGenerator.py | 440 ++++++++++ cpp-package/scripts/lint.py | 193 +++++ cpp-package/tests/ci_test.sh | 73 ++ cpp-package/tests/travis/run_test.sh | 42 + cpp-package/tests/travis/setup.sh | 23 + include/mxnet/c_api.h | 2 +- src/c_api/c_api_ndarray.cc | 31 +- src/operator/softmax_output-inl.h | 476 +++++++++++ src/operator/softmax_output.cc | 285 +++++++ 77 files changed, 13024 insertions(+), 15 deletions(-) create mode 100644 cpp-package/CMakeLists.txt create mode 100644 
cpp-package/README.md create mode 100644 cpp-package/cpp-package.mk create mode 100644 cpp-package/example/CMakeLists.txt create mode 100644 cpp-package/example/Makefile create mode 100644 cpp-package/example/README.md create mode 100644 cpp-package/example/alexnet.cpp create mode 100644 cpp-package/example/charRNN.cpp create mode 100644 cpp-package/example/example.mk create mode 100644 cpp-package/example/feature_extract/Makefile create mode 100644 cpp-package/example/feature_extract/README.md create mode 100644 cpp-package/example/feature_extract/feature_extract.cpp create mode 100644 cpp-package/example/feature_extract/prepare_data_with_opencv.cpp create mode 100755 cpp-package/example/feature_extract/run.sh create mode 100755 cpp-package/example/get_data.sh create mode 100644 cpp-package/example/googlenet.cpp create mode 100644 cpp-package/example/inception_bn.cpp create mode 100644 cpp-package/example/inference/CMakeLists.txt create mode 100644 cpp-package/example/inference/Makefile create mode 100644 cpp-package/example/inference/README.md create mode 100644 cpp-package/example/inference/imagenet_inference.cpp create mode 100644 cpp-package/example/inference/inference.mk create mode 100755 cpp-package/example/inference/sentiment_analysis_rnn.cpp create mode 100755 cpp-package/example/inference/unit_test_imagenet_inference.sh create mode 100755 cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh create mode 100644 cpp-package/example/lenet.cpp create mode 100644 cpp-package/example/lenet_with_mxdataiter.cpp create mode 100644 cpp-package/example/mlp.cpp create mode 100644 cpp-package/example/mlp_cpu.cpp create mode 100644 cpp-package/example/mlp_csv.cpp create mode 100644 cpp-package/example/mlp_gpu.cpp create mode 100644 cpp-package/example/mnist_to_csv.py create mode 100644 cpp-package/example/resnet.cpp create mode 100755 cpp-package/example/run_lenet_with_mxdataiter.sh create mode 100644 cpp-package/example/test_kvstore.cpp create mode 100644 
cpp-package/example/test_ndarray_copy.cpp create mode 100644 cpp-package/example/test_optimizer.cpp create mode 100644 cpp-package/example/test_score.cpp create mode 100755 cpp-package/example/unittests/unit_test_mlp_csv.sh create mode 100644 cpp-package/example/utils.h create mode 100644 cpp-package/include/mxnet-cpp/.gitignore create mode 100644 cpp-package/include/mxnet-cpp/CPPLINT.cfg create mode 100644 cpp-package/include/mxnet-cpp/MxNetCpp.h create mode 100644 cpp-package/include/mxnet-cpp/base.h create mode 100644 cpp-package/include/mxnet-cpp/contrib.h create mode 100644 cpp-package/include/mxnet-cpp/executor.h create mode 100644 cpp-package/include/mxnet-cpp/executor.hpp create mode 100644 cpp-package/include/mxnet-cpp/initializer.h create mode 100644 cpp-package/include/mxnet-cpp/io.h create mode 100644 cpp-package/include/mxnet-cpp/io.hpp create mode 100644 cpp-package/include/mxnet-cpp/kvstore.h create mode 100644 cpp-package/include/mxnet-cpp/kvstore.hpp create mode 100644 cpp-package/include/mxnet-cpp/lr_scheduler.h create mode 100644 cpp-package/include/mxnet-cpp/metric.h create mode 100644 cpp-package/include/mxnet-cpp/model.h create mode 100644 cpp-package/include/mxnet-cpp/ndarray.h create mode 100644 cpp-package/include/mxnet-cpp/ndarray.hpp create mode 100644 cpp-package/include/mxnet-cpp/op_map.h create mode 100644 cpp-package/include/mxnet-cpp/op_suppl.h create mode 100644 cpp-package/include/mxnet-cpp/op_util.h create mode 100644 cpp-package/include/mxnet-cpp/operator.h create mode 100644 cpp-package/include/mxnet-cpp/operator.hpp create mode 100644 cpp-package/include/mxnet-cpp/optimizer.h create mode 100644 cpp-package/include/mxnet-cpp/optimizer.hpp create mode 100644 cpp-package/include/mxnet-cpp/shape.h create mode 100644 cpp-package/include/mxnet-cpp/symbol.h create mode 100644 cpp-package/include/mxnet-cpp/symbol.hpp create mode 100644 cpp-package/scripts/OpWrapperGenerator.py create mode 100644 cpp-package/scripts/lint.py create mode 
100755 cpp-package/tests/ci_test.sh create mode 100755 cpp-package/tests/travis/run_test.sh create mode 100755 cpp-package/tests/travis/setup.sh create mode 100644 src/operator/softmax_output-inl.h create mode 100644 src/operator/softmax_output.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c8865aa890a..4c35ee20f631 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,7 @@ option(USE_JEMALLOC "Build with Jemalloc support" OFF) option(USE_LIBJPEG_TURBO "Use libjpeg-turbo" OFF) option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) +option(USE_CPP_PACKAGE "Build C++ Package" OFF) option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) option(USE_GPROF "Compile with gprof (profiling) flag" OFF) option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path @@ -296,6 +297,10 @@ if(USE_ONEDNN) set_target_properties(dnnl PROPERTIES CXX_CLANG_TIDY "") # don't lint 3rdparty dependency endif() +if(USE_CPP_PACKAGE) + add_definitions(-DMXNET_USE_CPP_PACKAGE=1) +endif() + if(USE_INTGEMM) message(STATUS "Using intgemm") add_subdirectory(3rdparty/intgemm EXCLUDE_FROM_ALL) @@ -960,6 +965,11 @@ if(INSTALL_PYTHON_VERSIONS) endforeach() endif() +if(USE_CPP_PACKAGE) + add_subdirectory(cpp-package) + target_compile_definitions(mxnet PUBLIC MXNET_USE_CPP_PACKAGE=1) +endif() + if(NOT CMAKE_BUILD_TYPE STREQUAL "Distribution") # Staticbuild applies linker version script to hide private symbols, breaking unit tests add_subdirectory(tests) diff --git a/cpp-package/CMakeLists.txt b/cpp-package/CMakeLists.txt new file mode 100644 index 000000000000..db64fa99bddf --- /dev/null +++ b/cpp-package/CMakeLists.txt @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.13) +project(mxnet_cpp C CXX) + +add_library(mxnet_cpp INTERFACE) + +set(CPP_PACKAGE_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/) +target_include_directories(mxnet_cpp INTERFACE "${CPP_PACKAGE_INCLUDE_DIR}") +file(GLOB_RECURSE CPP_PACKAGE_HEADERS + "${CPP_PACKAGE_INCLUDE_DIR}/*.h" + "${CPP_PACKAGE_INCLUDE_DIR}/*.hpp") +set(CPP_PACKAGE_OP_H_HEADER ${CMAKE_CURRENT_LIST_DIR}/include/mxnet-cpp/op.h) +target_sources(mxnet_cpp INTERFACE ${CPP_PACKAGE_HEADERS} ${CPP_PACKAGE_OP_H_HEADER}) +target_link_libraries(mxnet_cpp INTERFACE mxnet ${mxnet_LINKER_LIBS}) + +add_custom_target( + cpp_package_op_h ALL + BYPRODUCTS ${CPP_PACKAGE_OP_H_HEADER} + MAIN_DEPENDENCY mxnet + DEPENDS mxnet ${CMAKE_CURRENT_SOURCE_DIR}/scripts/OpWrapperGenerator.py + COMMAND echo "Running: OpWrapperGenerator.py" + COMMAND python OpWrapperGenerator.py $ + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts +) +add_dependencies(mxnet_cpp cpp_package_op_h) + +if(MSVC) + target_compile_options(mxnet_cpp INTERFACE "/utf-8") +endif(MSVC) + +if(BUILD_CPP_EXAMPLES) + add_subdirectory(example) + add_subdirectory(example/inference) +endif() + +install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/cpp-package/README.md b/cpp-package/README.md new file mode 100644 index 000000000000..77ff0ee36e80 --- /dev/null +++ b/cpp-package/README.md @@ -0,0 +1,62 @@ + + + + + + + + + + + + 
+ + + + + +# MXNet C++ Package + +The MXNet C++ Package provides C++ API bindings to the users of MXNet. Currently, these bindings are not available as standalone package. +The users of these bindings are required to build this package as mentioned below. + +## Building C++ Package + +The cpp-package directory contains the implementation of C++ API. As mentioned above, users are required to build this directory or package before using it. +**The cpp-package is built while building the MXNet shared library, *libmxnet.so*.** + +### Steps to build the C++ package: +1. Building the MXNet C++ package requires building MXNet from source. +2. Clone the MXNet GitHub repository **recursively** to ensure the code in submodules is available for building MXNet. + ``` + git clone --recursive https://github.com/apache/incubator-mxnet mxnet + ``` + +3. Install the [prerequisites](), desired [BLAS libraries]() and optional [OpenCV, CUDA, and cuDNN]() for building MXNet from source. +4. There is a configuration file for make, [make/config.mk]() that contains all the compilation options. You can edit this file and set the appropriate options prior to running the **make** command. +5. Please refer to [platform specific build instructions]() and available [build configurations](https://mxnet.apache.org/install/build_from_source#build-configurations) for more details. +5. For enabling the build of C++ Package, set the **USE\_CPP\_PACKAGE = 1** in [make/config.mk](). Optionally, the compilation flag can also be specified on **make** command line as follows. + ``` + make -j USE_CPP_PACKAGE=1 + ``` + +## Usage + +In order to consume the C++ API please follow the steps below. + +1. Ensure that the MXNet shared library is built from source with the **USE\_CPP\_PACKAGE = 1**. +2. Include the [MxNetCpp.h]() in the program that is going to consume MXNet C++ API. + ``` + #include + ``` +3. 
While building the program, ensure that the correct paths to the directories containing the header files and the MXNet shared library are specified.
+ +ifndef LINT_LANG + LINT_LANG="all" +endif + +ifdef CAFFE_PATH +export LD_LIBRARY_PATH=$(CAFFE_PATH)/lib +endif + +CPP_PACKAGE_OP_H_FILE = cpp-package/include/mxnet-cpp/op.h + +EXTRA_PACKAGES += cpp-package-all +EXTRA_PACKAGES_CLEAN += cpp-package-clean + +.PHONY: cpp-package-all cpp-package-lint cpp-package-clean + +cpp-package-all: $(CPP_PACKAGE_OP_H_FILE) + +cpp-package-clean: + rm -f $(CPP_PACKAGE_OP_H_FILE) + +$(CPP_PACKAGE_OP_H_FILE): lib/libmxnet.so cpp-package/scripts/OpWrapperGenerator.py + (cd cpp-package/scripts; python OpWrapperGenerator.py $(ROOTDIR)/lib/libmxnet.so) + +cpp-package-lint: + (cd cpp-package; python scripts/lint.py dmlc ${LINT_LANG} include example) + +include cpp-package/example/example.mk +include cpp-package/example/inference/inference.mk diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt new file mode 100644 index 000000000000..d682a88c7760 --- /dev/null +++ b/cpp-package/example/CMakeLists.txt @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Explicitly set GENERATED property https://gitlab.kitware.com/cmake/cmake/issues/18399 +set_property(SOURCE ${CMAKE_CURRENT_LIST_DIR}/../include/mxnet-cpp/op.h PROPERTY GENERATED 1) + +add_executable(lenet lenet.cpp) +target_link_libraries(lenet mxnet_cpp) + +add_executable(lenet_with_mxdataiter lenet_with_mxdataiter.cpp) +target_link_libraries(lenet_with_mxdataiter mxnet_cpp) + +add_executable(alexnet alexnet.cpp) +target_link_libraries(alexnet mxnet_cpp) + +add_executable(charRNN charRNN.cpp) +target_link_libraries(charRNN mxnet_cpp) + +add_executable(googlenet googlenet.cpp) +target_link_libraries(googlenet mxnet_cpp) + +add_executable(inception_bn inception_bn.cpp) +target_link_libraries(inception_bn mxnet_cpp) + +add_executable(mlp mlp.cpp) +target_link_libraries(mlp mxnet_cpp) + +add_executable(mlp_cpu mlp_cpu.cpp) +target_link_libraries(mlp_cpu mxnet_cpp) + +add_executable(mlp_gpu mlp_gpu.cpp) +target_link_libraries(mlp_gpu mxnet_cpp) + +add_executable(resnet resnet.cpp) +target_link_libraries(resnet mxnet_cpp) + + +if(MSVC) + add_custom_target(cpp_package_deploy_library ALL + DEPENDS mxnet + COMMAND ${CMAKE_COMMAND} -E copy $ $) +endif() diff --git a/cpp-package/example/Makefile b/cpp-package/example/Makefile new file mode 100644 index 000000000000..89af219d3103 --- /dev/null +++ b/cpp-package/example/Makefile @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ifeq ($(OS),Windows_NT) + UNAME_S := Windows +else + UNAME_S := $(shell uname -s) +endif + +prebuild : + @mkdir -p build + $(shell ./get_data.sh) + $(shell cp -r ../../lib ./) +CPPEX_SRC = $(wildcard *.cpp) +CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC)) + +CFLAGS += -I../../include -I../../3rdparty/tvm/nnvm/include -I../../3rdparty/dmlc-core/include -I../include + +ifeq ($(MXNET_USE_CPU),1) + CFLAGS += -D MXNET_USE_CPU +endif + +# CPPEX_CFLAGS += -I../include +CPPEX_EXTRA_LDFLAGS := -L../../build -lmxnet +MXNET_LIB_PATH := $(shell cd ../../build; pwd) + +.PHONY: all clean + +all: prebuild $(CPPEX_EXE) + +debug: CPPEX_CFLAGS += -DDEBUG -g +debug: prebuild all + +$(CPPEX_EXE):% : %.cpp + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o build/$@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS) +ifeq ($(UNAME_S), Darwin) + install_name_tool -add_rpath @loader_path build/$@ + install_name_tool -add_rpath $(MXNET_LIB_PATH) build/$@ +endif + +clean: + @rm -rf build diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md new file mode 100644 index 000000000000..555316dd1ac3 --- /dev/null +++ b/cpp-package/example/README.md @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + +# MXNet C++ Package Examples + +## Building C++ examples + +The examples in this folder demonstrate the **training** workflow. The **inference workflow** related examples can be found in [inference]() folder. +Please build the MXNet C++ Package as explained in the [README]() File before building these examples manually. 
+The examples in this folder are built while building the MXNet library and cpp-package from source. However, they can be built manually as follows + +From cpp-package/examples directory + +- Build all examples in release mode: **make all** +- Build all examples in debug mode: **make debug** + +By default, the examples are built to be run on GPU. To build examples to run on CPU: + +- Release: **make all MXNET\_USE\_CPU=1** +- Debug: **make debug MXNET\_USE\_CPU=1** + +The examples that are built to be run on GPU may not work on the non-GPU machines. +The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.) + + +## Examples demonstrating training workflow + +This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu. + +### [alexnet.cpp]() + +The example implements the C++ version of AlexNet. The networks trains on MNIST data. The number of epochs can be specified as a command line argument. For example to train with 10 epochs use the following: + +``` +build/alexnet 10 +``` + +### [googlenet.cpp]() + +The code implements a GoogLeNet/Inception network using the C++ API. The example uses MNIST data to train the network. By default, the example trains the model for 100 epochs. The number of epochs can also be specified in the command line. For example, to train the model for 10 epochs use the following: + +``` +build/googlenet 10 +``` + +### [mlp.cpp]() + +The code implements a multilayer perceptron from scratch. The example creates its own dummy data to train the model. The example does not require command line parameters. 
It trains the model for 20,000 epochs. +To run the example use the following command: + +``` +build/mlp +``` + +### [mlp_cpu.cpp]() + +The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind" C++ API and MNISTIter. The example is designed to work on CPU. The example does not require command line parameters. +To run the example use the following command: + +``` +build/mlp_cpu +``` + +### [mlp_gpu.cpp]() + +The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and MNISTIter. The example is designed to work on GPU. The example does not require command line arguments. To run the example execute following command: + +``` +build/mlp_gpu +``` + +### [mlp_csv.cpp]() + +The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind" C++ API and CSVIter. The CSVIter can iterate data that is in CSV format. The example can be run on CPU or GPU. The example usage is as follows: + +``` +build/mlp_csv --train data/mnist_data/mnist_train.csv --test data/mnist_data/mnist_test.csv --epochs 10 --batch_size 100 --hidden_units "128 64 64" --gpu +``` +* To get the `mnist_training_set.csv` and `mnist_test_set.csv` please run the following command: +```python +# in incubator-mxnet/cpp-package/example directory +python mnist_to_csv.py ./data/mnist_data/train-images-idx3-ubyte ./data/mnist_data/train-labels-idx1-ubyte ./data/mnist_data/mnist_train.csv 60000 +python mnist_to_csv.py ./data/mnist_data/t10k-images-idx3-ubyte ./data/mnist_data/t10k-labels-idx1-ubyte ./data/mnist_data/mnist_test.csv 10000 +``` + +### [resnet.cpp]() + +The code implements a resnet model using the C++ API. The model is used to train MNIST data. The number of epochs for training the model can be specified on the command line. By default, model is trained for 100 epochs. 
For example, to train with 10 epochs use the following command: + +``` +build/resnet 10 +``` + +### [lenet.cpp]() + +The code implements a lenet model using the C++ API. It uses MNIST training data in CSV format to train the network. The example does not use built-in CSVIter to read the data from CSV file. The number of epochs can be specified on the command line. By default, the mode is trained for 100,000 epochs. For example, to train with 10 epochs use the following command: + +``` +build/lenet 10 +``` +### [lenet\_with\_mxdataiter.cpp]() + +The code implements a lenet model using the C++ API. It uses MNIST training data to train the network. The example uses built-in MNISTIter to read the data. The number of epochs can be specified on the command line. By default, the mode is trained for 100 epochs. For example, to train with 10 epochs use the following command: + +``` +build/lenet_with_mxdataiter 10 +``` + +In addition, there is `run_lenet_with_mxdataiter.sh` that downloads the mnist data and run `lenet_with_mxdataiter` example. + +### [inception_bn.cpp]() + +The code implements an Inception network using the C++ API with batch normalization. The example uses MNIST data to train the network. The model trains for 100 epochs. The example can be run by executing the following command: + +``` +build/inception_bn +``` diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp new file mode 100644 index 000000000000..1c182182c1a5 --- /dev/null +++ b/cpp-package/example/alexnet.cpp @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ +#include +#include +#include +#include +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +Symbol AlexnetSymbol(int num_classes) { + auto input_data = Symbol::Variable("data"); + auto target_label = Symbol::Variable("label"); + /*stage 1*/ + auto conv1 = Operator("Convolution") + .SetParam("kernel", Shape(11, 11)) + .SetParam("num_filter", 96) + .SetParam("stride", Shape(4, 4)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(0, 0)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", input_data) + .CreateSymbol("conv1"); + auto relu1 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv1) + .CreateSymbol("relu1"); + auto pool1 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu1) + .CreateSymbol("pool1"); + auto lrn1 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool1) + .CreateSymbol("lrn1"); + /*stage 2*/ + auto conv2 = Operator("Convolution") + .SetParam("kernel", Shape(5, 5)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(2, 2)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", 
false) + .SetInput("data", lrn1) + .CreateSymbol("conv2"); + auto relu2 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv2) + .CreateSymbol("relu2"); + auto pool2 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu2) + .CreateSymbol("pool2"); + auto lrn2 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool2) + .CreateSymbol("lrn2"); + /*stage 3*/ + auto conv3 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", lrn2) + .CreateSymbol("conv3"); + auto relu3 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv3) + .CreateSymbol("relu3"); + auto conv4 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", relu3) + .CreateSymbol("conv4"); + auto relu4 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv4) + .CreateSymbol("relu4"); + auto conv5 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + 
.SetInput("data", relu4) + .CreateSymbol("conv5"); + auto relu5 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", conv5) + .CreateSymbol("relu5"); + auto pool3 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu5) + .CreateSymbol("pool3"); + /*stage4*/ + auto flatten = + Operator("Flatten").SetInput("data", pool3).CreateSymbol("flatten"); + auto fc1 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", flatten) + .CreateSymbol("fc1"); + auto relu6 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc1) + .CreateSymbol("relu6"); + auto dropout1 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu6) + .CreateSymbol("dropout1"); + /*stage5*/ + auto fc2 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", dropout1) + .CreateSymbol("fc2"); + auto relu7 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc2) + .CreateSymbol("relu7"); + auto dropout2 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu7) + .CreateSymbol("dropout2"); + /*stage6*/ + auto fc3 = Operator("FullyConnected") + .SetParam("num_hidden", num_classes) + .SetParam("no_bias", false) + .SetInput("data", dropout2) + .CreateSymbol("fc3"); + auto softmax = Operator("SoftmaxOutput") + .SetParam("grad_scale", 1) + .SetParam("ignore_label", -1) + .SetParam("multi_output", false) + .SetParam("use_ignore", false) + .SetParam("normalization", "null") /*batch,null,valid */ + .SetInput("data", fc3) + .SetInput("label", target_label) + .CreateSymbol("softmax"); + return softmax; +} + +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray pic_1channel; + 
Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(pic_1channel); + NDArray output; + Operator("tile") + .SetParam("reps", Shape(1, 3, 1, 1)) + (pic_1channel).Invoke(output); + return output; +} + +int main(int argc, char const *argv[]) { + /*basic config*/ + int max_epo = argc > 1 ? strtol(argv[1], nullptr, 10) : 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + /*context*/ + auto ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); + int batch_size = 32; +#if !MXNET_USE_CPU + if (num_gpu > 0) { + ctx = Context::gpu(); + batch_size = 256; + } +#endif + + TRY + /*net symbol*/ + auto Net = AlexnetSymbol(10); + + /*args_map and aux_map is used for parameters' saving*/ + std::map args_map; + std::map aux_map; + + /*we should tell mxnet the shape of data and label*/ + const Shape data_shape = Shape(batch_size, 3, 256, 256), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, ctx); + args_map["label"] = NDArray(label_shape, ctx); + + /*with data and label, executor can be generated automatically*/ + auto *exec = Net.SimpleBind(ctx, args_map); + auto arg_names = Net.ListArguments(); + aux_map = exec->aux_dict(); + args_map = exec->arg_dict(); + + /*if fine tune from some pre-trained model, we should load the parameters*/ + // NDArray::Load("./model/alex_params_3", nullptr, &args_map); + /*else, we should use initializer Xavier to init the params*/ + Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : args_map) { + /*be careful here, the arg's name must has some specific ends or starts for + * initializer to call*/ + xavier(arg.first, &arg.second); + } + + /*these binary files should be generated using im2rc tools, which can be found + * in mxnet/bin*/ + std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", + "./data/mnist_data/train-labels-idx1-ubyte", + "./data/mnist_data/t10k-images-idx3-ubyte", + 
"./data/mnist_data/t10k-labels-idx1-ubyte" + }; + + auto train_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } + + auto val_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } + + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + Accuracy acu_train, acu_val; + LogLoss logloss_train, logloss_val; + for (int epoch = 0; epoch < max_epo; ++epoch) { + LG << "Train Epoch: " << epoch; + /*reset the metric every epoch*/ + acu_train.Reset(); + /*reset the data iter every epoch*/ + train_iter.Reset(); + int iter = 0; + while (train_iter.Next()) { + auto batch = train_iter.GetDataBatch(); + /*use copyto to feed new data and label to the executor*/ + ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); + batch.label.CopyTo(&args_map["label"]); + exec->Forward(true); + exec->Backward(); + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + + NDArray::WaitAll(); + acu_train.Update(batch.label, exec->outputs[0]); + logloss_train.Reset(); + logloss_train.Update(batch.label, exec->outputs[0]); + ++iter; + LG << "EPOCH: " << epoch << " ITER: " << iter + << " Train Accuracy: " << acu_train.Get() + << " Train Loss: " << logloss_train.Get(); + } + LG << "EPOCH: " << epoch << " Train Accuracy: " << acu_train.Get(); + + LG << "Val Epoch: " << epoch; + acu_val.Reset(); + val_iter.Reset(); + logloss_val.Reset(); + iter = 0; + while (val_iter.Next()) { + auto batch = val_iter.GetDataBatch(); + ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]); + batch.label.CopyTo(&args_map["label"]); + exec->Forward(false); + NDArray::WaitAll(); + 
acu_val.Update(batch.label, exec->outputs[0]); + logloss_val.Update(batch.label, exec->outputs[0]); + LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << acu_val.Get(); + ++iter; + } + LG << "EPOCH: " << epoch << " Val Accuracy: " << acu_val.Get(); + LG << "EPOCH: " << epoch << " Val LogLoss: " << logloss_val.Get(); + + /*save the parameters*/ + std::stringstream ss; + ss << epoch; + std::string epoch_str; + ss >> epoch_str; + std::string save_path_param = "alex_param_" + epoch_str; + auto save_args = args_map; + /*we do not want to save the data and label*/ + save_args.erase(save_args.find("data")); + save_args.erase(save_args.find("label")); + /*the alexnet does not get any aux array, so we do not need to save + * aux_map*/ + LG << "EPOCH: " << epoch << " Saving to..." << save_path_param; + NDArray::Save(save_path_param, save_args); + } + /*don't foget to release the executor*/ + delete exec; + delete opt; + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp new file mode 100644 index 000000000000..524509c375af --- /dev/null +++ b/cpp-package/example/charRNN.cpp @@ -0,0 +1,758 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Hua Zhang mz24cn@hotmail.com + * The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API. + * The generated params file is compatiable with python version. + * train() and predict() has been verified with original data samples. + * 2017/1/23: + * Add faster version charRNN based on built-in cuDNN RNN operator, 10 times faster. + * Add time major computation graph, although no substantial performance difference. + * Support continuing training from last params file. + * Rename params file epoch number starts from zero. + */ + +#if _MSC_VER +#pragma warning(disable: 4996) // VS2015 complains on 'std::copy' ... +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" + +using namespace mxnet::cpp; + +struct LSTMState { + Symbol C; + Symbol h; +}; + +struct LSTMParam { + Symbol i2h_weight; + Symbol i2h_bias; + Symbol h2h_weight; + Symbol h2h_bias; +}; + +bool TIME_MAJOR = true; + +// LSTM Cell symbol +LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state, + const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) { + auto input = dropout > 0? 
Dropout(indata, dropout) : indata; + auto prefix = std::string("t") + std::to_string(seqidx) + "_l" + std::to_string(layeridx); + auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias, + num_hidden * 4); + auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias, + num_hidden * 4); + auto gates = i2h + h2h; + auto slice_gates = SliceChannel(prefix + "_slice", gates, 4); + auto in_gate = Activation(slice_gates[0], ActivationActType::kSigmoid); + auto in_transform = Activation(slice_gates[1], ActivationActType::kTanh); + auto forget_gate = Activation(slice_gates[2], ActivationActType::kSigmoid); + auto out_gate = Activation(slice_gates[3], ActivationActType::kSigmoid); + + LSTMState state; + state.C = (forget_gate * prev_state.C) + (in_gate * in_transform); + state.h = out_gate * Activation(state.C, ActivationActType::kTanh); + return state; +} + +Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim, + int num_hidden, int num_embed, mx_float dropout = 0) { + auto isTrain = sequence_length > 1; + auto data = Symbol::Variable("data"); + if (TIME_MAJOR && isTrain) + data = transpose(data); + auto embed_weight = Symbol::Variable("embed_weight"); + auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); + auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 
0 : 1, true) : embed; + + std::vector last_states; + std::vector param_cells; + for (int l = 0; l < num_lstm_layer; l++) { + std::string layer = "l" + std::to_string(l); + LSTMParam param; + param.i2h_weight = Symbol::Variable(layer + "_i2h_weight"); + param.i2h_bias = Symbol::Variable(layer + "_i2h_bias"); + param.h2h_weight = Symbol::Variable(layer + "_h2h_weight"); + param.h2h_bias = Symbol::Variable(layer + "_h2h_bias"); + param_cells.push_back(param); + LSTMState state; + state.C = Symbol::Variable(layer + "_init_c"); + state.h = Symbol::Variable(layer + "_init_h"); + last_states.push_back(state); + } + + std::vector hidden_all; + for (int i = 0; i < sequence_length; i++) { + auto hidden = wordvec[i]; + for (int layer = 0; layer < num_lstm_layer; layer++) { + double dp_ratio = layer == 0? 0 : dropout; + auto next_state = LSTM(num_hidden, hidden, last_states[layer], param_cells[layer], + i, layer, dp_ratio); + hidden = next_state.h; + last_states[layer] = next_state; + } + if (dropout > 0) + hidden = Dropout(hidden, dropout); + hidden_all.push_back(hidden); + } + + auto hidden_concat = isTrain? Concat(hidden_all, hidden_all.size(), 0) : hidden_all[0]; + auto cls_weight = Symbol::Variable("cls_weight"); + auto cls_bias = Symbol::Variable("cls_bias"); + auto pred = FullyConnected("pred", hidden_concat, cls_weight, cls_bias, input_dim); + + auto label = Symbol::Variable("softmax_label"); + label = transpose(label); + label = Reshape(label, Shape(), false, Shape(0), false); // -1: infer from graph + auto sm = SoftmaxOutput("softmax", pred, label); + if (isTrain) + return sm; + + std::vector outputs = { sm }; + for (auto& state : last_states) { + outputs.push_back(state.C); + outputs.push_back(state.h); + } + return Symbol::Group(outputs); +} + +// Currently mxnet GPU version RNN operator is implemented via *fast* NVIDIA cuDNN. 
+Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_dim, + int num_hidden, int num_embed, mx_float dropout = 0) { + auto isTrain = sequence_length > 1; + auto data = Symbol::Variable("data"); + if (TIME_MAJOR && isTrain) + data = transpose(data); + + auto embed_weight = Symbol::Variable("embed_weight"); + auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); + auto label = Symbol::Variable("softmax_label"); + label = transpose(label); + label = Reshape(label, Shape(), false, + Shape(0), false); // FullyConnected requires one dimension + if (!TIME_MAJOR && isTrain) + embed = SwapAxis(embed, 0, 1); // Change to time-major as cuDNN requires + + // We need not do the SwapAxis op as python version does. Direct and better performance in C++! + auto rnn_h_init = Symbol::Variable("LSTM_init_h"); + auto rnn_c_init = Symbol::Variable("LSTM_init_c"); + auto rnn_params = Symbol::Variable("LSTM_parameters"); // See explanations near RNNXavier class + auto variable_sequence_length = Symbol::Variable("sequence_length"); + auto rnn = RNN(embed, rnn_params, rnn_h_init, rnn_c_init, variable_sequence_length, num_hidden, + num_lstm_layer, RNNMode::kLstm, false, dropout, !isTrain); + auto hidden = Reshape(rnn[0], Shape(), false, Shape(0, num_hidden), false); + + auto cls_weight = Symbol::Variable("cls_weight"); + auto cls_bias = Symbol::Variable("cls_bias"); + auto pred = FullyConnected("pred", hidden, cls_weight, cls_bias, input_dim); + /*In rnn-time-major/rnn_cell_demo.py, the author claimed time-major version speeds up + * 1.5~2 times versus batch version. I doubts on the conclusion. In my test, the performance + * of both codes are almost same. In fact, there are no substantially differences between + * two codes. They are both based on time major cuDNN, the computation graph only differs + * slightly on the choices of where to put Reshape/SwapAxis/transpose operation. 
Here I don't + * use Reshape on pred and keep label shape on SoftmaxOutput like time major version code, + * but Reshape on label for simplification. It doesn't make influence on performacne. */ + + auto sm = SoftmaxOutput("softmax", pred, label); + if (isTrain) + return sm; + else + return Symbol::Group({ sm, rnn[1/*RNNOpOutputs::kStateOut=1*/], + rnn[2/*RNNOpOutputs::kStateCellOut=2*/] }); +} + +class Shuffler { + std::vector sequence; + public: + explicit Shuffler(int size) : sequence(size) { + int* p = sequence.data(); + for (int i = 0; i < size; i++) + *p++ = i; + } + void shuffle(std::function lambda = nullptr) { + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(sequence.begin(), sequence.end(), g); + int n = 0; + if (lambda != nullptr) + for (int i : sequence) + lambda(n++, i); + } + const int* data() { + return sequence.data(); + } +}; + +class BucketSentenceIter : public DataIter { + Shuffler* random; + int batch, current, end; + unsigned int sequence_length; + Context device; + std::vector> sequences; + std::vector index2chars; + std::unordered_map charIndices; + + public: + BucketSentenceIter(std::string filename, int minibatch, Context context) : batch(minibatch), + current(-1), device(context) { + auto content = readContent(filename); + buildCharIndex(content); + sequences = convertTextToSequences(content, '\n'); + + int N = sequences.size() / batch * batch; // total used samples + sequences.resize(N); + sort(sequences.begin(), sequences.end(), [](const std::vector& a, + const std::vector& b) { return a.size() < b.size(); }); + + sequence_length = sequences.back().size(); + random = new Shuffler(N); + // We still can get random results if call Reset() firstly +// std::vector>* target = &sequences; +// random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); }); + end = N / batch; + } + virtual ~BucketSentenceIter() { + delete random; + } + + unsigned int maxSequenceLength() { + return sequence_length; + } + + size_t 
characterSize() { + return charIndices.size(); + } + + virtual bool Next(void) { + return ++current < end; + } + virtual NDArray GetData(void) { + const int* indices = random->data(); + mx_float *data = new mx_float[sequence_length * batch], *pdata = data; + + for (int i = current * batch, end = i + batch; i < end; i++) { + memcpy(pdata, sequences[indices[i]].data(), sequences[indices[i]].size() * sizeof(mx_float)); + if (sequences[indices[i]].size() < sequence_length) + memset(pdata + sequences[indices[i]].size(), 0, + (sequence_length - sequences[indices[i]].size()) * sizeof(mx_float)); + pdata += sequence_length; + } + NDArray array(Shape(batch, sequence_length), device, false); + array.SyncCopyFromCPU(data, batch * sequence_length); + return array; + } + virtual NDArray GetLabel(void) { + const int* indices = random->data(); + mx_float *label = new mx_float[sequence_length * batch], *plabel = label; + + for (int i = current * batch, end = i + batch; i < end; i++) { + memcpy(plabel, sequences[indices[i]].data() + 1, + (sequences[indices[i]].size() - 1) * sizeof(mx_float)); + memset(plabel + sequences[indices[i]].size() - 1, 0, + (sequence_length - sequences[indices[i]].size() + 1) * sizeof(mx_float)); + plabel += sequence_length; + } + NDArray array(Shape(batch, sequence_length), device, false); + array.SyncCopyFromCPU(label, batch * sequence_length); + return array; + } + virtual int GetPadNum(void) { + return sequence_length - sequences[random->data()[current * batch]].size(); + } + virtual std::vector GetIndex(void) { + const int* indices = random->data(); + std::vector list(indices + current * batch, indices + current * batch + batch); + return list; + } + virtual void BeforeFirst(void) { + current = -1; + random->shuffle(nullptr); + } + + std::wstring readContent(const std::string file) { + std::wifstream ifs(file, std::ios::binary); + if (ifs) { + std::wostringstream os; + os << ifs.rdbuf(); + return os.str(); + } + return L""; + } + + void 
buildCharIndex(const std::wstring& content) { + // This version buildCharIndex() Compatiable with python version char_rnn dictionary + int n = 1; + charIndices['\0'] = 0; // padding character + index2chars.push_back(0); // padding character index + for (auto c : content) + if (charIndices.find(c) == charIndices.end()) { + charIndices[c] = n++; + index2chars.push_back(c); + } + } +// void buildCharIndex(wstring& content) { +// for (auto c : content) +// charIndices[c]++; // char-frequency map; then char-index map +// std::vector> characters; +// for (auto& iter : charIndices) +// characters.push_back(make_tuple(iter.first, iter.second)); +// sort(characters.begin(), characters.end(), [](const tuple& a, +// const tuple& b) { return get<1>(a) > get<1>(b); }); +// mx_float index = 1; //0 is left for zero-padding +// index2chars.clear(); +// index2chars.push_back(0); //zero-padding +// for (auto& t : characters) { +// charIndices[get<0>(t)] = index++; +// index2chars.push_back(get<0>(t)); +// }s +// } + + inline wchar_t character(int i) { + return index2chars[i]; + } + + inline mx_float index(wchar_t c) { + return charIndices[c]; + } + + void saveCharIndices(const std::string file) { + std::wofstream ofs(file, std::ios::binary); + if (ofs) { + ofs.write(index2chars.data() + 1, index2chars.size() - 1); + ofs.close(); + } + } + + static std::tuple, std::vector> loadCharIndices( + const std::string file) { + std::wifstream ifs(file, std::ios::binary); + std::unordered_map map; + std::vector chars; + if (ifs) { + std::wostringstream os; + os << ifs.rdbuf(); + int n = 1; + map[L'\0'] = 0; + chars.push_back(L'\0'); + for (auto c : os.str()) { + map[c] = (mx_float) n++; + chars.push_back(c); + } + } + // Note: Can't use {} because this would hit the explicit constructor + return std::tuple, std::vector>(map, chars); + } + + std::vector> + convertTextToSequences(const std::wstring& content, wchar_t spliter) { + std::vector> sequences; + sequences.push_back(std::vector()); + for 
(auto c : content) + if (c == spliter && !sequences.back().empty()) + sequences.push_back(std::vector()); + else + sequences.back().push_back(charIndices[c]); + return sequences; + } +}; + +void OutputPerplexity(NDArray* labels, NDArray* output) { + std::vector charIndices, a; + labels->SyncCopyToCPU(&charIndices, 0L); // 0L indicates all + output->SyncCopyToCPU(&a, 0L)/*4128*84*/; + mx_float loss = 0; + int batchSize = labels->GetShape()[0]/*32*/, sequenceLength = labels->GetShape()[1]/*129*/, + nSamples = output->GetShape()[0]/*4128*/, vocabSize = output->GetShape()[1]/*84*/; + for (int n = 0; n < nSamples; n++) { + int row = n % batchSize, column = n / batchSize, labelOffset = column + + row * sequenceLength; // Search based on column storage: labels.T + mx_float safe_value = std::max(1e-10f, a[vocabSize * n + + static_cast(charIndices[labelOffset])]); + loss += -log(safe_value); // Calculate negative log-likelihood + } + loss = exp(loss / nSamples); + std::cout << "Train-Perplexity=" << loss << std::endl; +} + +void SaveCheckpoint(const std::string filepath, Symbol net, Executor* exe) { + std::map params; + for (auto iter : exe->arg_dict()) + if (iter.first.find("_init_") == std::string::npos + && iter.first.rfind("data") != iter.first.length() - 4 + && iter.first.rfind("label") != iter.first.length() - 5) + params.insert({"arg:" + iter.first, iter.second}); + for (auto iter : exe->aux_dict()) + params.insert({"aux:" + iter.first, iter.second}); + NDArray::Save(filepath, params); +} + +void LoadCheckpoint(const std::string filepath, Executor* exe) { + std::map params = NDArray::LoadToMap(filepath); + for (auto iter : params) { + std::string type = iter.first.substr(0, 4); + std::string name = iter.first.substr(4); + NDArray target; + if (type == "arg:") + target = exe->arg_dict()[name]; + else if (type == "aux:") + target = exe->aux_dict()[name]; + else + continue; + iter.second.CopyTo(&target); + } +} + +int input_dim = 0;/*84*/ +int sequence_length_max = 
0;/*129*/ +int num_embed = 256; +int num_lstm_layer = 3; +int num_hidden = 512; +mx_float dropout = 0.2; +void train(const std::string file, int batch_size, int max_epoch, int start_epoch) { + Context device(DeviceType::kGPU, 0); + BucketSentenceIter dataIter(file, batch_size, device); + std::string prefix = file.substr(0, file.rfind(".")); + dataIter.saveCharIndices(prefix + ".dictionary"); + + input_dim = static_cast(dataIter.characterSize()); + sequence_length_max = dataIter.maxSequenceLength(); + + auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden, + num_embed, dropout); + std::map args_map; + args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + for (int i = 0; i < num_lstm_layer; i++) { + std::string key = "l" + std::to_string(i) + "_init_"; + args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false); + args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false); + } + std::vector zeros(batch_size * num_hidden, 0); + // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}}); + Executor* exe = RNN.SimpleBind(device, args_map); + + if (start_epoch == -1) { + Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : exe->arg_dict()) + xavier(arg.first, &arg.second); + } else { + LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe); + } + start_epoch++; + + mx_float learning_rate = 0.0002; + mx_float weight_decay = 0.000002; + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); +// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) +// ->SetParam("clip_gradient", 10); + + for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { + dataIter.Reset(); + auto tic = std::chrono::system_clock::now(); + while (dataIter.Next()) { + auto 
data_batch = dataIter.GetDataBatch(); + data_batch.data.CopyTo(&exe->arg_dict()["data"]); + data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); + for (int l = 0; l < num_lstm_layer; l++) { + std::string key = "l" + std::to_string(l) + "_init_"; + exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros); + exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros); + } + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } + + NDArray::WaitAll(); + } + auto toc = std::chrono::system_clock::now(); + std::cout << "Epoch[" << epoch << "] Time Cost:" << + std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds "; + OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); + std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; + SaveCheckpoint(filepath, RNN, exe); + } + + delete exe; + delete opt; +} + +/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on + * variable name, cannot initialize LSTM_parameters. Thus it was renamed to LSTM_bias, + * which can be initialized as zero. But it cannot converge after 100 epochs in this corpus + * example. Using RNNXavier, after 15 oscillating epochs, it rapidly converges like old + * LSTMUnroll version. 
*/ +class RNNXavier : public Xavier { + public: + RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg, + float magnitude = 3) : Xavier(rand_type, factor_type, magnitude) { + } + virtual ~RNNXavier() {} + protected: + virtual void InitDefault(NDArray* arr) { + Xavier::InitWeight(arr); + } +}; + +void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch, int start_epoch) { + Context device(DeviceType::kGPU, 0); + BucketSentenceIter dataIter(file, batch_size, device); + std::string prefix = file.substr(0, file.rfind(".")); + dataIter.saveCharIndices(prefix + ".dictionary"); + + input_dim = static_cast(dataIter.characterSize()); + sequence_length_max = dataIter.maxSequenceLength(); + + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden, + num_embed, dropout); + std::map args_map; + args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + // Avoiding SwapAxis, batch_size is of second dimension. 
+ args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + std::vector zeros(batch_size * num_lstm_layer * num_hidden, 0); + Executor* exe = RNN.SimpleBind(device, args_map); + + if (start_epoch == -1) { + RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : exe->arg_dict()) + xavier(arg.first, &arg.second); + } else { + LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe); + } + start_epoch++; + + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); +// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) +// ->SetParam("clip_gradient", 10); + + for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { + dataIter.Reset(); + auto tic = std::chrono::system_clock::now(); + while (dataIter.Next()) { + auto data_batch = dataIter.GetDataBatch(); + data_batch.data.CopyTo(&exe->arg_dict()["data"]); + data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); + exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros); + exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros); + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } + NDArray::WaitAll(); + } + auto toc = std::chrono::system_clock::now(); + std::cout << "Epoch[" << epoch << "] Time Cost:" << + std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds "; + OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); + std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; + SaveCheckpoint(filepath, RNN, exe); + } + + delete exe; + delete opt; +} + +void predict(std::wstring* ptext, int sequence_length, const std::string param_file, 
+ const std::string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = std::get<0>(results); + auto charIndices = std::get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + std::map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + std::vector zeros(1 * num_hidden, 0); + for (int l = 0; l < num_lstm_layer; l++) { + std::string key = "l" + std::to_string(l) + "_init_"; + args_map[key + "c"] = NDArray(Shape(1, num_hidden), device, false); + args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false); + args_map[key + "c"].SyncCopyFromCPU(zeros); + args_map[key + "h"].SyncCopyFromCPU(zeros); + } + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next = 0; + std::vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + std::string key = "l" + std::to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + std::string key = "l" + std::to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 
2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } + + delete exe; +} + +void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file, + const std::string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = std::get<0>(results); + auto charIndices = std::get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + std::map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + std::vector zeros(1 * num_lstm_layer * num_hidden, 0); + // Avoiding SwapAxis, batch_size=1 is of second dimension. + args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_c"].SyncCopyFromCPU(zeros); + args_map["LSTM_init_h"].SyncCopyFromCPU(zeros); + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next = 0; + std::vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + 
exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } + + delete exe; +} + +int main(int argc, char** argv) { + if (argc < 5) { + std::cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}" + " {batch size} {max epoch} [{starting epoch}]" << std::endl; + std::cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}" + " {dictionary file} {beginning of text}" << std::endl; + std::cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor" + " are not compatible with each other." << std::endl; + return 0; + } + + std::string task = argv[1]; + bool builtIn = task.find("BuiltIn") != std::string::npos; + TIME_MAJOR = task.find("TimeMajor") != std::string::npos; + std::cout << "use BuiltIn cuDNN RNN: " << builtIn << std::endl + << "use data as TimeMajor: " << TIME_MAJOR << std::endl; + TRY + if (task.find("train") == 0) { + std::cout << "train batch size: " << argv[3] << std::endl + << "train max epoch: " << argv[4] << std::endl; + int start_epoch = argc > 5? atoi(argv[5]) : -1; + // this function will generate dictionary file and params file. + if (builtIn) + trainWithBuiltInRNNOp(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); + else + train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); // ditto + } else if (task.find("predict") == 0) { + std::wstring text; // = L"If there is anyone out there who still doubts "; + // Considering of extending to Chinese samples in future, use wchar_t instead of char + for (char c : std::string(argv[4])) + text.push_back((wchar_t) c); + /*Python version predicts text default to random selecltions. Here I didn't write the random + code, always choose the 'best' character. 
So the text length reduced to 600. Longer size often + leads to repeated sentances, since training sequence length is only 129 for obama corpus.*/ + if (builtIn) + predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]); + else + predict(&text, 600, argv[2], argv[3]); + std::wcout << text << std::endl; + } + + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/example.mk b/cpp-package/example/example.mk new file mode 100644 index 000000000000..ef99d7426414 --- /dev/null +++ b/cpp-package/example/example.mk @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +CPPEX_SRC = $(wildcard cpp-package/example/*.cpp) +CPPEX_EXE = $(patsubst cpp-package/example/%.cpp, build/cpp-package/example/%, $(CPPEX_SRC)) + +CPPEX_CFLAGS += -Icpp-package/include +CPPEX_EXTRA_LDFLAGS := -L$(ROOTDIR)/lib -lmxnet + +EXTRA_PACKAGES += cpp-package-example-all +EXTRA_PACKAGES_CLEAN += cpp-package-example-clean + +.PHONY: cpp-package-example-all cpp-package-example-clean + +cpp-package-example-all: cpp-package-all $(CPPEX_EXE) + +build/cpp-package/example/% : cpp-package/example/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) + @mkdir -p $(@D) + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/$* $< >build/cpp-package/example//$*.d + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + +cpp-package-example-clean: + rm -rf build/cpp-package/example/* + +-include build/cpp-package/example/*.d diff --git a/cpp-package/example/feature_extract/Makefile b/cpp-package/example/feature_extract/Makefile new file mode 100644 index 000000000000..193eaa7e850b --- /dev/null +++ b/cpp-package/example/feature_extract/Makefile @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +CXX=g++ +BLAS=-L /opt/openblas/lib -lopenblas -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +CUDA=-DMSHADOW_USE_CUDA=1 +OPENCV_CFLAGS=`pkg-config --cflags opencv` +OPENCV_LDFLAGS=`pkg-config --libs opencv` + +CFLAGS=$(COMMFLAGS) -I../../../3rdparty/nnvm/include -I../../../3rdparty/dmlc-core/include -I ../../include -I ../../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -fopenmp +LDFLAGS=$(COMMFLAGS) -L ../../../lib -lmxnet $(BLAS) $(CUDA) -lgomp -pthread + +all: feature_extract prepare_data_with_opencv + +feature_extract: ./feature_extract.cpp + $(CXX) -c -std=c++11 $(CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(LDFLAGS) + -rm -f $(basename $@).o + +prepare_data_with_opencv: ./prepare_data_with_opencv.cpp + $(CXX) -c -std=c++11 $(OPENCV_CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(OPENCV_LDFLAGS) + -rm -f $(basename $@).o + +clean: + -rm -f feature_extract + -rm -f prepare_data_with_opencv diff --git a/cpp-package/example/feature_extract/README.md b/cpp-package/example/feature_extract/README.md new file mode 100644 index 000000000000..0b94bef7705f --- /dev/null +++ b/cpp-package/example/feature_extract/README.md @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + +This example shows how to extract features with a pretrained model. + +Execute `run.sh` to: +- Download a pretrained model +- Download sample pictures (`dog.jpg` and `cat.jpg`) +- Compile the files +- Execute the featurization on `dog.jpg` and `cat.jpg` + + +Note: +1. The filename of network parameters may vary, line 67 in `feature_extract.cpp` should be updated accordingly. +2. 
You need to build MXNet from source to get access to the `lib/libmxnet.so` or point `LD_LIBRARY_PATH` to where it is installed in your system diff --git a/cpp-package/example/feature_extract/feature_extract.cpp b/cpp-package/example/feature_extract/feature_extract.cpp new file mode 100644 index 000000000000..d614fd576238 --- /dev/null +++ b/cpp-package/example/feature_extract/feature_extract.cpp @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +using namespace std; +using namespace mxnet::cpp; + +/* + * This example shows how to extract features with a pretrained model. 
+ * Get the model here: + * https://github.com/dmlc/mxnet-model-gallery + * */ + +/*The global context, change them if necessary*/ +Context global_ctx(kGPU, 0); +// Context global_ctx(kCPU,0); + +class FeatureExtractor { + private: + /*the mean image, get from the pretrained model*/ + NDArray mean_img; + /*the following two maps store all the paramters need by the model*/ + map args_map; + map aux_map; + Symbol net; + Executor *executor; + /*Get the feature layer we want to extract*/ + void GetFeatureSymbol() { + /* + * use the following to check all the layers' names: + * */ + /* + net=Symbol::Load("./model/Inception_BN-symbol.json").GetInternals(); + for(const auto & layer_name:net.ListOutputs()){ + LG< paramters; + NDArray::Load("./model/Inception-BN-0126.params", 0, ¶mters); + for (const auto &k : paramters) { + if (k.first.substr(0, 4) == "aux:") { + auto name = k.first.substr(4, k.first.size() - 4); + aux_map[name] = k.second.Copy(global_ctx); + } + if (k.first.substr(0, 4) == "arg:") { + auto name = k.first.substr(4, k.first.size() - 4); + args_map[name] = k.second.Copy(global_ctx); + } + } + /*WaitAll is need when we copy data between GPU and the main memory*/ + NDArray::WaitAll(); + } + void GetMeanImg() { + mean_img = NDArray(Shape(1, 3, 224, 224), global_ctx, false); + mean_img.SyncCopyFromCPU( + NDArray::LoadToMap("./model/mean_224.nd")["mean_img"].GetData(), + 1 * 3 * 224 * 224); + NDArray::WaitAll(); + } + + public: + FeatureExtractor() { + /*prepare the model, fill the pretrained parameters, get the mean image*/ + GetFeatureSymbol(); + LoadParameters(); + GetMeanImg(); + } + + void Extract(NDArray data) { + /*Normalize the pictures*/ + data.Slice(0, 1) -= mean_img; + data.Slice(1, 2) -= mean_img; + args_map["data"] = data; + /*bind the executor*/ + executor = net.SimpleBind(global_ctx, args_map, map(), + map(), aux_map); + executor->Forward(false); + /*print out the features*/ + auto array = executor->outputs[0].Copy(Context(kCPU, 0)); + 
NDArray::WaitAll(); + array = array.Reshape({2, 1024}); + for (int i = 0; i < 1024; ++i) { + cout << array.At(0, i) << ","; + } + cout << endl; + } +}; + +NDArray Data2NDArray() { + NDArray ret(Shape(2, 3, 224, 224), global_ctx, false); + ifstream inf("./img.dat", ios::binary); + vector data(2 * 3 * 224 * 224); + inf.read(reinterpret_cast(data.data()), 2 * 3 * 224 * 224 * sizeof(float)); + inf.close(); + ret.SyncCopyFromCPU(data.data(), 2 * 3 * 224 * 224); + NDArray::WaitAll(); + return ret; +} + +int main() { + /* + * get the data from a binary file ./img.data + * this file is generated by ./prepare_data_with_opencv + * it stores 2 pictures in NDArray format + * + */ + auto data = Data2NDArray(); + FeatureExtractor fe; + fe.Extract(data); + return 0; +} diff --git a/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp new file mode 100644 index 000000000000..fe32e896adb1 --- /dev/null +++ b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ */ +#include +#include +#include +#include +#include + +using namespace std; + +/*read images and store them the NDArray format that MXNet.cpp can handle*/ +void Mat2Array() { + string file_name_list[] = {"./dog.jpg", "./cat.jpg"}; + + std::vector array; + for (auto &t : file_name_list) { + cv::Mat mat = cv::imread(t); + /*resize pictures to (224, 224) according to the pretrained model*/ + cv::resize(mat, mat, cv::Size(224, 224)); + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < 224; ++i) { + for (int j = 0; j < 224; ++j) { + array.push_back(static_cast(mat.data[(i * 224 + j) * 3 + c])); + } + } + } + } + ofstream outf("./img.dat", ios::binary); + outf.write(reinterpret_cast(array.data()), array.size() * sizeof(float)); + outf.close(); +} + +int main(int argc, char *argv[]) { + Mat2Array(); + return 0; +} diff --git a/cpp-package/example/feature_extract/run.sh b/cpp-package/example/feature_extract/run.sh new file mode 100755 index 000000000000..b98ddb9eb81e --- /dev/null +++ b/cpp-package/example/feature_extract/run.sh @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Downloading the data and model +mkdir -p model +wget -nc -O model/Inception-BN-symbol.json \ + http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-symbol.json +wget -nc -O model/synset.txt \ + http://data.mxnet.io/mxnet/models/imagenet/synset.txt +wget -nc -O model/Inception-BN-0126.params \ + http://data.mxnet.io/mxnet/models/imagenet/inception-bn/Inception-BN-0126.params?raw=true +wget -nc -O cat.jpg https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true +wget -nc -O dog.jpg https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true +wget -nc -O model/mean_224.nd https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd +tar -xvzf inception-bn.tar.gz -C model --skip-old-files + +# Building +make + +# Preparing the data +./prepare_data_with_opencv + +# Running the featurization +LD_LIBRARY_PATH=../../../lib ./feature_extract diff --git a/cpp-package/example/get_data.sh b/cpp-package/example/get_data.sh new file mode 100755 index 000000000000..fda69ce2f087 --- /dev/null +++ b/cpp-package/example/get_data.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e + +mkdir -p data/mnist_data +cd data/mnist_data + +download () { + local URL=$1 + local GZ_FILE_NAME="${URL##*/}" + + local FILE_NAME="${GZ_FILE_NAME%.*}" + if [[ -f "${FILE_NAME}" ]]; then + echo "File ${FILE_NAME} already downloaded." + return 0 + fi + + echo "Downloading ${URL} ..." + local CURL_OPTIONS="--connect-timeout 10 \ + --max-time 300 \ + --retry-delay 10 \ + --retry 3 \ + --retry-delay 0 \ + --location \ + --silent" + curl ${CURL_OPTIONS} ${URL} -o ${GZ_FILE_NAME} + + if [[ ! -f "${GZ_FILE_NAME}" ]]; then + echo "File ${URL} couldn't be downloaded!" + exit 1 + fi + + gzip -d ${GZ_FILE_NAME} + (($? != 0)) && exit 1 || return 0 +} + +# MNIST dataset from: http://yann.lecun.com/exdb/mnist/ +FILES=( + "https://web.archive.org/web/20160828233817/http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz" + "https://web.archive.org/web/20160828233817/http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz" + "https://web.archive.org/web/20160828233817/http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz" + "https://web.archive.org/web/20160828233817/http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz" + "http://data.mxnet.io/data/mnist_train.csv.gz") + +for FILE in ${FILES[@]}; do + download ${FILE} +done diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp new file mode 100644 index 000000000000..7b51f4fde3a7 --- /dev/null +++ b/cpp-package/example/googlenet.cpp @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ +#include +#include +#include +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +Symbol ConvFactory(Symbol data, int num_filter, + Shape kernel, + Shape stride = Shape(1, 1), + Shape pad = Shape(0, 0), + const std::string & name = "", + const std::string & suffix = "") { + Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b"); + + Symbol conv = Convolution("conv_" + name + suffix, data, + conv_w, conv_b, kernel, + num_filter, stride, Shape(1, 1), pad); + return Activation("relu_" + name + suffix, conv, "relu"); +} + +Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red, + int num_3x3, int num_d5x5red, int num_d5x5, + PoolingPoolType pool, int proj, const std::string & name) { + Symbol c1x1 = ConvFactory(data, num_1x1, Shape(1, 1), + Shape(1, 1), Shape(0, 0), name + "_1x1"); + + Symbol c3x3r = ConvFactory(data, num_3x3red, Shape(1, 1), + Shape(1, 1), Shape(0, 0), name + "_3x3", "_reduce"); + + Symbol c3x3 = ConvFactory(c3x3r, num_3x3, Shape(3, 3), + Shape(1, 1), Shape(1, 1), name + "_3x3"); + + Symbol cd5x5r = ConvFactory(data, num_d5x5red, Shape(1, 1), + Shape(1, 1), Shape(0, 0), name + "_5x5", "_reduce"); + + Symbol cd5x5 = ConvFactory(cd5x5r, num_d5x5, Shape(5, 5), + Shape(1, 1), Shape(2, 2), name + "_5x5"); + + Symbol pooling = Pooling(name + "_pool", data, Shape(3, 3), pool, + false, false, PoolingPoolingConvention::kValid, + Shape(1, 1), Shape(1, 1)); + + Symbol cproj = ConvFactory(pooling, proj, Shape(1, 1), + Shape(1, 1), Shape(0, 0), 
name + "_proj"); + + std::vector lst; + lst.push_back(c1x1); + lst.push_back(c3x3); + lst.push_back(cd5x5); + lst.push_back(cproj); + return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); +} + +Symbol GoogleNetSymbol(int num_classes) { + // data and label + Symbol data = Symbol::Variable("data"); + Symbol data_label = Symbol::Variable("data_label"); + + Symbol conv1 = ConvFactory(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); + Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + Symbol conv2 = ConvFactory(pool1, 64, Shape(1, 1), Shape(1, 1), + Shape(0, 0), "conv2"); + Symbol conv3 = ConvFactory(conv2, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv3"); + Symbol pool3 = Pooling("pool3", conv3, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + Symbol in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, PoolingPoolType::kMax, 32, "in3a"); + Symbol in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, PoolingPoolType::kMax, 64, "in3b"); + Symbol pool4 = Pooling("pool4", in3b, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + Symbol in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, PoolingPoolType::kMax, 64, "in4a"); + Symbol in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, PoolingPoolType::kMax, 64, "in4b"); + Symbol in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, PoolingPoolType::kMax, 64, "in4c"); + Symbol in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, PoolingPoolType::kMax, 64, "in4d"); + Symbol in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in4e"); + Symbol pool5 = Pooling("pool5", in4e, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + Symbol in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in5a"); + Symbol 
in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, PoolingPoolType::kMax, 128, "in5b"); + Symbol pool6 = Pooling("pool6", in5b, Shape(7, 7), PoolingPoolType::kAvg, + false, false, PoolingPoolingConvention::kValid, Shape(1, 1)); + + Symbol flatten = Flatten("flatten", pool6); + + Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); + Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes); + + return SoftmaxOutput("softmax", fc1, data_label); +} + +int main(int argc, char const *argv[]) { + int batch_size = 50; + int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + auto ctx = Context::gpu(); +#if MXNET_USE_CPU + ctx = Context::cpu();; +#endif + + TRY + auto googlenet = GoogleNetSymbol(10); + std::map args_map; + std::map aux_map; + + args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); + args_map["data_label"] = NDArray(Shape(batch_size), ctx); + googlenet.InferArgsMap(ctx, &args_map, args_map); + + std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", + "./data/mnist_data/train-labels-idx1-ubyte", + "./data/mnist_data/t10k-images-idx3-ubyte", + "./data/mnist_data/t10k-labels-idx1-ubyte" + }; + + auto train_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } + + auto val_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } + + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + + auto *exec = googlenet.SimpleBind(ctx, args_map); + auto arg_names = googlenet.ListArguments(); + + for (int iter = 0; iter < max_epoch; ++iter) { + LG << "Epoch: " << iter; + train_iter.Reset(); + while (train_iter.Next()) { + auto data_batch = train_iter.GetDataBatch(); + 
data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + exec->Forward(true); + exec->Backward(); + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + } + + Accuracy acu; + val_iter.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + exec->Forward(false); + NDArray::WaitAll(); + acu.Update(data_batch.label, exec->outputs[0]); + } + LG << "Accuracy: " << acu.Get(); + } + + delete exec; + delete opt; + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp new file mode 100644 index 000000000000..8fe6b070497c --- /dev/null +++ b/cpp-package/example/inception_bn.cpp @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ */ +#include +#include +#include +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +Symbol ConvFactoryBN(Symbol data, int num_filter, + Shape kernel, Shape stride, Shape pad, + const std::string & name, + const std::string & suffix = "") { + Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b"); + + Symbol conv = Convolution("conv_" + name + suffix, data, + conv_w, conv_b, kernel, + num_filter, stride, Shape(1, 1), pad); + std::string name_suffix = name + suffix; + Symbol gamma(name_suffix + "_gamma"); + Symbol beta(name_suffix + "_beta"); + Symbol mmean(name_suffix + "_mmean"); + Symbol mvar(name_suffix + "_mvar"); + Symbol bn = BatchNorm("bn_" + name + suffix, conv, gamma, beta, mmean, mvar); + return Activation("relu_" + name + suffix, bn, "relu"); +} + +Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red, + int num_3x3, int num_d3x3red, int num_d3x3, + PoolingPoolType pool, int proj, + const std::string & name) { + Symbol c1x1 = ConvFactoryBN(data, num_1x1, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "1x1"); + Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_3x3r"); + Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_3x3"); + Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_double_3x3", "_reduce"); + Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_double_3x3_0"); + cd3x3 = ConvFactoryBN(data = cd3x3, num_d3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_double_3x3_1"); + Symbol pooling = Pooling(name + "_pool", data, + Shape(3, 3), pool, false, false, + PoolingPoolingConvention::kValid, + Shape(1, 1), Shape(1, 1)); + Symbol cproj = ConvFactoryBN(pooling, proj, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_proj"); + std::vector lst; + lst.push_back(c1x1); + 
lst.push_back(c3x3); + lst.push_back(cd3x3); + lst.push_back(cproj); + return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); +} + +Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3, + int num_d3x3red, int num_d3x3, const std::string & name) { + Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), + Shape(1, 1), Shape(0, 0), + name + "_3x3", "_reduce"); + Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(2, 2), + Shape(1, 1), name + "_3x3"); + Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_double_3x3", "_reduce"); + Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_double_3x3_0"); + cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(2, 2), + Shape(1, 1), name + "_double_3x3_1"); + Symbol pooling = Pooling("max_pool_" + name + "_pool", data, + Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, + Shape(2, 2), Shape(1, 1)); + std::vector lst; + lst.push_back(c3x3); + lst.push_back(cd3x3); + lst.push_back(pooling); + return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); +} + +Symbol InceptionSymbol(int num_classes) { + // data and label + Symbol data = Symbol::Variable("data"); + Symbol data_label = Symbol::Variable("data_label"); + + // stage 1 + Symbol conv1 = ConvFactoryBN(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); + Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + // stage 2 + Symbol conv2red = ConvFactoryBN(pool1, 64, Shape(1, 1), Shape(1, 1), Shape(0, 0), "conv2red"); + Symbol conv2 = ConvFactoryBN(conv2red, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv2"); + Symbol pool2 = Pooling("pool2", conv2, Shape(3, 3), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + // stage 3 + Symbol in3a = InceptionFactoryA(pool2, 
64, 64, 64, 64, 96, PoolingPoolType::kAvg, 32, "3a"); + Symbol in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, PoolingPoolType::kAvg, 64, "3b"); + Symbol in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c"); + + // stage 4 + Symbol in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, PoolingPoolType::kAvg, 128, "4a"); + Symbol in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, PoolingPoolType::kAvg, 128, "4b"); + Symbol in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, PoolingPoolType::kAvg, 128, "4c"); + Symbol in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, PoolingPoolType::kAvg, 128, "4d"); + Symbol in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e"); + + // stage 5 + Symbol in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, PoolingPoolType::kAvg, 128, "5a"); + Symbol in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, PoolingPoolType::kMax, 128, "5b"); + + // average pooling + Symbol avg = Pooling("global_pool", in5b, Shape(7, 7), PoolingPoolType::kAvg); + + // classifier + Symbol flatten = Flatten("flatten", avg); + Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); + Symbol fc1 = FullyConnected("fc1", flatten, conv1_w, conv1_b, num_classes); + return SoftmaxOutput("softmax", fc1, data_label); +} + +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray pic_1channel; + Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(pic_1channel); + NDArray output; + Operator("tile") + .SetParam("reps", Shape(1, 3, 1, 1)) + (pic_1channel).Invoke(output); + return output; +} + +int main(int argc, char const *argv[]) { + int batch_size = 40; + int max_epoch = argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100; + float learning_rate = 1e-2; + float weight_decay = 1e-4; + + /*context*/ + auto ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); +#if !MXNET_USE_CPU + if (num_gpu > 0) { + ctx = Context::gpu(); + } +#endif + + TRY + auto inception_bn_net = InceptionSymbol(10); + std::map args_map; + std::map aux_map; + + const Shape data_shape = Shape(batch_size, 3, 224, 224), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, ctx); + args_map["data_label"] = NDArray(label_shape, ctx); + inception_bn_net.InferArgsMap(ctx, &args_map, args_map); + + std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", + "./data/mnist_data/train-labels-idx1-ubyte", + "./data/mnist_data/t10k-images-idx3-ubyte", + "./data/mnist_data/t10k-labels-idx1-ubyte" + }; + + auto train_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } + + auto val_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } + + // initialize parameters + Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2); + for (auto &arg : args_map) { + xavier(arg.first, &arg.second); + } + + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + auto *exec = inception_bn_net.SimpleBind(ctx, args_map); + auto arg_names = inception_bn_net.ListArguments(); + + // Create metrics + Accuracy train_acc, val_acc; + for (int iter = 0; iter < max_epoch; ++iter) { + LG << "Epoch: " << iter; + train_iter.Reset(); + train_acc.Reset(); + while (train_iter.Next()) { + auto data_batch = train_iter.GetDataBatch(); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + + 
exec->Forward(true); + exec->Backward(); + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + + NDArray::WaitAll(); + train_acc.Update(data_batch.label, exec->outputs[0]); + } + + val_iter.Reset(); + val_acc.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + exec->Forward(false); + NDArray::WaitAll(); + val_acc.Update(data_batch.label, exec->outputs[0]); + } + LG << "Train Accuracy: " << train_acc.Get(); + LG << "Validation Accuracy: " << val_acc.Get(); + } + delete exec; + delete opt; + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/inference/CMakeLists.txt b/cpp-package/example/inference/CMakeLists.txt new file mode 100644 index 000000000000..0566d28a57df --- /dev/null +++ b/cpp-package/example/inference/CMakeLists.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Explicitly set GENERATED property https://gitlab.kitware.com/cmake/cmake/issues/18399 +set_property(SOURCE ${CMAKE_CURRENT_LIST_DIR}/../../include/mxnet-cpp/op.h PROPERTY GENERATED 1) + +add_executable(imagenet_inference "imagenet_inference.cpp") +target_link_libraries(imagenet_inference mxnet_cpp) diff --git a/cpp-package/example/inference/Makefile b/cpp-package/example/inference/Makefile new file mode 100644 index 000000000000..5efe6cfb68e5 --- /dev/null +++ b/cpp-package/example/inference/Makefile @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +CPPEX_SRC = $(wildcard *.cpp) +CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC)) +OPENCV_CFLAGS=`pkg-config --cflags opencv` +OPENCV_LDFLAGS=`pkg-config --libs opencv` + +CXX=g++ + + +CFLAGS=$(COMMFLAGS) -I../../../3rdparty/tvm/nnvm/include -I../../../3rdparty/dmlc-core/include -I ../../include -I ../../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas +CPPEX_EXTRA_LDFLAGS := -L../../../lib -lmxnet $(OPENCV_LDFLAGS) + +all: $(CPPEX_EXE) + +debug: CPPEX_CFLAGS += -DDEBUG -g +debug: all + + +$(CPPEX_EXE):% : %.cpp + $(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS) + +clean: + rm -f $(CPPEX_EXE) diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md new file mode 100644 index 000000000000..90047e5fe14f --- /dev/null +++ b/cpp-package/example/inference/README.md @@ -0,0 +1,213 @@ + + + + + + + + + + + + + + + + + +# MXNet C++ Package Inference Workflow Examples + +## Building C++ Inference examples + +The examples in this folder demonstrate the **inference** workflow. Please build the MXNet C++ Package as explained in the [README]() File before building these examples. +To build examples use following commands: + +- Release: **make all** +- Debug: **make debug all** + + +## Examples demonstrating inference workflow + +This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. + +## [imagenet_inference.cpp]() + +This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. Now this script also supports inference with quantized CNN models generated by Intel® MKL-DNN (see this [quantization flow](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md)). 
By using C++ API, the latency of most models will be reduced to some extent compared with current Python implementation. + +Most of CNN models have been tested on Linux systems. And 50000 images are used to collect accuracy numbers. Please refer to this [README](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md) for more details about accuracy. + +The following performance numbers are collected via using C++ inference API on AWS EC2 C5.12xlarge. The environment variables are set like below: + +``` +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=$(vCPUs/2) +export MXNET_ENGINE_TYPE=NaiveEngine +``` +Also users are recommended to use ```numactl``` or ```taskset``` to bind a running process to the specified cores. + +| Model | Dataset |BS=1 (imgs/sec) |BS=64 (imgs/sec) | +|:---|:---|:---:|:---:| +| | |FP32 / INT8 | FP32 / INT8 | +| ResNet18-V1 | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |369.00 / 778.82|799.7 / 2598.04| +| ResNet50-V1 | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |160.72 / 405.84|349.73 / 1297.65 | +| ResNet101-V1 | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 89.56 / 197.55| 193.25 / 740.47| +|Squeezenet 1.0|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 294.46 / 899.28| 857.70 / 3065.13| +|MobileNet 1.0|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |554.94 / 676.59|1279.44 / 3393.43| +|MobileNetV2 1.0|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |303.40 / 776.40|994.25 / 4227.77| +|Inception V3|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |108.20 / 219.20 | 232.22 / 870.09 | +|ResNet152-V2|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |52.28 / 64.62|107.03 / 134.04 | +|Inception-BN|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 211.86 / 306.37| 632.79 / 2115.28| + +The command line to launch inference by 
this script can accept are as shown below: +``` +./imagenet_inference --help +Usage: +imagenet_inference --symbol_file + --params_file + --dataset + --data_nthreads + --input_shape ] + --rgb_mean + --rgb_std + --batch_size + --num_skipped_batches + --num_inference_batches + --data_layer_type + --gpu + --enableTRT " + --benchmark +``` + +Follow the below steps to do inference with more models. + +- Download the pre-trained FP32 models into ```./model``` directory. +- Refer this [README](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md) to generate the corresponding quantized models and also put them into ```./model``` directory. +- Prepare [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) and put it into ```./data``` directory. + +The below command lines show how to run inference with FP32/INT8 resnet50_v1 model. Because the C++ inference script provides the almost same command line as this [Python script](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/imagenet_inference.py) and then users can easily go from Python to C++. 
+``` + +# FP32 inference +./imagenet_inference --symbol_file "./model/resnet50_v1-symbol.json" --params_file "./model/resnet50_v1-0000.params" --dataset "./data/val_256_q90.rec" --rgb_mean "123.68 116.779 103.939" --rgb_std "58.393 57.12 57.375" --batch_size 64 --num_skipped_batches 50 --num_inference_batches 500 + +# INT8 inference +./imagenet_inference --symbol_file "./model/resnet50_v1-quantized-5batches-naive-symbol.json" --params_file "./model/resnet50_v1-quantized-0000.params" --dataset "./data/val_256_q90.rec" --rgb_mean "123.68 116.779 103.939" --rgb_std "58.393 57.12 57.375" --batch_size 64 --num_skipped_batches 50 --num_inference_batches 500 + +# FP32 dummy data +./imagenet_inference --symbol_file "./model/resnet50_v1-symbol.json" --batch_size 64 --num_inference_batches 500 --benchmark + +# INT8 dummy data +./imagenet_inference --symbol_file "./model/resnet50_v1-quantized-5batches-naive-symbol.json" --batch_size 64 --num_inference_batches 500 --benchmark + +``` +For a quick inference test, users can directly run [unit_test_imagenet_inference.sh]() by using the below command. This script will automatically download the pre-trained **Inception-Bn** and **resnet50_v1_int8** model and **validation dataset** which are required for inference. 
+ +``` +./unit_test_imagenet_inference.sh +``` +And you may get output similar to the one below: +``` +>>> INFO: FP32 real data +imagenet_inference.cpp:282: Loading the model from ./model/Inception-BN-symbol.json +imagenet_inference.cpp:295: Loading the model parameters from ./model/Inception-BN-0126.params +imagenet_inference.cpp:443: INFO:Dataset for inference: ./data/val_256_q90.rec +imagenet_inference.cpp:444: INFO:label_name = softmax_label +imagenet_inference.cpp:445: INFO:rgb_mean: (123.68, 116.779, 103.939) +imagenet_inference.cpp:447: INFO:rgb_std: (1, 1, 1) +imagenet_inference.cpp:449: INFO:Image shape: (3, 224, 224) +imagenet_inference.cpp:451: INFO:Finished inference with: 500 images +imagenet_inference.cpp:453: INFO:Batch size = 1 for inference +imagenet_inference.cpp:454: INFO:Accuracy: 0.744 +imagenet_inference.cpp:455: INFO:Throughput: xxxx images per second + +>>> INFO: FP32 dummy data +imagenet_inference.cpp:282: Loading the model from ./model/Inception-BN-symbol.json +imagenet_inference.cpp:372: Running the forward pass on model to evaluate the performance.. +imagenet_inference.cpp:387: benchmark completed! +imagenet_inference.cpp:388: batch size: 1 num batch: 500 throughput: xxxx imgs/s latency:xxxx ms + +>>> INFO: INT8 dummy data +imagenet_inference.cpp:282: Loading the model from ./model/resnet50_v1_int8-symbol.json +imagenet_inference.cpp:372: Running the forward pass on model to evaluate the performance.. +imagenet_inference.cpp:387: benchmark completed! 
+imagenet_inference.cpp:388: batch size: 1 num batch: 500 throughput: xxxx imgs/s latency:xxxx ms +``` +For running this example with TensorRT, you can quickly try the following example to run a benchmark test for testing Inception BN: +``` +./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --params_file "./model/Inception-BN-0126.params" --batch_size 16 --num_inference_batches 500 --benchmark --enableTRT +``` +Sample output will look like this (the example is running on an AWS P3.2xl machine): +``` +imagenet_inference.cpp:302: Loading the model from ./model/Inception-BN-symbol.json +build_subgraph.cc:686: start to execute partition graph. +imagenet_inference.cpp:317: Loading the model parameters from ./model/Inception-BN-0126.params +imagenet_inference.cpp:424: Running the forward pass on model to evaluate the performance.. +imagenet_inference.cpp:439: benchmark completed! +imagenet_inference.cpp:440: batch size: 16 num batch: 500 throughput: 6284.78 imgs/s latency:0.159115 ms +``` + +## [sentiment_analysis_rnn.cpp]() +This example demonstrates how you can load a pre-trained RNN model and use it to predict the sentiment expressed in the given movie review with the MXNet C++ API. The example is capable of processing variable length inputs. It performs the following tasks: +- Loads the pre-trained RNN model. +- Loads the dictionary file containing the word to index mapping. +- Splits the review in multiple lines separated by "." +- The example predicts the sentiment score for individual lines and outputs the average score. + +The example is capable of processing variable length input by implementing the following technique: +- The example creates executors for pre-determined input lengths such as 5, 10, 15, 20, 25, etc. called **buckets**. +- Each bucket is identified by **bucket-key** representing the length of input required by the corresponding executor. 
+- For each line in the review, the example finds the number of words in the line and tries to find the closest bucket or executor. +- If the bucket key does not match the number of words in the line, the example pads or trims the input line to match the required length. + +The example uses a pre-trained RNN model trained with an IMDB dataset. The RNN model was built by exercising the [GluonNLP Sentiment Analysis Tutorial](). The tutorial uses 'standard_lstm_lm_200' available in Gluon Model Zoo and fine-tunes it for the IMDB dataset. +The model consists of: +- Embedding Layer +- 2 LSTM Layers with hidden dimension size of 200 +- Average pooling layer +- Sigmoid output layer +The model was trained for 10 epochs to achieve 85% test accuracy. +The visual representation of the model is [here](). + +The model files can be found here. +- [sentiment_analysis-symbol.json](< https://s3.amazonaws.com/mxnet-cpp/RNN_model/sentiment_analysis-symbol.json>) +- [sentiment_analysis-0010.params](< https://s3.amazonaws.com/mxnet-cpp/RNN_model/sentiment_analysis-0010.params>) +- [sentiment_token_to_idx.txt]() Each line of the dictionary file contains a word and a unique index for that word, separated by a space, with a total of 32787 words generated from the training dataset. +The example downloads the above files while running. + +The example's command line parameters are as shown below: + +``` +./sentiment_analysis_rnn --help +Usage: +sentiment_analysis_rnn +--input Input movie review. The review can be single line or multiline.e.g. "This movie is the best." OR "This movie is the best. The direction is awesome." +[--gpu] Specify this option if workflow needs to be run in gpu context +If the review is multiline, the example predicts sentiment score for each line and the final score is the average of scores obtained for each line. + +``` + +The following command line shows running the example with the movie review containing only one line. 
+ +``` +./sentiment_analysis_rnn --input "This movie has the great story" +``` + +The above command will output the sentiment score as follows: +``` +sentiment_analysis_rnn.cpp:346: Input Line : [This movie has the great story] Score : 0.999898 +sentiment_analysis_rnn.cpp:449: The sentiment score between 0 and 1, (1 being positive)=0.999898 +``` + +The following command line shows invoking the example with the multi-line review. + +``` +./sentiment_analysis_rnn --input "This movie is the best. The direction is awesome." +``` +The above command will output the sentiment score for each line in the review and average score as follows: +``` +Input Line : [This movie is the best] Score : 0.964498 +Input Line : [ The direction is awesome] Score : 0.968855 +The sentiment score between 0 and 1, (1 being positive)=0.966677 +``` + +Alternatively, you can run the [unit_test_sentiment_analysis_rnn.sh]() script. diff --git a/cpp-package/example/inference/imagenet_inference.cpp b/cpp-package/example/inference/imagenet_inference.cpp new file mode 100644 index 000000000000..845a227fe93d --- /dev/null +++ b/cpp-package/example/inference/imagenet_inference.cpp @@ -0,0 +1,662 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. + * The example performs following tasks. + * 1. Load the pre-trained model. + * 2. Load the parameters of pre-trained model. + * 3. Load the inference dataset and create a new ImageRecordIter. + * 4. Run the forward pass and obtain throughput & accuracy. + */ +#ifndef _WIN32 +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mxnet/c_api.h" +#include "mxnet/tuple.h" +#include "mxnet-cpp/MxNetCpp.h" +#include "mxnet-cpp/initializer.h" + +using namespace mxnet::cpp; + +double ms_now() { + double ret; +#ifdef _WIN32 + auto timePoint = std::chrono::high_resolution_clock::now().time_since_epoch(); + ret = std::chrono::duration(timePoint).count(); +#else + struct timeval time; + gettimeofday(&time, nullptr); + ret = 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; +#endif + return ret; +} + + +// define the data type for NDArray, aliged with the definition in mshadow/base.h +enum TypeFlag { + kFloat32 = 0, + kFloat64 = 1, + kFloat16 = 2, + kUint8 = 3, + kInt32 = 4, + kInt8 = 5, + kInt64 = 6, +}; + +/* + * class Predictor + * + * This class encapsulates the functionality to load the model, prepare dataset and run the forward pass. 
+ */ + +class Predictor { + public: + Predictor() {} + Predictor(const std::string& model_json_file, + const std::string& model_params_file, + const Shape& input_shape, + bool use_gpu, + bool enable_tensorrt, + const std::string& dataset, + const int data_nthreads, + const std::string& data_layer_type, + const std::vector& rgb_mean, + const std::vector& rgb_std, + int shuffle_chunk_seed, + int seed, bool benchmark); + void BenchmarkScore(int num_inference_batches); + void Score(int num_skipped_batches, int num_inference_batches); + ~Predictor(); + + private: + bool CreateImageRecordIter(); + bool AdvanceDataIter(int skipped_batches); + void LoadModel(const std::string& model_json_file); + void LoadParameters(const std::string& model_parameters_file); + void SplitParamMap(const std::map ¶mMap, + std::map *argParamInTargetContext, + std::map *auxParamInTargetContext, + Context targetContext); + void ConvertParamMapToTargetContext(const std::map ¶mMap, + std::map *paramMapInTargetContext, + Context targetContext); + void InitParameters(); + + inline bool FileExists(const std::string &name) { + std::ifstream fhandle(name.c_str()); + return fhandle.good(); + } + int GetDataLayerType(); + + std::map args_map_; + std::map aux_map_; + Symbol net_; + Executor *executor_; + Shape input_shape_; + Context global_ctx_ = Context::cpu(); + + MXDataIter *val_iter_; + bool use_gpu_; + bool enable_tensorrt_; + std::string dataset_; + int data_nthreads_; + std::string data_layer_type_; + std::vector rgb_mean_; + std::vector rgb_std_; + int shuffle_chunk_seed_; + int seed_; + bool benchmark_; +}; + + +/* + * The constructor takes following parameters as input: + * 1. model_json_file: The model in json formatted file. + * 2. model_params_file: File containing model parameters + * 3. input_shape: Shape of input data to the model. 
Since this class will be running one inference at a time, + * the input shape is required to be in format Shape(1, number_of_channels, height, width) + * The input image will be resized to (height x width) size before running the inference. + * 4. use_gpu: determine if run inference on GPU + * 5. enable_tensorrt: determine if enable TensorRT + * 6. dataset: data file (.rec) to be used for inference + * 7. data_nthreads: number of threads for data loading + * 8. data_layer_type: data type for data layer + * 9. rgb_mean: mean value to be subtracted on R/G/B channel + * 10. rgb_std: standard deviation on R/G/B channel + * 11. shuffle_chunk_seed: shuffling chunk seed + * 12. seed: shuffling seed + * 13. benchmark: use dummy data for inference + * + * The constructor will: + * 1. Create ImageRecordIter based on the given dataset file. + * 2. Load the model and parameter files. + * 3. Infer and construct NDArrays according to the input argument and create an executor. + */ +Predictor::Predictor(const std::string& model_json_file, + const std::string& model_params_file, + const Shape& input_shape, + bool use_gpu, + bool enable_tensorrt, + const std::string& dataset, + const int data_nthreads, + const std::string& data_layer_type, + const std::vector& rgb_mean, + const std::vector& rgb_std, + int shuffle_chunk_seed, + int seed, bool benchmark) + : input_shape_(input_shape), + use_gpu_(use_gpu), + enable_tensorrt_(enable_tensorrt), + dataset_(dataset), + data_nthreads_(data_nthreads), + data_layer_type_(data_layer_type), + rgb_mean_(rgb_mean), + rgb_std_(rgb_std), + shuffle_chunk_seed_(shuffle_chunk_seed), + seed_(seed), + benchmark_(benchmark) { + if (use_gpu) { + global_ctx_ = Context::gpu(); + } + + // initilize data iterator + if (!benchmark_ && !CreateImageRecordIter()) { + LG << "Error: failed to create ImageRecordIter"; + throw std::runtime_error("ImageRecordIter cannot be created"); + } + + // Load the model + LoadModel(model_json_file); + // Initilize the 
parameters + // benchmark=true && model_params_file.empty(), randomly initialize parameters + // else, load parameters + if (benchmark_ && model_params_file.empty()) { + InitParameters(); + } else { + LoadParameters(model_params_file); + } + + int dtype = GetDataLayerType(); + if (dtype == -1) { + throw std::runtime_error("Unsupported data layer type..."); + } + args_map_["data"] = NDArray(input_shape_, global_ctx_, false, dtype); + Shape label_shape(input_shape_[0]); + args_map_["softmax_label"] = NDArray(label_shape, global_ctx_, false); + std::vector arg_arrays; + std::vector grad_arrays; + std::vector grad_reqs; + std::vector aux_arrays; + + // infer and create ndarrays according to the given input ndarrays. + net_.InferExecutorArrays(global_ctx_, &arg_arrays, &grad_arrays, &grad_reqs, + &aux_arrays, args_map_, std::map(), + std::map(), aux_map_); + for (auto& i : grad_reqs) i = OpReqType::kNullOp; + + // Create an executor after binding the model to input parameters. + executor_ = new Executor(net_, global_ctx_, arg_arrays, grad_arrays, grad_reqs, aux_arrays); +} + +/* + * The following function is used to get the data layer type for input data + */ +int Predictor::GetDataLayerType() { + int ret_type = -1; + if (data_layer_type_ == "float32") { + ret_type = kFloat32; + } else if (data_layer_type_ == "int8") { + ret_type = kInt8; + } else if (data_layer_type_ == "uint8") { + ret_type = kUint8; + } else { + LG << "Unsupported data layer type " << data_layer_type_ << "..." 
+ << "Please use one of {float32, int8, uint8}"; + } + return ret_type; +} + +/* + * create a new ImageRecordIter according to the given parameters + */ +bool Predictor::CreateImageRecordIter() { + val_iter_ = new MXDataIter("ImageRecordIter"); + if (!FileExists(dataset_)) { + LG << "Error: " << dataset_ << " must be provided"; + return false; + } + + std::vector shape_vec; + for (index_t i = 1; i < input_shape_.ndim(); i++) + shape_vec.push_back(input_shape_[i]); + mxnet::TShape data_shape(shape_vec.begin(), shape_vec.end()); + + // set image record parser parameters + val_iter_->SetParam("path_imgrec", dataset_); + val_iter_->SetParam("label_width", 1); + val_iter_->SetParam("data_shape", data_shape); + val_iter_->SetParam("preprocess_threads", data_nthreads_); + val_iter_->SetParam("shuffle_chunk_seed", shuffle_chunk_seed_); + + // set Batch parameters + val_iter_->SetParam("batch_size", input_shape_[0]); + + // image record parameters + val_iter_->SetParam("shuffle", true); + val_iter_->SetParam("seed", seed_); + + // set normalize parameters + val_iter_->SetParam("mean_r", rgb_mean_[0]); + val_iter_->SetParam("mean_g", rgb_mean_[1]); + val_iter_->SetParam("mean_b", rgb_mean_[2]); + val_iter_->SetParam("std_r", rgb_std_[0]); + val_iter_->SetParam("std_g", rgb_std_[1]); + val_iter_->SetParam("std_b", rgb_std_[2]); + + // set prefetcher parameters + if (use_gpu_) { + val_iter_->SetParam("ctx", "gpu"); + } else { + val_iter_->SetParam("ctx", "cpu"); + } + val_iter_->SetParam("dtype", data_layer_type_); + + val_iter_->CreateDataIter(); + return true; +} + +/* + * The following function loads the model from json file. 
+ */ +void Predictor::LoadModel(const std::string& model_json_file) { + if (!FileExists(model_json_file)) { + LG << "Model file " << model_json_file << " does not exist"; + throw std::runtime_error("Model file does not exist"); + } + LG << "Loading the model from " << model_json_file << std::endl; + net_ = Symbol::Load(model_json_file); + if (enable_tensorrt_) { + net_ = net_.GetBackendSymbol("TensorRT"); + } +} + +/* + * The following function loads the model parameters. + */ +void Predictor::LoadParameters(const std::string& model_parameters_file) { + if (!FileExists(model_parameters_file)) { + LG << "Parameter file " << model_parameters_file << " does not exist"; + throw std::runtime_error("Model parameters does not exist"); + } + LG << "Loading the model parameters from " << model_parameters_file << std::endl; + std::map parameters; + NDArray::Load(model_parameters_file, 0, ¶meters); + if (enable_tensorrt_) { + std::map intermediate_args_map; + std::map intermediate_aux_map; + SplitParamMap(parameters, &intermediate_args_map, &intermediate_aux_map, Context::cpu()); + contrib::InitTensorRTParams(net_, &intermediate_args_map, &intermediate_aux_map); + ConvertParamMapToTargetContext(intermediate_args_map, &args_map_, global_ctx_); + ConvertParamMapToTargetContext(intermediate_aux_map, &aux_map_, global_ctx_); + } else { + SplitParamMap(parameters, &args_map_, &aux_map_, global_ctx_); + } + /*WaitAll is need when we copy data between GPU and the main memory*/ + NDArray::WaitAll(); +} + +/* + * The following function split loaded param map into arg parm + * and aux param with target context + */ +void Predictor::SplitParamMap(const std::map ¶mMap, + std::map *argParamInTargetContext, + std::map *auxParamInTargetContext, + Context targetContext) { + for (const auto& pair : paramMap) { + std::string type = pair.first.substr(0, 4); + std::string name = pair.first.substr(4); + if (type == "arg:") { + (*argParamInTargetContext)[name] = pair.second.Copy(targetContext); + 
} else if (type == "aux:") { + (*auxParamInTargetContext)[name] = pair.second.Copy(targetContext); + } + } +} + +/* + * The following function copy the param map into the target context + */ +void Predictor::ConvertParamMapToTargetContext(const std::map ¶mMap, + std::map *paramMapInTargetContext, + Context targetContext) { + for (const auto& pair : paramMap) { + (*paramMapInTargetContext)[pair.first] = pair.second.Copy(targetContext); + } +} + +/* + * The following function randomly initializes the parameters when benchmark_ is true. + */ +void Predictor::InitParameters() { + std::vector data_shape; + for (index_t i = 0; i < input_shape_.ndim(); i++) { + data_shape.push_back(input_shape_[i]); + } + + std::map > arg_shapes; + std::vector > aux_shapes, in_shapes, out_shapes; + arg_shapes["data"] = data_shape; + net_.InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes); + + // initializer to call + Xavier xavier(Xavier::uniform, Xavier::avg, 2.0f); + + auto arg_name_list = net_.ListArguments(); + for (index_t i = 0; i < in_shapes.size(); i++) { + const auto &shape = in_shapes[i]; + const auto &arg_name = arg_name_list[i]; + int paramType = kFloat32; + if (Initializer::StringEndWith(arg_name, "weight_quantize") || + Initializer::StringEndWith(arg_name, "bias_quantize")) { + paramType = kInt8; + } + NDArray tmp_arr(shape, global_ctx_, false, paramType); + xavier(arg_name, &tmp_arr); + args_map_[arg_name] = tmp_arr.Copy(global_ctx_); + } + + auto aux_name_list = net_.ListAuxiliaryStates(); + for (index_t i = 0; i < aux_shapes.size(); i++) { + const auto &shape = aux_shapes[i]; + const auto &aux_name = aux_name_list[i]; + NDArray tmp_arr(shape, global_ctx_, false); + xavier(aux_name, &tmp_arr); + aux_map_[aux_name] = tmp_arr.Copy(global_ctx_); + } + /*WaitAll is need when we copy data between GPU and the main memory*/ + NDArray::WaitAll(); +} + +/* + * The following function runs the forward pass on the model + * and use dummy data for benchmark. 
+ */ +void Predictor::BenchmarkScore(int num_inference_batches) { + // Create dummy data + std::vector dummy_data(input_shape_.Size()); + std::default_random_engine generator; + std::uniform_real_distribution val(0.0f, 1.0f); + for (size_t i = 0; i < static_cast(input_shape_.Size()); ++i) { + dummy_data[i] = static_cast(val(generator)); + } + executor_->arg_dict()["data"].SyncCopyFromCPU( + dummy_data.data(), + input_shape_.Size()); + NDArray::WaitAll(); + + LG << "Running the forward pass on model to evaluate the performance.."; + + // warm up. + for (int i = 0; i < 5; i++) { + executor_->Forward(false); + NDArray::WaitAll(); + } + + // Run the forward pass. + double ms = ms_now(); + for (int i = 0; i < num_inference_batches; i++) { + executor_->Forward(false); + NDArray::WaitAll(); + } + ms = ms_now() - ms; + LG << " benchmark completed!"; + LG << " batch size: " << input_shape_[0] << " num batch: " << num_inference_batches + << " throughput: " << 1000.0 * input_shape_[0] * num_inference_batches / ms + << " imgs/s latency:" << ms / input_shape_[0] / num_inference_batches << " ms"; +} + +/* + * \param skipped_batches skip the first number of batches + * + */ +bool Predictor::AdvanceDataIter(int skipped_batches) { + assert(skipped_batches >= 0); + if (skipped_batches == 0) return true; + int skipped_count = 0; + while (val_iter_->Next()) { + if (++skipped_count >= skipped_batches) break; + } + if (skipped_count != skipped_batches) return false; + return true; +} + +/* + * The following function runs the forward pass on the model + * and use real data for testing accuracy and performance. 
+ */ +void Predictor::Score(int num_skipped_batches, int num_inference_batches) { + // Create metrics + Accuracy val_acc; + + val_iter_->Reset(); + val_acc.Reset(); + int nBatch = 0; + + if (!AdvanceDataIter(num_skipped_batches)) { + LG << "skipped batches should less than total batches!"; + return; + } + + double ms = ms_now(); + while (val_iter_->Next()) { + auto data_batch = val_iter_->GetDataBatch(); + data_batch.data.CopyTo(&args_map_["data"]); + data_batch.label.CopyTo(&args_map_["softmax_label"]); + NDArray::WaitAll(); + + // running on forward pass + executor_->Forward(false); + NDArray::WaitAll(); + val_acc.Update(data_batch.label, executor_->outputs[0]); + + if (++nBatch >= num_inference_batches) { + break; + } + } + ms = ms_now() - ms; + auto args_name = net_.ListArguments(); + LG << "INFO:" << "Dataset for inference: " << dataset_; + LG << "INFO:" << "label_name = " << args_name[args_name.size()-1]; + LG << "INFO:" << "rgb_mean: " << "(" << rgb_mean_[0] << ", " << rgb_mean_[1] + << ", " << rgb_mean_[2] << ")"; + LG << "INFO:" << "rgb_std: " << "(" << rgb_std_[0] << ", " << rgb_std_[1] + << ", " << rgb_std_[2] << ")"; + LG << "INFO:" << "Image shape: " << "(" << input_shape_[1] << ", " + << input_shape_[2] << ", " << input_shape_[3] << ")"; + LG << "INFO:" << "Finished inference with: " << nBatch * input_shape_[0] + << " images "; + LG << "INFO:" << "Batch size = " << input_shape_[0] << " for inference"; + LG << "INFO:" << "Accuracy: " << val_acc.Get(); + LG << "INFO:" << "Throughput: " << (1000.0 * nBatch * input_shape_[0] / ms) + << " images per second"; +} + +Predictor::~Predictor() { + if (executor_) { + delete executor_; + } + if (!benchmark_ && val_iter_) { + delete val_iter_; + } + MXNotifyShutdown(); +} + +/* + * Convert the input string of number into the vector. 
+ */ +template +std::vector createVectorFromString(const std::string& input_string) { + std::vector dst_vec; + char *p_next; + T elem; + bool bFloat = std::is_same::value; + if (!bFloat) { + elem = strtol(input_string.c_str(), &p_next, 10); + } else { + elem = strtof(input_string.c_str(), &p_next); + } + + dst_vec.push_back(elem); + while (*p_next) { + if (!bFloat) { + elem = strtol(p_next, &p_next, 10); + } else { + elem = strtof(p_next, &p_next); + } + dst_vec.push_back(elem); + } + return dst_vec; +} + +void printUsage() { + std::cout << "Usage:" << std::endl; + std::cout << "imagenet_inference --symbol_file " << std::endl + << "--params_file " << std::endl + << "--dataset " << std::endl + << "--data_nthreads " << std::endl + << "--input_shape ] " << std::endl + << "--rgb_mean " + << std::endl + << "--rgb_std " << std::endl + << "--batch_size " << std::endl + << "--num_skipped_batches " << std::endl + << "--num_inference_batches " << std::endl + << "--data_layer_type " << std::endl + << "--gpu " << std::endl + << "--enableTRT " << std::endl + << "--benchmark " + << std::endl; +} + +int main(int argc, char** argv) { + std::string model_file_json; + std::string model_file_params; + std::string dataset(""); + std::string input_rgb_mean("0 0 0"); + std::string input_rgb_std("1 1 1"); + bool use_gpu = false; + bool enable_tensorrt = false; + bool benchmark = false; + int batch_size = 64; + int num_skipped_batches = 0; + int num_inference_batches = 100; + std::string data_layer_type("float32"); + std::string input_shape("3 224 224"); + int seed = 48564309; + int shuffle_chunk_seed = 3982304; + int data_nthreads = 60; + + int index = 1; + while (index < argc) { + if (strcmp("--symbol_file", argv[index]) == 0) { + index++; + model_file_json = (index < argc ? argv[index]:""); + } else if (strcmp("--params_file", argv[index]) == 0) { + index++; + model_file_params = (index < argc ? 
argv[index]:""); + } else if (strcmp("--dataset", argv[index]) == 0) { + index++; + dataset = (index < argc ? argv[index]:dataset); + } else if (strcmp("--data_nthreads", argv[index]) == 0) { + index++; + data_nthreads = strtol(argv[index], nullptr, 10); + } else if (strcmp("--input_shape", argv[index]) == 0) { + index++; + input_shape = (index < argc ? argv[index]:input_shape); + } else if (strcmp("--rgb_mean", argv[index]) == 0) { + index++; + input_rgb_mean = (index < argc ? argv[index]:input_rgb_mean); + } else if (strcmp("--rgb_std", argv[index]) == 0) { + index++; + input_rgb_std = (index < argc ? argv[index]:input_rgb_std); + } else if (strcmp("--batch_size", argv[index]) == 0) { + index++; + batch_size = strtol(argv[index], nullptr, 10); + } else if (strcmp("--num_skipped_batches", argv[index]) == 0) { + index++; + num_skipped_batches = strtol(argv[index], nullptr, 10); + } else if (strcmp("--num_inference_batches", argv[index]) == 0) { + index++; + num_inference_batches = strtol(argv[index], nullptr, 10); + } else if (strcmp("--data_layer_type", argv[index]) == 0) { + index++; + data_layer_type = (index < argc ? 
argv[index]:data_layer_type); + } else if (strcmp("--gpu", argv[index]) == 0) { + use_gpu = true; + } else if (strcmp("--enableTRT", argv[index]) == 0) { + use_gpu = true; + enable_tensorrt = true; + } else if (strcmp("--benchmark", argv[index]) == 0) { + benchmark = true; + } else if (strcmp("--help", argv[index]) == 0) { + printUsage(); + return 0; + } + index++; + } + + if (model_file_json.empty() + || (!benchmark && model_file_params.empty()) + || (enable_tensorrt && model_file_params.empty())) { + LG << "ERROR: Model details such as symbol, param files are not specified"; + printUsage(); + return 1; + } + std::vector input_dimensions = createVectorFromString(input_shape); + input_dimensions.insert(input_dimensions.begin(), batch_size); + Shape input_data_shape(input_dimensions); + + std::vector rgb_mean = createVectorFromString(input_rgb_mean); + std::vector rgb_std = createVectorFromString(input_rgb_std); + + // Initialize the predictor object + Predictor predict(model_file_json, model_file_params, input_data_shape, use_gpu, enable_tensorrt, + dataset, data_nthreads, data_layer_type, rgb_mean, rgb_std, shuffle_chunk_seed, + seed, benchmark); + + if (benchmark) { + predict.BenchmarkScore(num_inference_batches); + } else { + predict.Score(num_skipped_batches, num_inference_batches); + } + return 0; +} diff --git a/cpp-package/example/inference/inference.mk b/cpp-package/example/inference/inference.mk new file mode 100644 index 000000000000..b03055395f21 --- /dev/null +++ b/cpp-package/example/inference/inference.mk @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CPPEX_SRC = $(wildcard cpp-package/example/inference/*.cpp) +CPPEX_EXE = $(patsubst cpp-package/example/inference/%.cpp, build/cpp-package/example/%, $(CPPEX_SRC)) + +CPPEX_CFLAGS += -Icpp-package/include +CPPEX_EXTRA_LDFLAGS := -L$(ROOTDIR)/lib -lmxnet + +EXTRA_PACKAGES += cpp-package-inference-example-all +EXTRA_PACKAGES_CLEAN += cpp-package-inference-example-clean + +.PHONY: cpp-package-inference-example-all cpp-package-inference-example-clean + +cpp-package-inference-example-all: cpp-package-all $(CPPEX_EXE) + +build/cpp-package/example/% : cpp-package/example/inference/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) + @mkdir -p $(@D) + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/inference/$* $< >build/cpp-package/example/$*.d + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + +cpp-package-inference-example-clean: + rm -rf build/cpp-package/example/inference* + +-include build/cpp-package/example/inference/*.d diff --git a/cpp-package/example/inference/sentiment_analysis_rnn.cpp b/cpp-package/example/inference/sentiment_analysis_rnn.cpp new file mode 100755 index 000000000000..53b618ff116c --- /dev/null +++ b/cpp-package/example/inference/sentiment_analysis_rnn.cpp @@ -0,0 +1,488 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * This example demonstrates sentiment prediction workflow with pre-trained RNN model using MXNet C++ API. + * The example performs following tasks. + * 1. Load the pre-trained RNN model, + * 2. Load the dictionary file that contains word to index mapping. + * 3. Create executors for pre-determined input lengths. + * 4. Convert each line in the input to the vector of indices. + * 5. Predictor finds the right executor for each line. + * 4. Run the forward pass for each line and predicts the sentiment scores. + * The example uses a pre-trained RNN model that is trained with the IMDB dataset. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +static const int DEFAULT_BUCKET_KEYS[] = {30, 25, 20, 15, 10, 5}; +static const char DEFAULT_S3_URL[] = "https://s3.amazonaws.com/mxnet-cpp/RNN_model/"; + + +/* + * class Predictor + * + * This class encapsulates the functionality to load the model, process input image and run the forward pass. 
+ */ + +class Predictor { + public: + Predictor() {} + Predictor(const std::string& model_json, + const std::string& model_params, + const std::string& input_dictionary, + const std::vector& bucket_keys, + bool use_gpu = false); + float PredictSentiment(const std::string &input_review); + ~Predictor(); + + private: + void LoadModel(const std::string& model_json_file); + void LoadParameters(const std::string& model_parameters_file); + void LoadDictionary(const std::string &input_dictionary); + inline bool FileExists(const std::string& name) { + struct stat buffer; + return (stat(name.c_str(), &buffer) == 0); + } + float PredictSentimentForOneLine(const std::string &input_line); + int ConvertToIndexVector(const std::string& input, + std::vector *input_vector); + int GetIndexForOutputSymbolName(const std::string& output_symbol_name); + float GetIndexForWord(const std::string& word); + int GetClosestBucketKey(int num_words); + + std::map args_map; + std::map aux_map; + std::map wordToIndex; + Symbol net; + std::map executor_buckets; + Context global_ctx = Context::cpu(); + int highest_bucket_key; +}; + + +/* + * The constructor takes the following parameters as input: + * 1. model_json: The RNN model in json formatted file. + * 2. model_params: File containing model parameters + * 3. input_dictionary: File containing the word and associated index. + * 4. bucket_keys: A vector of bucket keys for creating executors. + * + * The constructor: + * 1. Loads the model and parameter files. + * 2. Loads the dictionary file to create index to word and word to index maps. + * 3. For each bucket key in the input vector of bucket keys, it creates an executor. + * The executors share the memory. The bucket key determines the length of input data + * required for that executor. + * 4. Creates a map of bucket key to corresponding executor. + * 5. The model is loaded only once. The executors share the memory for the parameters. 
+ */ +Predictor::Predictor(const std::string& model_json, + const std::string& model_params, + const std::string& input_dictionary, + const std::vector& bucket_keys, + bool use_gpu) { + if (use_gpu) { + global_ctx = Context::gpu(); + } + + /* + * Load the dictionary file that contains the word and its index. + * The function creates word to index and index to word map. The maps are used to create index + * vector for the input sentence. + */ + LoadDictionary(input_dictionary); + + // Load the model + LoadModel(model_json); + + // Load the model parameters. + LoadParameters(model_params); + + /* + * Create the executors for each bucket key. The bucket key represents the shape of input data. + * The executors will share the memory by using following technique: + * 1. Infer the executor arrays and bind the first executor with the first bucket key. + * 2. Then for creating the next bucket key, adjust the shape of input argument to match that key. + * 3. Create the executor for the next bucket key by passing the inferred executor arrays and + * pointer to the executor created for the first key. + */ + std::vector arg_arrays; + std::vector grad_arrays; + std::vector grad_reqs; + std::vector aux_arrays; + + /* + * Create master executor with highest bucket key for optimizing the shared memory between the + * executors for the remaining bucket keys. 
+ */ + highest_bucket_key = *(std::max_element(bucket_keys.begin(), bucket_keys.end())); + args_map["data0"] = NDArray(Shape(highest_bucket_key, 1), global_ctx, false); + args_map["data1"] = NDArray(Shape(1), global_ctx, false); + + net.InferExecutorArrays(global_ctx, &arg_arrays, &grad_arrays, &grad_reqs, + &aux_arrays, args_map, std::map(), + std::map(), aux_map); + Executor *master_executor = net.Bind(global_ctx, arg_arrays, grad_arrays, grad_reqs, aux_arrays, + std::map(), nullptr); + executor_buckets[highest_bucket_key] = master_executor; + + for (int bucket : bucket_keys) { + if (executor_buckets.find(bucket) == executor_buckets.end()) { + arg_arrays[0] = NDArray(Shape(bucket, 1), global_ctx, false); + Executor *executor = net.Bind(global_ctx, arg_arrays, grad_arrays, grad_reqs, aux_arrays, + std::map(), master_executor); + executor_buckets[bucket] = executor; + } + } +} + + +/* + * The following function loads the model from json file. + */ +void Predictor::LoadModel(const std::string& model_json_file) { + if (!FileExists(model_json_file)) { + LG << "Model file " << model_json_file << " does not exist"; + throw std::runtime_error("Model file does not exist"); + } + LG << "Loading the model from " << model_json_file << std::endl; + net = Symbol::Load(model_json_file); +} + + +/* + * The following function loads the model parameters. 
+ */ +void Predictor::LoadParameters(const std::string& model_parameters_file) { + if (!FileExists(model_parameters_file)) { + LG << "Parameter file " << model_parameters_file << " does not exist"; + throw std::runtime_error("Model parameters does not exist"); + } + LG << "Loading the model parameters from " << model_parameters_file << std::endl; + std::map parameters; + NDArray::Load(model_parameters_file, 0, ¶meters); + for (const auto &k : parameters) { + if (k.first.substr(0, 4) == "aux:") { + auto name = k.first.substr(4, k.first.size() - 4); + aux_map[name] = k.second.Copy(global_ctx); + } + if (k.first.substr(0, 4) == "arg:") { + auto name = k.first.substr(4, k.first.size() - 4); + args_map[name] = k.second.Copy(global_ctx); + } + } + /*WaitAll is need when we copy data between GPU and the main memory*/ + NDArray::WaitAll(); +} + + +/* + * The following function loads the dictionary file. + * The function constructs the word to index and index to word maps. + * These maps will be used to represent words in the input sentence to their indices. + * Ensure to use the same dictionary file that was used for training the network. + */ +void Predictor::LoadDictionary(const std::string& input_dictionary) { + if (!FileExists(input_dictionary)) { + LG << "Dictionary file " << input_dictionary << " does not exist"; + throw std::runtime_error("Dictionary file does not exist"); + } + LG << "Loading the dictionary file."; + std::ifstream fi(input_dictionary.c_str()); + if (!fi.is_open()) { + std::cerr << "Error opening dictionary file " << input_dictionary << std::endl; + assert(false); + } + + std::string line; + std::string word; + int index; + while (std::getline(fi, line)) { + std::istringstream stringline(line); + stringline >> word >> index; + wordToIndex[word] = index; + } + fi.close(); +} + + +/* + * The function returns the index associated with the word in the dictionary. + * If the word is not present, the index representing "" is returned. 
+ * If the "" is not present then 0 is returned. + */ +float Predictor::GetIndexForWord(const std::string& word) { + if (wordToIndex.find(word) == wordToIndex.end()) { + if (wordToIndex.find("") == wordToIndex.end()) + return 0; + else + return static_cast(wordToIndex[""]); + } + return static_cast(wordToIndex[word]); +} + +/* + * The function populates the input vector with indices from the dictionary that + * correspond to the words in the input string. + * The function returns the number of words in the input line. + */ +int Predictor::ConvertToIndexVector(const std::string& input, std::vector *input_vector) { + std::istringstream input_string(input); + input_vector->clear(); + const char delimiter = ' '; + std::string token; + size_t words = 0; + while (std::getline(input_string, token, delimiter) && (words <= input_vector->size())) { + input_vector->push_back(GetIndexForWord(token)); + words++; + } + return words; +} + + +/* + * The function returns the index at which the given symbol name will appear + * in the output vector of NDArrays obtained after running the forward pass on the executor. + */ +int Predictor::GetIndexForOutputSymbolName(const std::string& output_symbol_name) { + int index = 0; + for (const std::string op : net.ListOutputs()) { + if (op == output_symbol_name) { + return index; + } else { + index++; + } + } + throw std::runtime_error("The output symbol name can not be found"); +} + + +/* + * The function finds the closest bucket for the given num_words in the input line. + * If the exact bucket key exists, function returns that bucket key. + * If the matching bucket key does not exist, function looks for the next bucket key + * that is greater than given num_words. + * If the next larger bucket does not exist, function returns the largest bucket key. 
+ */ +int Predictor::GetClosestBucketKey(int num_words) { + int closest_bucket_key = highest_bucket_key; + + if (executor_buckets.lower_bound(num_words) != executor_buckets.end()) { + closest_bucket_key = executor_buckets.lower_bound(num_words)->first; + } + return closest_bucket_key; +} + + +/* + * The following function runs the forward pass on the model for the given line. + * + */ +float Predictor::PredictSentimentForOneLine(const std::string& input_line) { + /* + * Initialize a vector of length equal to 'num_words' with index corresponding to . + * Convert the input string to a vector of indices that represent + * the words in the input string. + */ + std::vector index_vector(GetIndexForWord("")); + int num_words = ConvertToIndexVector(input_line, &index_vector); + int bucket_key = GetClosestBucketKey(num_words); + + /* + * The index_vector has size equal to num_words. The vector needs to be padded if + * the bucket_key is greater than num_words. The vector needs to be trimmed if + * the bucket_key is smaller than num_words. + */ + index_vector.resize(bucket_key, GetIndexForWord("")); + + Executor* executor = executor_buckets[bucket_key]; + executor->arg_dict()["data0"].SyncCopyFromCPU(index_vector.data(), index_vector.size()); + executor->arg_dict()["data1"] = num_words; + + // Run the forward pass. + executor->Forward(false); + + /* + * The output is available in executor->outputs. It is a vector of + * NDArray. We need to find the index in that vector that + * corresponds to the output symbol "sentimentnet0_hybridsequential0_dense0_fwd_output". + */ + const std::string output_symbol_name = "sentimentnet0_hybridsequential0_dense0_fwd_output"; + int output_index = GetIndexForOutputSymbolName(output_symbol_name); + std::vector outputs = executor->outputs; + auto arrayout = executor->outputs[output_index].Copy(global_ctx); + /* + * We will run sigmoid operator to find out the sentiment score between + * 0 and 1 where 1 represents positive. 
+ */ + NDArray ret; + Operator("sigmoid")(arrayout).Invoke(ret); + ret.WaitToRead(); + + return ret.At(0, 0); +} + + +/* + * The function predicts the sentiment score for the input review. + * The function splits the input review in lines (separated by '.'). + * It finds sentiment score for each line and computes the average. + */ +float Predictor::PredictSentiment(const std::string& input_review) { + std::istringstream input_string(input_review); + int num_lines = 0; + float sentiment_score = 0.0f; + + // Split the iput review in separate lines separated by '.' + const char delimiter = '.'; + std::string line; + while (std::getline(input_string, line, delimiter)) { + // Predict the sentiment score for each line. + float score = PredictSentimentForOneLine(line); + LG << "Input Line : [" << line << "] Score : " << score; + sentiment_score += score; + num_lines++; + } + + // Find the average sentiment score. + sentiment_score = sentiment_score / num_lines; + return sentiment_score; +} + + +/* + * The destructor frees the executor and notifies MXNetEngine to shutdown. + */ +Predictor::~Predictor() { + for (auto bucket : this->executor_buckets) { + Executor* executor = bucket.second; + delete executor; + } + MXNotifyShutdown(); +} + + +/* + * The function prints the usage information. + */ +void printUsage() { + std::cout << "Usage:" << std::endl; + std::cout << "sentiment_analysis_rnn " << std::endl + << "--input Input movie review. The review can be single line or multiline." + << "e.g. \"This movie is the best.\" OR " + << "\"This movie is the best. The direction is awesome.\" " << std::endl + << "[--gpu] Specify this option if workflow needs to be run in gpu context " + << std::endl + << "If the review is multiline, the example predicts sentiment score for each line " + << "and the final score is the average of scores obtained for each line." + << std::endl; +} + + +/* + * The function downloads the model files from s3 bucket. 
+ */ +void DownloadFiles(const std::vector model_files) { + std::string wget_command("wget -nc "); + std::string s3_url(DEFAULT_S3_URL); + for (auto &file : model_files) { + std::ostringstream oss; + oss << wget_command << s3_url << file << " -O " << file; + int status = system(oss.str().c_str()); + LG << "Downloading " << file << " with status " << status; + } + return; +} + + +int main(int argc, char** argv) { + std::string model_file_json = "./sentiment_analysis-symbol.json"; + std::string model_file_params ="./sentiment_analysis-0010.params"; + std::string input_dictionary = "./sentiment_token_to_idx.txt"; + std::string input_review = "This movie is the best"; + bool use_gpu = false; + + int index = 1; + while (index < argc) { + if (strcmp("--input", argv[index]) == 0) { + index++; + input_review = (index < argc ? argv[index]:input_review); + } else if (strcmp("--gpu", argv[index]) == 0) { + use_gpu = true; + } else if (strcmp("--help", argv[index]) == 0) { + printUsage(); + return 0; + } + index++; + } + + + /* + * Download the trained RNN model file, param file and dictionary file. + * The dictionary file contains word to index mapping. + * Each line of the dictionary file contains a word and the unique index for that word separated + * by a space. For example: + * snippets 11172 + * This dictionary file is created when the RNN model was trained with a particular dataset. + * Hence the dictionary file is specific to the dataset with which model was trained. + */ + std::vector files; + files.push_back(model_file_json); + files.push_back(model_file_params); + files.push_back(input_dictionary); + + DownloadFiles(files); + + std::vector buckets(DEFAULT_BUCKET_KEYS, + DEFAULT_BUCKET_KEYS + sizeof(DEFAULT_BUCKET_KEYS) / sizeof(int)); + + try { + // Initialize the predictor object + Predictor predict(model_file_json, model_file_params, input_dictionary, buckets, use_gpu); + + // Run the forward pass to predict the sentiment score for the given review. 
+ float sentiment_score = predict.PredictSentiment(input_review); + LG << "The sentiment score between 0 and 1, (1 being positive)=" << sentiment_score; + } catch (std::runtime_error &error) { + LG << MXGetLastError(); + LG << "Execution failed with ERROR: " << error.what(); + return 1; + } catch (...) { + /* + * If underlying MXNet code has thrown an exception the error message is + * accessible through MXGetLastError() function. + */ + LG << "Execution failed with following MXNet error"; + LG << MXGetLastError(); + return 1; + } + return 0; +} diff --git a/cpp-package/example/inference/unit_test_imagenet_inference.sh b/cpp-package/example/inference/unit_test_imagenet_inference.sh new file mode 100755 index 000000000000..c645388cd419 --- /dev/null +++ b/cpp-package/example/inference/unit_test_imagenet_inference.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex +# create ./model directory if not existed +if [ ! -d model ]; then + mkdir -p model +fi +# create ./data directory if not existed +if [ ! -d data ]; then + mkdir -p data +fi +# Downloading the data and model if not existed +model_file=./model/Inception-BN-symbol.json +params_file=./model/Inception-BN-0126.params +if [ ! 
-f ${model_file} ] || [ ! -f ${params_file} ]; then + wget -nc http://data.mxnet.io/models/imagenet/inception-bn.tar.gz + tar -xvzf inception-bn.tar.gz -C model +fi +cd model +wget -nc https://raw.githubusercontent.com/dmlc/gluon-cv/master/gluoncv/model_zoo/quantized/resnet50_v1_int8-symbol.json +cd ../data +wget -nc http://data.mxnet.io/data/val_256_q90.rec +cd .. + +# Running inference on imagenet. +if [ "$(uname)" == "Darwin" ]; then + echo ">>> INFO: FP32 real data" + DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --params_file "./model/Inception-BN-0126.params" --dataset "./data/val_256_q90.rec" --rgb_mean "123.68 116.779 103.939" --batch_size 1 --num_skipped_batches 50 --num_inference_batches 500 + + echo ">>> INFO: FP32 dummy data" + DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --batch_size 1 --num_inference_batches 500 --benchmark +else + echo ">>> INFO: FP32 real data" + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --params_file "./model/Inception-BN-0126.params" --dataset "./data/val_256_q90.rec" --rgb_mean "123.68 116.779 103.939" --batch_size 1 --num_skipped_batches 50 --num_inference_batches 500 + + echo ">>> INFO: FP32 dummy data" + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --batch_size 1 --num_inference_batches 500 --benchmark + + lib_name=$(ls -a ../../../lib | grep -oE 'mkldnn' | tail -1) + if [[ -n ${lib_name} ]] && [[ 'mkldnn' =~ ${lib_name} ]]; then + echo ">>> INFO: INT8 dummy data" + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./imagenet_inference --symbol_file "./model/resnet50_v1_int8-symbol.json" --batch_size 1 --num_inference_batches 500 --benchmark + else + echo "Skipped INT8 test because mkldnn was not found which is required for running 
inference with quantized models." + fi +fi diff --git a/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh b/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh new file mode 100755 index 000000000000..6f42e449ce58 --- /dev/null +++ b/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +function compare_range() { + perl -e "{if($1>$2 && $1<=$3){print 1} else {print 0}}" +} + +set -e # exit on the first error +export EXE_NAME="sentiment_analysis_rnn" + +# Running the example with a movie review. +if [ "$(uname)" == "Darwin" ]; then + DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./${EXE_NAME} --input "This movie is the best." 2&> ${EXE_NAME}.log +else + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./${EXE_NAME} --input "This movie is the best." 
2&> ${EXE_NAME}.log +fi +result=`grep "The sentiment score between 0 and 1.*\=" ${EXE_NAME}.log | cut -d '=' -f2` +lower_bound=0.8 +upper_bound=0.99 +if [ $(compare_range $result $lower_bound $upper_bound) == 1 ]; +then + echo "PASS: ${EXE_NAME} correctly predicted the sentiment with score = $result" + exit 0 +else + echo "FAIL: ${EXE_NAME} FAILED to predict the sentiment with score = $result" + exit 1 +fi \ No newline at end of file diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp new file mode 100644 index 000000000000..54be0edccc14 --- /dev/null +++ b/cpp-package/example/lenet.cpp @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" + +using namespace mxnet::cpp; + +class Lenet { + public: + Lenet() + : ctx_cpu(Context(DeviceType::kCPU, 0)), +#if MXNET_USE_CPU + ctx_dev(Context(DeviceType::kCPU, 0)) +#else + ctx_dev(Context(DeviceType::kGPU, 0)) +#endif + {} + + void Run(int max_epoch) { + /* + * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. + * "Gradient-based learning applied to document recognition." 
+ * Proceedings of the IEEE (1998) + * */ + + /*define the symbolic net*/ + Symbol data = Symbol::Variable("data"); + Symbol data_label = Symbol::Variable("data_label"); + Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); + Symbol conv2_w("conv2_w"), conv2_b("conv2_b"); + Symbol conv3_w("conv3_w"), conv3_b("conv3_b"); + Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); + Symbol fc2_w("fc2_w"), fc2_b("fc2_b"); + + Symbol conv1 = + Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20); + Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh); + Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, + Shape(5, 5), 50); + Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh); + Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b, + Shape(2, 2), 500); + Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::kTanh); + Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(1, 1)); + + Symbol flatten = Flatten("flatten", pool3); + Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500); + Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::kTanh); + Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10); + + Symbol lenet = SoftmaxOutput("softmax", fc2, data_label); + + for (auto s : lenet.ListArguments()) { + LG << s; + } + + /*setup basic configs*/ + int val_fold = 1; + int W = 28; + int H = 28; + int batch_size = 42; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + /*prepare the data*/ + std::vector data_vec, label_vec; + size_t data_count = GetData(&data_vec, &label_vec); + const float *dptr = data_vec.data(); + const float *lptr = 
label_vec.data(); + NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu, + false); // store in main memory, and copy to + // device memory while training + NDArray label_array = + NDArray(Shape(data_count), ctx_cpu, + false); // it's also ok if just store them all in device memory + data_array.SyncCopyFromCPU(dptr, data_count * W * H); + label_array.SyncCopyFromCPU(lptr, data_count); + data_array.WaitToRead(); + label_array.WaitToRead(); + + size_t train_num = data_count * (1 - val_fold / 10.0); + train_data = data_array.Slice(0, train_num); + train_label = label_array.Slice(0, train_num); + val_data = data_array.Slice(train_num, data_count); + val_label = label_array.Slice(train_num, data_count); + + LG << "here read fin"; + + /*init some of the args*/ + // map args_map; + args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev); + args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev); + NDArray::WaitAll(); + + LG << "here slice fin"; + /* + * we can also feed in some of the args other than the input all by + * ourselves, + * fc2-w , fc1-b for example: + * */ + // args_map["fc2_w"] = + // NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false); + // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]); + // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false); + // args_map["fc1_b"] = 0; + + lenet.InferArgsMap(ctx_dev, &args_map, args_map); + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0) + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + Executor *exe = lenet.SimpleBind(ctx_dev, args_map); + auto arg_names = lenet.ListArguments(); + + for (int ITER = 0; ITER < max_epoch; ++ITER) { + size_t start_index = 0; + while (start_index < train_num) { + if (start_index + batch_size > train_num) { + start_index = train_num - batch_size; + } + args_map["data"] = + train_data.Slice(start_index, 
start_index + batch_size) + .Copy(ctx_dev); + args_map["data_label"] = + train_label.Slice(start_index, start_index + batch_size) + .Copy(ctx_dev); + start_index += batch_size; + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } + } + + LG << "Iter " << ITER + << ", accuracy: " << ValAccuracy(batch_size * 10, lenet); + } + delete exe; + delete opt; + } + + private: + Context ctx_cpu; + Context ctx_dev; + std::map args_map; + NDArray train_data; + NDArray train_label; + NDArray val_data; + NDArray val_label; + + size_t GetData(std::vector *data, std::vector *label) { + const char *train_data_path = "./data/mnist_data/mnist_train.csv"; + std::ifstream inf(train_data_path); + std::string line; + inf >> line; // ignore the header + size_t _N = 0; + while (inf >> line) { + for (auto &c : line) c = (c == ',') ? 
' ' : c; + std::stringstream ss; + ss << line; + float _data; + ss >> _data; + label->push_back(_data); + while (ss >> _data) data->push_back(_data / 256.0); + _N++; + } + inf.close(); + return _N; + } + + float ValAccuracy(int batch_size, Symbol lenet) { + size_t val_num = val_data.GetShape()[0]; + + size_t correct_count = 0; + size_t all_count = 0; + + size_t start_index = 0; + while (start_index < val_num) { + if (start_index + batch_size > val_num) { + start_index = val_num - batch_size; + } + args_map["data"] = + val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev); + args_map["data_label"] = + val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev); + start_index += batch_size; + NDArray::WaitAll(); + + Executor *exe = lenet.SimpleBind(ctx_dev, args_map); + exe->Forward(false); + + const auto &out = exe->outputs; + NDArray out_cpu = out[0].Copy(ctx_cpu); + NDArray label_cpu = + val_label.Slice(start_index - batch_size, start_index).Copy(ctx_cpu); + + NDArray::WaitAll(); + + const mx_float *dptr_out = out_cpu.GetData(); + const mx_float *dptr_label = label_cpu.GetData(); + for (int i = 0; i < batch_size; ++i) { + float label = dptr_label[i]; + int cat_num = out_cpu.GetShape()[1]; + float p_label = 0, max_p = dptr_out[i * cat_num]; + for (int j = 0; j < cat_num; ++j) { + float p = dptr_out[i * cat_num + j]; + if (max_p < p) { + p_label = j; + max_p = p; + } + } + if (label == p_label) correct_count++; + } + all_count += batch_size; + + delete exe; + } + return correct_count * 1.0 / all_count; + } +}; + +int main(int argc, char const *argv[]) { + TRY + Lenet lenet; + lenet.Run(argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100000); + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp new file mode 100644 index 000000000000..6b37693cda59 --- /dev/null +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +Symbol LenetSymbol() { + /* + * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. + * "Gradient-based learning applied to document recognition." 
+ * Proceedings of the IEEE (1998) + * */ + + /*define the symbolic net*/ + Symbol data = Symbol::Variable("data"); + Symbol data_label = Symbol::Variable("data_label"); + Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); + Symbol conv2_w("conv2_w"), conv2_b("conv2_b"); + Symbol conv3_w("conv3_w"), conv3_b("conv3_b"); + Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); + Symbol fc2_w("fc2_w"), fc2_b("fc2_b"); + + Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20); + Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh); + Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50); + Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh); + Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax, + false, false, PoolingPoolingConvention::kValid, Shape(2, 2)); + + Symbol flatten = Flatten("flatten", pool2); + Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500); + Symbol tanh3 = Activation("tanh3", fc1, ActivationActType::kTanh); + Symbol fc2 = FullyConnected("fc2", tanh3, fc2_w, fc2_b, 10); + + Symbol lenet = SoftmaxOutput("softmax", fc2, data_label); + + return lenet; +} + +NDArray ResizeInput(NDArray data, const Shape new_shape) { + NDArray pic = data.Reshape(Shape(0, 1, 28, 28)); + NDArray output; + Operator("_contrib_BilinearResize2D") + .SetParam("height", new_shape[2]) + .SetParam("width", new_shape[3]) + (pic).Invoke(output); + return output; +} + +int main(int argc, char const *argv[]) { + /*setup basic configs*/ + int W = 28; + int H = 28; + int batch_size = 128; + int max_epoch = argc > 1 ? 
strtol(argv[1], nullptr, 10) : 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + auto dev_ctx = Context::cpu(); + int num_gpu; + MXGetGPUCount(&num_gpu); +#if !MXNET_USE_CPU + if (num_gpu > 0) { + dev_ctx = Context::gpu(); + } +#endif + + TRY + auto lenet = LenetSymbol(); + std::map args_map; + + const Shape data_shape = Shape(batch_size, 1, H, W), + label_shape = Shape(batch_size); + args_map["data"] = NDArray(data_shape, dev_ctx); + args_map["data_label"] = NDArray(label_shape, dev_ctx); + lenet.InferArgsMap(dev_ctx, &args_map, args_map); + + args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), dev_ctx); + NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]); + args_map["fc2_b"] = NDArray(Shape(10), dev_ctx); + args_map["fc2_b"] = 0; + + std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", + "./data/mnist_data/train-labels-idx1-ubyte", + "./data/mnist_data/t10k-images-idx3-ubyte", + "./data/mnist_data/t10k-labels-idx1-ubyte" + }; + + auto train_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } + + auto val_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } + + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0) + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + + auto *exec = lenet.SimpleBind(dev_ctx, args_map); + auto arg_names = lenet.ListArguments(); + + // Create metrics + Accuracy train_acc, val_acc; + + for (int iter = 0; iter < max_epoch; ++iter) { + int samples = 0; + train_iter.Reset(); + train_acc.Reset(); + + auto tic = std::chrono::system_clock::now(); + + while (train_iter.Next()) { + samples += batch_size; + auto data_batch = train_iter.GetDataBatch(); + + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); 
+ NDArray::WaitAll(); + + // Compute gradients + exec->Forward(true); + exec->Backward(); + + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + + // Update metric + train_acc.Update(data_batch.label, exec->outputs[0]); + } + + // one epoch of training is finished + auto toc = std::chrono::system_clock::now(); + float duration = std::chrono::duration_cast + (toc - tic).count() / 1000.0; + LG << "Epoch[" << iter << "] " << samples / duration \ + << " samples/sec " << "Train-Accuracy=" << train_acc.Get();; + + val_iter.Reset(); + val_acc.Reset(); + + Accuracy acu; + val_iter.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + + // Only forward pass is enough as no gradient is needed when evaluating + exec->Forward(false); + NDArray::WaitAll(); + acu.Update(data_batch.label, exec->outputs[0]); + val_acc.Update(data_batch.label, exec->outputs[0]); + } + LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get(); + } + + delete exec; + delete opt; + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp new file mode 100644 index 000000000000..970dad74e727 --- /dev/null +++ b/cpp-package/example/mlp.cpp @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + */ + +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +#include "utils.h" + +using namespace mxnet::cpp; + +/* + * In this example, + * we make by hand some data in 10 classes with some pattern + * and try to use MLP to recognize the pattern. + */ + +void OutputAccuracy(mx_float* pred, mx_float* target) { + int right = 0; + for (int i = 0; i < 128; ++i) { + float mx_p = pred[i * 10 + 0]; + float p_y = 0; + for (int j = 0; j < 10; ++j) { + if (pred[i * 10 + j] > mx_p) { + mx_p = pred[i * 10 + j]; + p_y = j; + } + } + if (p_y == target[i]) right++; + } + std::cout << "Accuracy: " << right / 128.0 << std::endl; +} + +void MLP(int max_epoch) { + auto sym_x = Symbol::Variable("X"); + auto sym_label = Symbol::Variable("label"); + + const int nLayers = 2; + std::vector layerSizes({512, 10}); + std::vector weights(nLayers); + std::vector biases(nLayers); + std::vector outputs(nLayers); + + Symbol null_sym; + for (int i = 0; i < nLayers; i++) { + std::string istr = std::to_string(i); + weights[i] = Symbol::Variable(std::string("w") + istr); + biases[i] = Symbol::Variable(std::string("b") + istr); + Symbol fc = FullyConnected(std::string("fc") + istr, + i == 0? 
sym_x : outputs[i-1], + weights[i], biases[i], layerSizes[i]); + outputs[i] = LeakyReLU(std::string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky); + } + auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label); + + Context ctx_dev(DeviceType::kCPU, 0); + + NDArray array_x(Shape(128, 28), ctx_dev, false); + NDArray array_y(Shape(128), ctx_dev, false); + + mx_float* aptr_x = new mx_float[128 * 28]; + mx_float* aptr_y = new mx_float[128]; + + // we make the data by hand, in 10 classes, with some pattern + for (int i = 0; i < 128; i++) { + for (int j = 0; j < 28; j++) { + aptr_x[i * 28 + j] = i % 10 * 1.0f; + } + aptr_y[i] = i % 10; + } + array_x.SyncCopyFromCPU(aptr_x, 128 * 28); + array_x.WaitToRead(); + array_y.SyncCopyFromCPU(aptr_y, 128); + array_y.WaitToRead(); + + // init the parameters + NDArray array_w_1(Shape(512, 28), ctx_dev, false); + NDArray array_b_1(Shape(512), ctx_dev, false); + NDArray array_w_2(Shape(10, 512), ctx_dev, false); + NDArray array_b_2(Shape(10), ctx_dev, false); + + // the parameters should be initialized in some kind of distribution, + // so it learns fast + // but here just give a const value by hand + array_w_1 = 0.5f; + array_b_1 = 0.0f; + array_w_2 = 0.5f; + array_b_2 = 0.0f; + + // the grads + NDArray array_w_1_g(Shape(512, 28), ctx_dev, false); + NDArray array_b_1_g(Shape(512), ctx_dev, false); + NDArray array_w_2_g(Shape(10, 512), ctx_dev, false); + NDArray array_b_2_g(Shape(10), ctx_dev, false); + + // Bind the symolic network with the ndarray + // all the input args + std::vector in_args; + in_args.push_back(array_x); + in_args.push_back(array_w_1); + in_args.push_back(array_b_1); + in_args.push_back(array_w_2); + in_args.push_back(array_b_2); + in_args.push_back(array_y); + // all the grads + std::vector arg_grad_store; + arg_grad_store.push_back(NDArray()); // we don't need the grad of the input + arg_grad_store.push_back(array_w_1_g); + arg_grad_store.push_back(array_b_1_g); + 
arg_grad_store.push_back(array_w_2_g); + arg_grad_store.push_back(array_b_2_g); + arg_grad_store.push_back( + NDArray()); // neither do we need the grad of the loss + // how to handle the grad + std::vector grad_req_type; + grad_req_type.push_back(kNullOp); + grad_req_type.push_back(kWriteTo); + grad_req_type.push_back(kWriteTo); + grad_req_type.push_back(kWriteTo); + grad_req_type.push_back(kWriteTo); + grad_req_type.push_back(kNullOp); + std::vector aux_states; + + std::cout << "make the Executor" << std::endl; + Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store, + grad_req_type, aux_states); + + std::cout << "Training" << std::endl; + mx_float learning_rate = 0.0001; + for (int epoch_num = 0; epoch_num < max_epoch; ++epoch_num) { + exe->Forward(true); + // print accuracy every 100 epoch + if (epoch_num % 100 == 0) { + std::cout << "epoch " << epoch_num << std::endl; + std::vector& out = exe->outputs; + float* cptr = new float[128 * 10]; + out[0].SyncCopyToCPU(cptr, 128 * 10); + NDArray::WaitAll(); + OutputAccuracy(cptr, aptr_y); + delete[] cptr; + } + + // update the parameters + exe->Backward(); + for (int i = 1; i < 5; ++i) { + in_args[i] -= arg_grad_store[i] * learning_rate; + } + NDArray::WaitAll(); + } + + delete exe; + delete[] aptr_x; + delete[] aptr_y; +} + +int main(int argc, char** argv) { + int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 15000; + TRY + MLP(max_epoch); + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp new file mode 100644 index 000000000000..7ea6946dd8c2 --- /dev/null +++ b/cpp-package/example/mlp_cpu.cpp @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Xin Li yakumolx@gmail.com + */ +#include +#include "utils.h" +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +Symbol mlp(const std::vector &layers) { + auto x = Symbol::Variable("X"); + auto label = Symbol::Variable("label"); + + std::vector weights(layers.size()); + std::vector biases(layers.size()); + std::vector outputs(layers.size()); + + for (size_t i = 0; i < layers.size(); ++i) { + weights[i] = Symbol::Variable("w" + std::to_string(i)); + biases[i] = Symbol::Variable("b" + std::to_string(i)); + Symbol fc = FullyConnected( + i == 0? x : outputs[i-1], // data + weights[i], + biases[i], + layers[i]); + outputs[i] = i == layers.size()-1 ? 
fc : Activation(fc, ActivationActType::kRelu); + } + + return SoftmaxOutput(outputs.back(), label); +} + +int main(int argc, char** argv) { + const int image_size = 28; + const std::vector layers{128, 64, 10}; + const int batch_size = 100; + const int max_epoch = 10; + const float learning_rate = 0.1; + const float weight_decay = 1e-2; + + std::vector data_files = { "./data/mnist_data/train-images-idx3-ubyte", + "./data/mnist_data/train-labels-idx1-ubyte", + "./data/mnist_data/t10k-images-idx3-ubyte", + "./data/mnist_data/t10k-labels-idx1-ubyte" + }; + + auto train_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } + + auto val_iter = MXDataIter("MNISTIter"); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } + + TRY + auto net = mlp(layers); + + Context ctx = Context::cpu(); // Use CPU for training + + std::map args; + args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx); + args["label"] = NDArray(Shape(batch_size), ctx); + // Let MXNet infer shapes other parameters such as weights + net.InferArgsMap(ctx, &args, args); + + // Initialize all parameters with uniform distribution U(-0.01, 0.01) + auto initializer = Uniform(0.01); + for (auto& arg : args) { + // arg.first is parameter name, and arg.second is the value + initializer(arg.first, &arg.second); + } + + // Create sgd optimizer + Optimizer* opt = OptimizerRegistry::Find("sgd"); + opt->SetParam("rescale_grad", 1.0/batch_size) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + // Create executor by binding parameters to the model + auto *exec = net.SimpleBind(ctx, args); + auto arg_names = net.ListArguments(); + + // Start training + for (int iter = 0; iter < max_epoch; ++iter) { + int samples = 0; + train_iter.Reset(); + + auto tic = std::chrono::system_clock::now(); + while (train_iter.Next()) { + samples += batch_size; + auto data_batch = train_iter.GetDataBatch(); + // Set data 
and label + data_batch.data.CopyTo(&args["X"]); + data_batch.label.CopyTo(&args["label"]); + + // Compute gradients + exec->Forward(true); + exec->Backward(); + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "X" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + } + auto toc = std::chrono::system_clock::now(); + + Accuracy acc; + val_iter.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + data_batch.data.CopyTo(&args["X"]); + data_batch.label.CopyTo(&args["label"]); + // Forward pass is enough as no gradient is needed when evaluating + exec->Forward(false); + acc.Update(data_batch.label, exec->outputs[0]); + } + float duration = std::chrono::duration_cast + (toc - tic).count() / 1000.0; + LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get(); + } + + delete exec; + delete opt; + MXNotifyShutdown(); + CATCH + return 0; +} diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp new file mode 100644 index 000000000000..8db6638a90d3 --- /dev/null +++ b/cpp-package/example/mlp_csv.cpp @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
/*
 * Convert the input string of number of hidden units into the vector of
 * integers, e.g. "128 64 64" -> {128, 64, 64}.
 *
 * Fixed: the first strtol() result used to be pushed unconditionally, so an
 * empty or non-numeric string produced a spurious 0 layer.  We now check the
 * end pointer (strtol leaves it equal to the input on no-conversion) and
 * return an empty vector instead.
 */
std::vector<int> getLayers(const std::string& hidden_units_string) {
  std::vector<int> hidden_units;
  const char *pos = hidden_units_string.c_str();
  char *pNext = nullptr;
  for (;;) {
    const long num_unit = strtol(pos, &pNext, 10);
    if (pNext == pos) break;  // no digits consumed: end of input (or garbage)
    hidden_units.push_back(static_cast<int>(num_unit));
    pos = pNext;
  }
  return hidden_units;
}
The MNIST data in CSV format assumes " + << "the column 0 to be label and the rest 784 column to be data." << std::endl; + std::cout << "By default, the example uses 'cpu' context. If '--gpu' is specified, " + << "program uses 'gpu' context." <